bugfixes and refactor of summary report

f248fc0c · Roger Kramer · 46aec1b7 · f248fc0c · f248fc0c · f248fc0c
Commit f248fc0c authored 8 years ago by Roger Kramer
7 changed files
--- a/src/MANIFEST
+++ b/src/MANIFEST
@@ -10,11 +10,11 @@ bdqc/dump.py
 bdqc/plugin.py
 bdqc/report-d3.js
 bdqc/report.css
-bdqc/report.py
 bdqc/scan.py
 bdqc/statistic.py
 bdqc/statpath.py
 bdqc/strings.py
+bdqc/summary.py
 bdqc/tsort.py
 bdqc/builtin/__init__.py
 bdqc/builtin/extrinsic/__init__.py

--- a/src/bdqc/analysis.py
+++ b/src/bdqc/analysis.py
@@ -12,17 +12,17 @@ from os.path import isfile

 from bdqc.statistic import Descriptor
 from bdqc.column import Vector
-from bdqc.report import HTML,Plaintext
 from bdqc.statpath import Selector
+from bdqc.summary import Summary
 from bdqc.data import flatten

 # Warning: the values of these constants are used to index function pointer
 # arrays. Don't change these without changing code that uses them!
 STATUS_NO_OUTLIERS     = 0
-STATUS_INCOMPARABLES   = 1
+STATUS_INCOMPARABLES   = 1 # different statistics present for diff files
 STATUS_AMBIGUOUS_STATS = 2
-STATUS_NULL_OUTLIERS   = 3
-STATUS_VALUE_OUTLIERS  = 4
+STATUS_NULL_OUTLIERS   = 3 # missing values for *present* statistics
+STATUS_VALUE_OUTLIERS  = 4 # quantitative or categorical

 STATUS_MSG = [
 	"no anomalies detected",
@@ -107,20 +107,26 @@ class Matrix(object):
 		Conditionally include the given file's within-file analysis in the
 		across-file analysis.

+		This method "flattens" the JSON data associated with filename and
+		appends it as the last row of the matrix (self), adding and padding
+		columns if necessary to preserve a matrix structure.
+
 		Upon exit from this method, the matrix is one row longer--that is,
 		each of self.column should be exactly one datum longer.

-		An important side effect of this method is that this is the only
-		place that incomparable files can be detected. Files' results are
-		allowed to contain null for statistics--a null value is not the
-		same as a missing statistic, but every file should have some value
-		for every statistic. Concretely, this means:
+		Detection of incomparable files is an important side effect of this
+		method. This is the ONLY PLACE that incomparable files are detected.
+		"Incomparable" files are files for which the within-file analyses
+		don't contain exactly the same set of statistics--that is, the same
+		set of *keys*. A statistic is allowed to contain null--a null value
+		represents a missing *value* which is not the same as a missing
+		*statistic*. Concretely, this means:
 		1. The very first file added establishes the matrix' columns.
 		2. If columns are added subsequently (to preserve the matrix'
 		   dimensions), then files are incomparable.
 		3.  If column padding is ever required (again, to preserve the 
 		   matrix' dimensions because a file was missing statistics
-		   presenct in the others), then files are incomparable.
+		   present in the others), then files are incomparable.

 		Returns a boolean indicating whether or not the file was actually
 		included.
@@ -186,14 +192,14 @@ class Matrix(object):
 		"""
 		if self.incomparables:
 			self.anom_col = sorted( list( self.incomparables ) )
-			H = hash( self.column[self.anom_col[0]].missing_indices() )
-			if all([hash(c.missing_indices()) == H for c in self.column.values() ]):
+			H = hash( self.column[self.anom_col[0]].indices_with_null() )
+			if all([hash(c.indices_with_null()) == H for c in self.column.values() ]):
 				# All statistics (columns) that are missing values are
 				# missing those values from the same set of files, so the
 				# columns are collapsible. TODO
 				pass # TODO: 
 			self.anom_row = sorted( list( set().union(*[
-				self.column[k].missing_indices() for k in self.anom_col ]) ) )
+				self.column[k].indices_with_null() for k in self.anom_col ]) ) )
 			self.status = STATUS_INCOMPARABLES
 			return self.status

@@ -202,7 +208,7 @@ class Matrix(object):
 			self.column.keys() ) ) )
 		if len(self.anom_col) > 0:
 			self.anom_row = sorted( list( set().union(*[
-				self.column[k].minor_type_indices() for k in self.anom_col ]) ) )
+				self.column[k].indices_with_minority_types() for k in self.anom_col ]) ) )
 			self.status = STATUS_AMBIGUOUS_STATS
 			return self.status

@@ -213,7 +219,7 @@ class Matrix(object):
 			self.column.keys() ) ) )
 		if len(self.anom_col) > 0:
 			self.anom_row = sorted( list( set().union(*[
-				self.column[k].missing_indices() for k in self.anom_col ]) ) )
+				self.column[k].indices_with_null() for k in self.anom_col ]) ) )
 			self.status = STATUS_NULL_OUTLIERS
 			return self.status

@@ -228,7 +234,7 @@ class Matrix(object):
 			# contain anomalies (by virtue of being included in any
 			# column's anomaly list).
 			self.anom_row = sorted( list( set().union(*[
-				self.column[k].outlier_indices() for k in self.anom_col ]) ) )
+				self.column[k].indices_with_outliers() for k in self.anom_col ]) ) )
 			self.status = STATUS_VALUE_OUTLIERS
 		else:
 			self.status = STATUS_NO_OUTLIERS
@@ -240,19 +246,31 @@ class Matrix(object):
 		Return an incidence matrix the content of which depends on the
 		nature of the anomalies (missing data, ambiguous types, or
 		value discrepancies).
+		Matrix content depends on exit status:
+		1. incomparables
+		2. ambiguous stats
+		3. outliers (NULL, quantitative, or categorical)
+
+		Matrix content is constrained to the union of rows mentioned
+		in self.anom_row and columns mentioned in self.anom_col...so
+		it is implicitly minified.
 		"""
 		body = []
 		rows = []
 		cols = []
 		if self.status != STATUS_NO_OUTLIERS:
-			rows_of_interest = ( # depend on analysis status
+			# The method chosen for "row_selector" determines whether
+			# the cell in the rendition of the incidence matrix is
+			# positive/negative or foreground/background--the meaning
+			# depends on the analysis result.
+			row_selector = (
 				None,
-				Vector.present_indices,
-				Vector.minor_type_indices,
-				Vector.present_indices,
-				Vector.outlier_indices)[ self.status ]
-			is_row_of_interest = lambda rnum,cobj:rnum in rows_of_interest(cobj)
-			body = [ [ is_row_of_interest(r, self.column[c] )
+				Vector.indices_with_values,
+				Vector.indices_with_minority_types,
+				Vector.indices_with_values,
+				Vector.indices_with_outliers)[ self.status ]
+			is_cell_positive = lambda rnum,cobj:rnum in row_selector(cobj)
+			body = [ [ is_cell_positive(r, self.column[c] )
 				for c in self.anom_col ]
 				for r in self.anom_row ]
 			rows = [ self.files[ r ] for r in self.anom_row ]
@@ -262,6 +280,13 @@ class Matrix(object):
 	def status_msg( self ):
 		return STATUS_MSG[ self.status ]

+	def summary( self ):
+		"""
+		Returns a summary consisting of a variety of supporting evidence
+		for the aggregate result, including an incidence matrix.
+		"""
+		return Summary( self.status, STATUS_MSG[ self.status ], **self.incidence_matrix() )
+
 	def dump( self, fp=sys.stdout ):
 		head = list(sorted(self.column.keys()))
 		cols = [ self.column[k] for k in head ]
@@ -298,7 +323,7 @@ class _Loader(object):
 		self.target.add_file_data( basename, analysis )


-def _main( args, output ):
+def _main( args ):
 	"""
 	Aggregate JSON into a Matrix then call the Matrix' analyze method.
 	This function allows 
@@ -329,14 +354,16 @@ def _main( args, output ):
 					m.add_file_data( filename, content )
 		else:
 			raise RuntimeError( "{} is neither file nor directory".format(s) )
+
 	m.analyze()
+
 	if m.status: # ...is other than STATUS_NO_OUTLIERS
 		if args.report:
-			report = args.report.lower()
-			if report.startswith("text"):
-				Plaintext(m).render( sys.stdout )
-			elif report.startswith("html"):
-				HTML(m).render( sys.stdout )
+			with open(args.report,"w") as fp:
+				if args.report.lower().endswith("html"):
+					m.summary().render_html( fp )
+				else:
+					m.summary().render_text( fp )
 	if args.dump:
 		with open(args.dump,"w") as fp:
 			m.dump( fp )
@@ -415,5 +442,5 @@ if __name__=="__main__":
 	if _args.exclude:
 		re.compile( _args.exclude )

-	_main( _args, sys.stdout )
+	sys.exit( _main( _args ) )

--- a/src/bdqc/column.py
+++ b/src/bdqc/column.py
@@ -214,7 +214,7 @@ class Vector(object):

 		return len(self.value_histogram) == 1

-	def missing_indices( self ):
+	def indices_with_null( self ):
 		"""
 		Return a tuple of the indices of elements with missing data (None).
 		Incidentally, this tuple is guaranteed to be sorted.
@@ -223,14 +223,14 @@ class Vector(object):
 		"""
 		return tuple([ i for i in range(len(self)) if self[i] is None ])

-	def present_indices( self ):
+	def indices_with_values( self ):
 		"""
 		Return a tuple of the indices of elements with NON-MISSING data.
-		This is the complement of the set returned by missing_indices.
+		This is the complement of the set returned by indices_with_null.
 		"""
 		return tuple([ i for i in range(len(self)) if self[i] is not None ])

-	def minor_type_indices( self ):
+	def indices_with_minority_types( self ):
 		"""
 		Return a tuple of the indices of elements with types in the minority.
 		"""
@@ -239,7 +239,7 @@ class Vector(object):
 			if self.types[k] == MINOR_TYPE_COUNT ])
 		return tuple([ i for i in range(len(self)) if type(self[i]).__name__[0] in MINOR_TYPES ])

-	def outlier_indices( self ):
+	def indices_with_outliers( self ):
 		"""
 		Return a list of the indices of outlier elements.

@@ -295,6 +295,6 @@ if __name__=="__main__":
 	elif vec.is_single_valued():
 		print( "single-valued" )
 	else: # "outliers" exist. Name them!
-		for i in vec.outlier_indices():
+		for i in vec.indices_with_outliers():
 			print( i, vec[i] )

--- a/src/bdqc/incidence.py
+++ b/src/bdqc/incidence.py
-
-class Matrix(object):
-	"""
-	Encapsulates generation of an HTML table representing an incidence
-	matrix.
-	Alternative input representations.
-	Create an incidence matrix of {rows} X {columns} from either:
-	1. explicit NxM table
-	2. tagged lists
-		name1: {'foo','baz',...}
-		name2: {'bar','baz',...}
-		name3: {'foo','bar',...}
-	3.	incidence bitstrings with key
-		name1: 0xa8b889c
-		name2: 0xb988194
-		...
-		key [ 'foo','bar','baz','bozzle',... ]
-	"""
-	def __init__( self, body, row_major=True, **args ):
-		self.body = body
-		# Insure body is complete, not a ragged array.
-		assert all([ len(r)==len(body[0]) for r in body[1:] ])
-		if 'row_labels' in args:
-			self.row_labels = args['row_labels']
-		else:
-			self.row_labels = [ str(i+1) for i in range(len(body)) ]
-		if 'column_labels' in args:
-			self.column_labels = args['column_labels']
-		else:
-			self.column_labels = [ str(i+1) for i in range(len(body[0])) ]
-		if 'hl' in args:
-			self.hl = args['hl']
-
-	def render_html( self, fp ):
-		"""
-		Emit HTML5 markup representing an incidence matrix.
-		"""
-		# The prelude
-		print( '<table id="incidence_matrix">\n<caption></caption>\n', file=fp )
-		# The header
-		print( '<thead class="im">\n<tr>\n<th></th>\n', file=fp )
-		for label in self.column_labels:
-			print( '<th scope="col" class="im">{}</th>\n'.format(label), file=fp )
-		print( '</tr>\n</thead>\n', file=fp )
-		# The body
-		print( '<tbody class="im">\n', file=fp)
-		for ro in range(len(self.body)):
-			print( '<tr>\n<th scope="row" class="im">{}</th>\n'.format( self.row_labels[ro] ), file=fp )
-			row = self.body[ro]
-			for co in range(len(row)):
-				color = "red" if self.hl[ro][co] else "white"
-				print( '<td class="im" id="{}" style="background:{}"></td>'.format( self.body[ro][co], color ), file=fp )
-			print( '</tr>\n', file=fp)
-		# The trailer
-		print( ' </tbody>\n<tfoot class="im">\n</tfoot>\n</table>', file=fp )
-
-# Unit test
-if __name__=="__main__":
-	import random
-	import sys
-
--- a/src/bdqc/scan.py
+++ b/src/bdqc/scan.py
@@ -29,7 +29,6 @@ import io

 import bdqc.plugin
 import bdqc.dir
-from bdqc.report import HTML,Plaintext
 from bdqc.analysis import Matrix
 from bdqc.statpath import selectors

@@ -308,7 +307,7 @@ class Executor(object):
 		return missing


-def main( args ):
+def _main( args ):

 	# Build lists of plugins...

@@ -376,14 +375,16 @@ def main( args ):
 			prog_fp.close()

 		if m:
+
 			status = m.analyze()
+
 			if status: # ...is other than STATUS_NO_OUTLIERS
 				if args.report:
-					report = args.report.lower()
-					if report.startswith("text"):
-						Plaintext(m).render( sys.stdout )
-					elif report.startswith("html"):
-						HTML(m).render( sys.stdout )
+					with open(args.report,"w") as fp:
+						if args.report.lower().endswith("html"):
+							m.summary().render_html( fp )
+						else:
+							m.summary().render_text( fp )

 	if missing > 0:
 		logging.warning( "{} file(s) were missing".format( missing ) )
@@ -527,6 +528,6 @@ if __name__=="__main__":
 		re.compile( _args.include )
 	if _args.exclude:
 		re.compile( _args.exclude )
-	
-	main( _args )
+
+	sys.exit( _main( _args ) )

--- a/src/bdqc/report.py
+++ b/src/bdqc/report.py

 import pkgutil

-class HTML(object):
-	"""
-	TODO: lots
-	"""
-	def __init__( self, source ):
-		self.source = source
-
-	def _render_incidence_matrix( self, fp ):
-		"""
-		Emit HTML5 markup representing an incidence matrix.
-		"""
-		assert hasattr(self.source,"anom_col") \
-			and isinstance(self.source.anom_col,list) \
-			and all([ isinstance(i,str) for i in self.source.anom_col])
-
-		body = [ [ "c{}_{}".format(r,c)
-			for c in range(len(self.source.anom_col)) ]
-			for r in range(len(self.source.anom_row)) ]
-		hl   = [ [ fi in self.source.column[k].outlier_indices()
-			for k  in self.source.anom_col ]
-			for fi in self.source.anom_row ]
-		row_labels = [ self.source.files[fi] for fi in self.source.anom_row ]
-		# The prelude
-		print( '<table id="incidence_matrix">\n<caption></caption>\n', file=fp )
-		# The header
-		print( '<thead class="im">\n<tr>\n<th></th>\n', file=fp )
-		for label in self.source.anom_col:
-			print( '<th scope="col" class="im">{}</th>\n'.format(label), file=fp )
-		print( '</tr>\n</thead>\n', file=fp )
-		# The body
-		print( '<tbody class="im">\n', file=fp)
-		for ro in range(len(body)):
-			print( '<tr>\n<th scope="row" class="im">{}</th>\n'.format( row_labels[ro] ), file=fp )
-			row = body[ro]
-			for co in range(len(row)):
-				color = "red" if hl[ro][co] else "white"
-				print( '<td class="im" id="{}" style="background:{}"></td>'.format( body[ro][co], color ), file=fp )
-			print( '</tr>\n', file=fp)
-		# The trailer
-		print( ' </tbody>\n<tfoot class="im">\n</tfoot>\n</table>', file=fp )
-
-	def render( self, fp ):
-		assert hasattr(self.source,"status")
+class Summary(object):
+
+	def __init__( self, status, msg, body, rows, cols ):
+		self.status = status
+		self.msg  = msg
+		self.body = body
+		self.rows = rows
+		self.cols = cols
+
+
+	def render_html( self, fp ):
 		print( """<!DOCTYPE html>
 		<html>
 		<head>
@@ -108,8 +76,6 @@ class HTML(object):
 		<body onload="initDocument()">
 		""" )

-		self._render_incidence_matrix( fp )
-
 		####################################################################
 		# ...allowing mouse hover to select plots for display
 		####################################################################
@@ -147,41 +113,43 @@ class HTML(object):
 		</html>
 		""" )

-
-class Plaintext(object):
-	"""
-	TODO: lots
-	"""
-
-	def __init__( self, source ):
-		self.source = source
-
-	@staticmethod
-	def _render_im( im, fp ):
+	def render_text( self, fp ):
+		"""
+		Render summary, including an incidence matrix, as ASCII.
+		"""
+		print( "Status:", self.msg, file=fp )
 		print( "Incidence matrix:", file=fp )
-		NR = len(im['body'])
-		NC = len(im['cols'])
-		assert NR == len(im['rows'])
-		wid = max([len(r) for r in im['rows']])
+		NR = len(self.body)
+		NC = len(self.cols)
+		assert NR == len(self.rows)
+		wid = max([len(r) for r in self.rows])
 		FMT = "{{0:>{}s}} {{1}}".format(wid)
 		# Print headers
 		# This will handle up to 9999 columns but only the 10's and 1's
 		# values will be printed as column headers.
 		CIX = [ "{:04d}".format(i+1) for i in range(0,NC) ]
-		print( ' '*wid, ''.join([s[2] for s in CIX]) )
-		print( ' '*wid, ''.join([s[3] for s in CIX]) )
+		print( ' '*wid, ''.join([s[2] for s in CIX]), file=fp )
+		print( ' '*wid, ''.join([s[3] for s in CIX]), file=fp )
 		for rn in range(NR):
 			print( FMT.format(
-				im['rows'][rn],
-				''.join([ '+' if f else '-' for f in im['body'][rn] ]) ),
+				self.rows[rn],
+				''.join([ '+' if f else '-' for f in self.body[rn] ]) ),
 				file=fp )
 		print( "Column legend:", file=fp )
 		for cn in range(NC):
-			print( cn+1, im['cols'][cn], sep="\t", file=fp )
-
-	def render( self, fp ):
-		print( "Status:", self.source.status_msg() )
-		if self.source.status:
-			im = self.source.incidence_matrix()
-			Plaintext._render_im( im, fp )
+			print( cn+1, self.cols[cn], sep="\t", file=fp )
+
+if __name__=="__main__":
+	import sys
+	import bdqc.analysis
+	s = Summary(
+		bdqc.analysis.STATUS_VALUE_OUTLIERS,
+		"whatever",
+		[ [ 0, 1, 0 ],
+		  [ 0, 0, 1 ],
+		  [ 1, 0, 0 ],
+		  [ 0, 1, 1 ] ],
+		["r1","r2","r3","r4"],
+		["c1","c2","c3"] )
+	s.render_text( sys.stdout )	

--- a/src/setup.py
+++ b/src/setup.py
 from distutils.core import setup,Extension

 setup(name="bdqc",
-	version="0.45.1",
+	version="0.45.2",
 	description="Framework for QC of \"Big Data\"",
 	long_description="""\
 	Framework for QC of \"Big Data\"