Commit f248fc0c authored by Roger Kramer's avatar Roger Kramer
Browse files

bugfixes and refactor of summary report

parent 46aec1b7
......@@ -10,11 +10,11 @@ bdqc/dump.py
bdqc/plugin.py
bdqc/report-d3.js
bdqc/report.css
bdqc/report.py
bdqc/scan.py
bdqc/statistic.py
bdqc/statpath.py
bdqc/strings.py
bdqc/summary.py
bdqc/tsort.py
bdqc/builtin/__init__.py
bdqc/builtin/extrinsic/__init__.py
......
......@@ -12,17 +12,17 @@ from os.path import isfile
from bdqc.statistic import Descriptor
from bdqc.column import Vector
from bdqc.report import HTML,Plaintext
from bdqc.statpath import Selector
from bdqc.summary import Summary
from bdqc.data import flatten
# Warning: the values of these constants are used to index function pointer
# arrays. Don't change these without changing code that uses them!
STATUS_NO_OUTLIERS = 0
STATUS_INCOMPARABLES = 1
STATUS_INCOMPARABLES = 1 # different statistics present for diff files
STATUS_AMBIGUOUS_STATS = 2
STATUS_NULL_OUTLIERS = 3
STATUS_VALUE_OUTLIERS = 4
STATUS_NULL_OUTLIERS = 3 # missing values for *present* statistics
STATUS_VALUE_OUTLIERS = 4 # quantitative or categorical
STATUS_MSG = [
"no anomalies detected",
......@@ -107,20 +107,26 @@ class Matrix(object):
Conditionally include the given file's within-file analysis in the
across-file analysis.
This method "flattens" the JSON data associated with filename and
appends it as the last row of the matrix (self), adding and padding
columns if necessary to preserve a matrix structure.
Upon exit from this method, the matrix is one row longer--that is,
each of the columns in self.column should be exactly one datum longer.
An important side effect of this method is that this is the only
place that incomparable files can be detected. Files' results are
allowed to contain null for statistics--a null value is not the
same as a missing statistic, but every file should have some value
for every statistic. Concretely, this means:
Detection of incomparable files is an important side effect of this
method. This is the ONLY PLACE that incomparable files are detected.
"Incomparable" files are files for which the within-file analyses
don't contain exactly the same set of statistics--that is, the same
set of *keys*. A statistic is allowed to contain null--a null value
represents a missing *value* which is not the same as a missing
*statistic*. Concretely, this means:
1. The very first file added establishes the matrix' columns.
2. If columns are added subsequently (to preserve the matrix'
dimensions), then files are incomparable.
3. If column padding is ever required (again, to preserve the
matrix' dimensions because a file was missing statistics
presenct in the others), then files are incomparable.
present in the others), then files are incomparable.
Returns a boolean indicating whether or not the file was actually
included.
......@@ -186,14 +192,14 @@ class Matrix(object):
"""
if self.incomparables:
self.anom_col = sorted( list( self.incomparables ) )
H = hash( self.column[self.anom_col[0]].missing_indices() )
if all([hash(c.missing_indices()) == H for c in self.column.values() ]):
H = hash( self.column[self.anom_col[0]].indices_with_null() )
if all([hash(c.indices_with_null()) == H for c in self.column.values() ]):
# All statistics (columns) that are missing values are
# missing those values from the same set of files, so the
# columns are collapsible. TODO
pass # TODO:
self.anom_row = sorted( list( set().union(*[
self.column[k].missing_indices() for k in self.anom_col ]) ) )
self.column[k].indices_with_null() for k in self.anom_col ]) ) )
self.status = STATUS_INCOMPARABLES
return self.status
......@@ -202,7 +208,7 @@ class Matrix(object):
self.column.keys() ) ) )
if len(self.anom_col) > 0:
self.anom_row = sorted( list( set().union(*[
self.column[k].minor_type_indices() for k in self.anom_col ]) ) )
self.column[k].indices_with_minority_types() for k in self.anom_col ]) ) )
self.status = STATUS_AMBIGUOUS_STATS
return self.status
......@@ -213,7 +219,7 @@ class Matrix(object):
self.column.keys() ) ) )
if len(self.anom_col) > 0:
self.anom_row = sorted( list( set().union(*[
self.column[k].missing_indices() for k in self.anom_col ]) ) )
self.column[k].indices_with_null() for k in self.anom_col ]) ) )
self.status = STATUS_NULL_OUTLIERS
return self.status
......@@ -228,7 +234,7 @@ class Matrix(object):
# contain anomalies (by virtue of being included in any
# column's anomaly list).
self.anom_row = sorted( list( set().union(*[
self.column[k].outlier_indices() for k in self.anom_col ]) ) )
self.column[k].indices_with_outliers() for k in self.anom_col ]) ) )
self.status = STATUS_VALUE_OUTLIERS
else:
self.status = STATUS_NO_OUTLIERS
......@@ -240,19 +246,31 @@ class Matrix(object):
Return an incidence matrix the content of which depends on the
nature of the anomalies (missing data, ambiguous types, or
value discrepancies).
Matrix content depends on exit status:
1. incomparables
2. ambiguous stats
3. outliers (NULL,quantitative, or categorical)
Matrix content is constrained to the union of rows mentioned
in self.anom_row and columns mentioned in self.anom_col...so
it is implicitly minified.
"""
body = []
rows = []
cols = []
if self.status != STATUS_NO_OUTLIERS:
rows_of_interest = ( # depend on analysis status
# The method chosen for "row_selector" determines whether
# the cell in the rendition of the incidence matrix is
# positive/negative or foreground/background--the meaning
# depends on the analysis result.
row_selector = (
None,
Vector.present_indices,
Vector.minor_type_indices,
Vector.present_indices,
Vector.outlier_indices)[ self.status ]
is_row_of_interest = lambda rnum,cobj:rnum in rows_of_interest(cobj)
body = [ [ is_row_of_interest(r, self.column[c] )
Vector.indices_with_values,
Vector.indices_with_minority_types,
Vector.indices_with_values,
Vector.indices_with_outliers)[ self.status ]
is_cell_positive = lambda rnum,cobj:rnum in row_selector(cobj)
body = [ [ is_cell_positive(r, self.column[c] )
for c in self.anom_col ]
for r in self.anom_row ]
rows = [ self.files[ r ] for r in self.anom_row ]
......@@ -262,6 +280,13 @@ class Matrix(object):
def status_msg( self ):
return STATUS_MSG[ self.status ]
def summary( self ):
"""
Returns a summary consisting of a variety of supporting evidence
for the aggregate result, including an incidence matrix.
"""
return Summary( self.status, STATUS_MSG[ self.status ], **self.incidence_matrix() )
def dump( self, fp=sys.stdout ):
head = list(sorted(self.column.keys()))
cols = [ self.column[k] for k in head ]
......@@ -298,7 +323,7 @@ class _Loader(object):
self.target.add_file_data( basename, analysis )
def _main( args, output ):
def _main( args ):
"""
Aggregate JSON into a Matrix then call the Matrix' analyze method.
This function allows
......@@ -329,14 +354,16 @@ def _main( args, output ):
m.add_file_data( filename, content )
else:
raise RuntimeError( "{} is neither file nor directory".format(s) )
m.analyze()
if m.status: # ...is other than STATUS_NO_OUTLIERS
if args.report:
report = args.report.lower()
if report.startswith("text"):
Plaintext(m).render( sys.stdout )
elif report.startswith("html"):
HTML(m).render( sys.stdout )
with open(args.report,"w") as fp:
if args.report.lower().endswith("html"):
m.summary().render_html( fp )
else:
m.summary().render_text( fp )
if args.dump:
with open(args.dump,"w") as fp:
m.dump( fp )
......@@ -415,5 +442,5 @@ if __name__=="__main__":
if _args.exclude:
re.compile( _args.exclude )
_main( _args, sys.stdout )
sys.exit( _main( _args ) )
......@@ -214,7 +214,7 @@ class Vector(object):
return len(self.value_histogram) == 1
def missing_indices( self ):
def indices_with_null( self ):
"""
Return a tuple of the indices of elements with missing data (None).
Incidentally, this tuple is guaranteed to be sorted.
......@@ -223,14 +223,14 @@ class Vector(object):
"""
return tuple([ i for i in range(len(self)) if self[i] is None ])
def present_indices( self ):
def indices_with_values( self ):
"""
Return a tuple of the indices of elements with NON-MISSING data.
This is the complement of the set returned by missing_indices.
This is the complement of the set returned by indices_with_null.
"""
return tuple([ i for i in range(len(self)) if self[i] is not None ])
def minor_type_indices( self ):
def indices_with_minority_types( self ):
"""
Return a tuple of the indices of elements with types in the minority.
"""
......@@ -239,7 +239,7 @@ class Vector(object):
if self.types[k] == MINOR_TYPE_COUNT ])
return tuple([ i for i in range(len(self)) if type(self[i]).__name__[0] in MINOR_TYPES ])
def outlier_indices( self ):
def indices_with_outliers( self ):
"""
Return a list of the indices of outlier elements.
......@@ -295,6 +295,6 @@ if __name__=="__main__":
elif vec.is_single_valued():
print( "single-valued" )
else: # "outliers" exist. Name them!
for i in vec.outlier_indices():
for i in vec.indices_with_outliers():
print( i, vec[i] )
class Matrix(object):
    """
    Encapsulates generation of an HTML table representing an incidence
    matrix.

    Alternative input representations.
    Create an incidence matrix of {rows} X {columns} from either:
    1. explicit NxM table
    2. tagged lists
        name1: {'foo','baz',...}
        name2: {'bar','baz',...}
        name3: {'foo','bar',...}
    3. incidence bitstrings with key
        name1: 0xa8b889c
        name2: 0xb988194
        ...
        key [ 'foo','bar','baz','bozzle',... ]

    Only the explicit table form (1) is handled by this class as written;
    forms 2 and 3 are design notes.
    """

    def __init__( self, body, row_major=True, **args ):
        """
        body      -- list of equal-length rows; each cell's string form is
                     emitted as the id attribute of one <td> element.
        row_major -- accepted for backward compatibility; not used.

        Recognized keyword arguments:
        row_labels    -- one label per row (default "1", "2", ...)
        column_labels -- one label per column (default "1", "2", ...)
        hl            -- table shaped like body; a truthy entry makes the
                         corresponding cell red (default: nothing is
                         highlighted).

        Raises ValueError if body is ragged.
        """
        self.body = body
        # An empty body is a legitimate 0x0 matrix; never touch body[0]
        # unless it exists.
        ncols = len(body[0]) if body else 0
        # Ensure body is complete, not a ragged array. This is input
        # validation, so it must survive "python -O" (which strips asserts).
        if any( len(r) != ncols for r in body ):
            raise ValueError( "body must be rectangular (all rows the same length)" )
        if 'row_labels' in args:
            self.row_labels = args['row_labels']
        else:
            self.row_labels = [ str(i+1) for i in range(len(body)) ]
        if 'column_labels' in args:
            self.column_labels = args['column_labels']
        else:
            self.column_labels = [ str(i+1) for i in range(ncols) ]
        if 'hl' in args:
            self.hl = args['hl']
        else:
            # render_html reads self.hl unconditionally, so it must always
            # exist; default to "no cell highlighted".
            self.hl = [ [ False ]*ncols for _ in body ]

    def render_html( self, fp ):
        """
        Emit HTML5 markup representing an incidence matrix to the
        file-like object fp.

        Labels and cell ids are HTML-escaped because they may carry
        arbitrary text (e.g. file names).
        """
        from html import escape
        # The prelude
        print( '<table id="incidence_matrix">\n<caption></caption>\n', file=fp )
        # The header
        print( '<thead class="im">\n<tr>\n<th></th>\n', file=fp )
        for label in self.column_labels:
            print( '<th scope="col" class="im">{}</th>\n'.format( escape(str(label)) ), file=fp )
        print( '</tr>\n</thead>\n', file=fp )
        # The body
        print( '<tbody class="im">\n', file=fp)
        for ro, row in enumerate( self.body ):
            print( '<tr>\n<th scope="row" class="im">{}</th>\n'.format(
                escape(str(self.row_labels[ro])) ), file=fp )
            for co, cell in enumerate( row ):
                color = "red" if self.hl[ro][co] else "white"
                print( '<td class="im" id="{}" style="background:{}"></td>'.format(
                    escape(str(cell)), color ), file=fp )
            print( '</tr>\n', file=fp)
        # The trailer
        print( ' </tbody>\n<tfoot class="im">\n</tfoot>\n</table>', file=fp )
# Unit test
if __name__=="__main__":
import random
import sys
......@@ -29,7 +29,6 @@ import io
import bdqc.plugin
import bdqc.dir
from bdqc.report import HTML,Plaintext
from bdqc.analysis import Matrix
from bdqc.statpath import selectors
......@@ -308,7 +307,7 @@ class Executor(object):
return missing
def main( args ):
def _main( args ):
# Build lists of plugins...
......@@ -376,14 +375,16 @@ def main( args ):
prog_fp.close()
if m:
status = m.analyze()
if status: # ...is other than STATUS_NO_OUTLIERS
if args.report:
report = args.report.lower()
if report.startswith("text"):
Plaintext(m).render( sys.stdout )
elif report.startswith("html"):
HTML(m).render( sys.stdout )
with open(args.report,"w") as fp:
if args.report.lower().endswith("html"):
m.summary().render_html( fp )
else:
m.summary().render_text( fp )
if missing > 0:
logging.warning( "{} file(s) were missing".format( missing ) )
......@@ -527,6 +528,6 @@ if __name__=="__main__":
re.compile( _args.include )
if _args.exclude:
re.compile( _args.exclude )
main( _args )
sys.exit( _main( _args ) )
import pkgutil
class HTML(object):
"""
TODO: lots
"""
def __init__( self, source ):
self.source = source
def _render_incidence_matrix( self, fp ):
"""
Emit HTML5 markup representing an incidence matrix.
"""
assert hasattr(self.source,"anom_col") \
and isinstance(self.source.anom_col,list) \
and all([ isinstance(i,str) for i in self.source.anom_col])
body = [ [ "c{}_{}".format(r,c)
for c in range(len(self.source.anom_col)) ]
for r in range(len(self.source.anom_row)) ]
hl = [ [ fi in self.source.column[k].outlier_indices()
for k in self.source.anom_col ]
for fi in self.source.anom_row ]
row_labels = [ self.source.files[fi] for fi in self.source.anom_row ]
# The prelude
print( '<table id="incidence_matrix">\n<caption></caption>\n', file=fp )
# The header
print( '<thead class="im">\n<tr>\n<th></th>\n', file=fp )
for label in self.source.anom_col:
print( '<th scope="col" class="im">{}</th>\n'.format(label), file=fp )
print( '</tr>\n</thead>\n', file=fp )
# The body
print( '<tbody class="im">\n', file=fp)
for ro in range(len(body)):
print( '<tr>\n<th scope="row" class="im">{}</th>\n'.format( row_labels[ro] ), file=fp )
row = body[ro]
for co in range(len(row)):
color = "red" if hl[ro][co] else "white"
print( '<td class="im" id="{}" style="background:{}"></td>'.format( body[ro][co], color ), file=fp )
print( '</tr>\n', file=fp)
# The trailer
print( ' </tbody>\n<tfoot class="im">\n</tfoot>\n</table>', file=fp )
def render( self, fp ):
assert hasattr(self.source,"status")
class Summary(object):
def __init__( self, status, msg, body, rows, cols ):
self.status = status
self.msg = msg
self.body = body
self.rows = rows
self.cols = cols
def render_html( self, fp ):
print( """<!DOCTYPE html>
<html>
<head>
......@@ -108,8 +76,6 @@ class HTML(object):
<body onload="initDocument()">
""" )
self._render_incidence_matrix( fp )
####################################################################
# ...allowing mouse hover to select plots for display
####################################################################
......@@ -147,41 +113,43 @@ class HTML(object):
</html>
""" )
class Plaintext(object):
"""
TODO: lots
"""
def __init__( self, source ):
self.source = source
@staticmethod
def _render_im( im, fp ):
def render_text( self, fp ):
"""
Render summary, including an incidence matrix, as ASCII.
"""
print( "Status:", self.msg, file=fp )
print( "Incidence matrix:", file=fp )
NR = len(im['body'])
NC = len(im['cols'])
assert NR == len(im['rows'])
wid = max([len(r) for r in im['rows']])
NR = len(self.body)
NC = len(self.cols)
assert NR == len(self.rows)
wid = max([len(r) for r in self.rows])
FMT = "{{0:>{}s}} {{1}}".format(wid)
# Print headers
# This will handle up to 9999 columns but only the 10's and 1's
# values will be printed as column headers.
CIX = [ "{:04d}".format(i+1) for i in range(0,NC) ]
print( ' '*wid, ''.join([s[2] for s in CIX]) )
print( ' '*wid, ''.join([s[3] for s in CIX]) )
print( ' '*wid, ''.join([s[2] for s in CIX]), file=fp )
print( ' '*wid, ''.join([s[3] for s in CIX]), file=fp )
for rn in range(NR):
print( FMT.format(
im['rows'][rn],
''.join([ '+' if f else '-' for f in im['body'][rn] ]) ),
self.rows[rn],
''.join([ '+' if f else '-' for f in self.body[rn] ]) ),
file=fp )
print( "Column legend:", file=fp )
for cn in range(NC):
print( cn+1, im['cols'][cn], sep="\t", file=fp )
def render( self, fp ):
print( "Status:", self.source.status_msg() )
if self.source.status:
im = self.source.incidence_matrix()
Plaintext._render_im( im, fp )
print( cn+1, self.cols[cn], sep="\t", file=fp )
if __name__=="__main__":
import sys
import bdqc.analysis
s = Summary(
bdqc.analysis.STATUS_VALUE_OUTLIERS,
"whatever",
[ [ 0, 1, 0 ],
[ 0, 0, 1 ],
[ 1, 0, 0 ],
[ 0, 1, 1 ] ],
["r1","r2","r3","r4"],
["c1","c2","c3"] )
s.render_text( sys.stdout )
from distutils.core import setup,Extension
setup(name="bdqc",
version="0.45.1",
version="0.45.2",
description="Framework for QC of \"Big Data\"",
long_description="""\
Framework for QC of \"Big Data\"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment