Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Yi Lu
bdqc
Commits
f248fc0c
Commit
f248fc0c
authored
8 years ago
by
Roger Kramer
Browse files
Options
Download
Email Patches
Plain Diff
bugfixes and refactor of summary report
parent
46aec1b7
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
114 additions
and
179 deletions
+114
-179
src/MANIFEST
src/MANIFEST
+1
-1
src/bdqc/analysis.py
src/bdqc/analysis.py
+57
-30
src/bdqc/column.py
src/bdqc/column.py
+6
-6
src/bdqc/incidence.py
src/bdqc/incidence.py
+0
-61
src/bdqc/scan.py
src/bdqc/scan.py
+10
-9
src/bdqc/summary.py
src/bdqc/summary.py
+39
-71
src/setup.py
src/setup.py
+1
-1
No files found.
src/MANIFEST
View file @
f248fc0c
...
...
@@ -10,11 +10,11 @@ bdqc/dump.py
bdqc/plugin.py
bdqc/report-d3.js
bdqc/report.css
bdqc/report.py
bdqc/scan.py
bdqc/statistic.py
bdqc/statpath.py
bdqc/strings.py
bdqc/summary.py
bdqc/tsort.py
bdqc/builtin/__init__.py
bdqc/builtin/extrinsic/__init__.py
...
...
This diff is collapsed.
Click to expand it.
src/bdqc/analysis.py
View file @
f248fc0c
...
...
@@ -12,17 +12,17 @@ from os.path import isfile
from
bdqc.statistic
import
Descriptor
from
bdqc.column
import
Vector
from
bdqc.report
import
HTML
,
Plaintext
from
bdqc.statpath
import
Selector
from
bdqc.summary
import
Summary
from
bdqc.data
import
flatten
# Warning: the values of these constants are used to index function pointer
# arrays. Don't change these without changing code that uses them!
STATUS_NO_OUTLIERS
=
0
STATUS_INCOMPARABLES
=
1
STATUS_INCOMPARABLES
=
1
# different statistics present for diff files
STATUS_AMBIGUOUS_STATS
=
2
STATUS_NULL_OUTLIERS
=
3
STATUS_VALUE_OUTLIERS
=
4
STATUS_NULL_OUTLIERS
=
3
# missing values for *present* statistics
STATUS_VALUE_OUTLIERS
=
4
# quantitative or categorical
STATUS_MSG
=
[
"no anomalies detected"
,
...
...
@@ -107,20 +107,26 @@ class Matrix(object):
Conditionally include the given file's within-file analysis in the
across-file analysis.
This method "flattens" the JSON data associated with filename and
appends it as the last row of the matrix (self), adding and padding
columns if necessary to preserve a matrix structure.
Upon exit from this method, the matrix is one row longer--that is,
each of self.column should be exactly one datum longer.
An important side effect of this method is that this is the only
place that incomparable files can be detected. Files' results are
allowed to contain null for statistics--a null value is not the
same as a missing statistic, but every file should have some value
for every statistic. Concretely, this means:
Detection of incomparable files is an important side effect of this
method. This is the ONLY PLACE that incomparable files are detected.
"Incomparable" files are files for which the within-file analyses
don't contain exactly the same set of statistics--that is, the same
set of *keys*. A statistic is allowed to contain null--a null value
represents a missing *value* which is not the same as a missing
*statistic*. Concretely, this means:
1. The very first file added establishes the matrix' columns.
2. If columns are added subsequently (to preserve the matrix'
dimensions), then files are incomparable.
3. If column padding is ever required (again, to preserve the
matrix' dimensions because a file was missing statistics
presen
c
t in the others), then files are incomparable.
present in the others), then files are incomparable.
Returns a boolean indicating whether or not the file was actually
included.
...
...
@@ -186,14 +192,14 @@ class Matrix(object):
"""
if
self
.
incomparables
:
self
.
anom_col
=
sorted
(
list
(
self
.
incomparables
)
)
H
=
hash
(
self
.
column
[
self
.
anom_col
[
0
]].
missing_indices
()
)
if
all
([
hash
(
c
.
missing_indices
())
==
H
for
c
in
self
.
column
.
values
()
]):
H
=
hash
(
self
.
column
[
self
.
anom_col
[
0
]].
indices_with_null
()
)
if
all
([
hash
(
c
.
indices_with_null
())
==
H
for
c
in
self
.
column
.
values
()
]):
# all statistics (columns) that are missing value, are
# missing those value from the same set of files, so the
# columns are collapsable. TODO
pass
# TODO:
self
.
anom_row
=
sorted
(
list
(
set
().
union
(
*
[
self
.
column
[
k
].
missing_indices
()
for
k
in
self
.
anom_col
])
)
)
self
.
column
[
k
].
indices_with_null
()
for
k
in
self
.
anom_col
])
)
)
self
.
status
=
STATUS_INCOMPARABLES
return
self
.
status
...
...
@@ -202,7 +208,7 @@ class Matrix(object):
self
.
column
.
keys
()
)
)
)
if
len
(
self
.
anom_col
)
>
0
:
self
.
anom_row
=
sorted
(
list
(
set
().
union
(
*
[
self
.
column
[
k
].
minor_type
_indice
s
()
for
k
in
self
.
anom_col
])
)
)
self
.
column
[
k
].
indices_with_
minor
ity
_types
()
for
k
in
self
.
anom_col
])
)
)
self
.
status
=
STATUS_AMBIGUOUS_STATS
return
self
.
status
...
...
@@ -213,7 +219,7 @@ class Matrix(object):
self
.
column
.
keys
()
)
)
)
if
len
(
self
.
anom_col
)
>
0
:
self
.
anom_row
=
sorted
(
list
(
set
().
union
(
*
[
self
.
column
[
k
].
missing_indices
()
for
k
in
self
.
anom_col
])
)
)
self
.
column
[
k
].
indices_with_null
()
for
k
in
self
.
anom_col
])
)
)
self
.
status
=
STATUS_NULL_OUTLIERS
return
self
.
status
...
...
@@ -228,7 +234,7 @@ class Matrix(object):
# contain anomalies (by virtue of being included in any
# column's anomaly list).
self
.
anom_row
=
sorted
(
list
(
set
().
union
(
*
[
self
.
column
[
k
].
outlier
_indice
s
()
for
k
in
self
.
anom_col
])
)
)
self
.
column
[
k
].
indices_with_
outliers
()
for
k
in
self
.
anom_col
])
)
)
self
.
status
=
STATUS_VALUE_OUTLIERS
else
:
self
.
status
=
STATUS_NO_OUTLIERS
...
...
@@ -240,19 +246,31 @@ class Matrix(object):
Return an incidence matrix the content of which depends on the
nature of the anomalies (missing data, ambiguous types, or
value discrepancies).
Matrix content depends on exit status:
1. incomparables
2. ambiguous stats
3. outliers (NULL,quantitative, or categorical)
Matrix content is constrained to the union of rows mentioned
in self.anom_row and columns mentioned in self.anom_col...so
it is implicitly minified.
"""
body
=
[]
rows
=
[]
cols
=
[]
if
self
.
status
!=
STATUS_NO_OUTLIERS
:
rows_of_interest
=
(
# depend on analysis status
# The method chosen for "row_selector" determines whether
# the cell in the rendition of the incidence matrix is
# positive/negative or foreground/background--the meaning
# depends on the analysis result.
row_selector
=
(
None
,
Vector
.
present_indic
es
,
Vector
.
minor_type
_indice
s
,
Vector
.
present_indic
es
,
Vector
.
outlier
_indice
s
)[
self
.
status
]
is_
row_of_interest
=
lambda
rnum
,
cobj
:
rnum
in
row
s_of_interest
(
cobj
)
body
=
[
[
is_
row_of_interest
(
r
,
self
.
column
[
c
]
)
Vector
.
indices_with_valu
es
,
Vector
.
indices_with_
minor
ity
_types
,
Vector
.
indices_with_valu
es
,
Vector
.
indices_with_
outliers
)[
self
.
status
]
is_
cell_positive
=
lambda
rnum
,
cobj
:
rnum
in
row
_selector
(
cobj
)
body
=
[
[
is_
cell_positive
(
r
,
self
.
column
[
c
]
)
for
c
in
self
.
anom_col
]
for
r
in
self
.
anom_row
]
rows
=
[
self
.
files
[
r
]
for
r
in
self
.
anom_row
]
...
...
@@ -262,6 +280,13 @@ class Matrix(object):
def status_msg(self):
    """
    Return the entry of STATUS_MSG selected by this object's current
    status code.
    """
    message = STATUS_MSG[self.status]
    return message
def summary(self):
    """
    Build a Summary of the aggregate result.

    The Summary receives the status code, the matching STATUS_MSG text,
    and the items returned by incidence_matrix(), which are forwarded as
    keyword arguments.
    """
    code = self.status
    message = STATUS_MSG[code]
    # incidence_matrix() supplies the remaining constructor arguments.
    matrix_parts = self.incidence_matrix()
    return Summary(code, message, **matrix_parts)
def
dump
(
self
,
fp
=
sys
.
stdout
):
head
=
list
(
sorted
(
self
.
column
.
keys
()))
cols
=
[
self
.
column
[
k
]
for
k
in
head
]
...
...
@@ -298,7 +323,7 @@ class _Loader(object):
self
.
target
.
add_file_data
(
basename
,
analysis
)
def
_main
(
args
,
output
):
def
_main
(
args
):
"""
Aggregate JSON into a Matrix then call the Matrix' analyze method.
This function allows
...
...
@@ -329,14 +354,16 @@ def _main( args, output ):
m
.
add_file_data
(
filename
,
content
)
else
:
raise
RuntimeError
(
"{} is neither file nor directory"
.
format
(
s
)
)
m
.
analyze
()
if
m
.
status
:
# ...is other than STATUS_NO_OUTLIERS
if
args
.
report
:
report
=
args
.
report
.
lower
()
if
report
.
start
swith
(
"
text
"
):
Plaintext
(
m
).
render
(
sys
.
stdout
)
el
if
report
.
startswith
(
"html"
)
:
HTML
(
m
).
render
(
sys
.
stdout
)
with
open
(
args
.
report
,
"w"
)
as
fp
:
if
args
.
report
.
lower
().
end
swith
(
"
html
"
):
m
.
summary
().
render_html
(
fp
)
el
se
:
m
.
summary
().
render_text
(
fp
)
if
args
.
dump
:
with
open
(
args
.
dump
,
"w"
)
as
fp
:
m
.
dump
(
fp
)
...
...
@@ -415,5 +442,5 @@ if __name__=="__main__":
if
_args
.
exclude
:
re
.
compile
(
_args
.
exclude
)
_main
(
_args
,
sys
.
stdout
)
sys
.
exit
(
_main
(
_args
)
)
This diff is collapsed.
Click to expand it.
src/bdqc/column.py
View file @
f248fc0c
...
...
@@ -214,7 +214,7 @@ class Vector(object):
return
len
(
self
.
value_histogram
)
==
1
def
missing_indices
(
self
):
def
indices_with_null
(
self
):
"""
Return a list of the indices of elements with missing data (None).
Incidentally, this list is guaranteed to be sorted.
...
...
@@ -223,14 +223,14 @@ class Vector(object):
"""
return
tuple
([
i
for
i
in
range
(
len
(
self
))
if
self
[
i
]
is
None
])
def
present_indic
es
(
self
):
def
indices_with_valu
es
(
self
):
"""
Return a list of the indices of elements with NON-MISSING data.
This is the complement of the set returned by
missing_indices
.
This is the complement of the set returned by
indices_with_null
.
"""
return
tuple
([
i
for
i
in
range
(
len
(
self
))
if
self
[
i
]
is
not
None
])
def
minor_type
_indice
s
(
self
):
def
indices_with_
minor
ity
_types
(
self
):
"""
Return a list of the indices of elements with types in the minority.
"""
...
...
@@ -239,7 +239,7 @@ class Vector(object):
if
self
.
types
[
k
]
==
MINOR_TYPE_COUNT
])
return
tuple
([
i
for
i
in
range
(
len
(
self
))
if
type
(
self
[
i
]).
__name__
[
0
]
in
MINOR_TYPES
])
def
outlier
_indice
s
(
self
):
def
indices_with_
outliers
(
self
):
"""
Return a list of the indices of outlier elements.
...
...
@@ -295,6 +295,6 @@ if __name__=="__main__":
elif
vec
.
is_single_valued
():
print
(
"single-valued"
)
else
:
# "outliers" exist. Name them!
for
i
in
vec
.
outlier
_indice
s
():
for
i
in
vec
.
indices_with_
outliers
():
print
(
i
,
vec
[
i
]
)
This diff is collapsed.
Click to expand it.
src/bdqc/incidence.py
deleted
100644 → 0
View file @
46aec1b7
class Matrix(object):
    """
    Encapsulates generation of an HTML table representing an incidence
    matrix.

    Alternative input representations.
    Create an incidence matrix of {rows} X {columns} from either:
    1. explicit NxM table
    2. tagged lists
        name1: {'foo','baz',...}
        name2: {'bar','baz',...}
        name3: {'foo','bar',...}
    3. incidence bitstrings with key
        name1: 0xa8b889c
        name2: 0xb988194
        ...
        key [ 'foo','bar','baz','bozzle',... ]

    The constructor below accepts the explicit table form (1).
    """

    def __init__(self, body, row_major=True, **args):
        """
        body: list of equal-length rows; each cell value is emitted as the
            id attribute of the corresponding <td>.
        row_major: accepted for interface compatibility; not consulted by
            this class.
        args: optional keywords
            'row_labels'    -- labels for rows (default "1", "2", ...)
            'column_labels' -- labels for columns (default "1", "2", ...)
            'hl'            -- table parallel to body; truthy cells are
                               rendered highlighted. When omitted (or
                               None) no cell is highlighted.
        """
        self.body = body
        # Ensure body is complete, not a ragged array.
        assert all(len(r) == len(body[0]) for r in body[1:])
        if 'row_labels' in args:
            self.row_labels = args['row_labels']
        else:
            self.row_labels = [str(i + 1) for i in range(len(body))]
        if 'column_labels' in args:
            self.column_labels = args['column_labels']
        else:
            # An empty body has no first row to measure.
            width = len(body[0]) if body else 0
            self.column_labels = [str(i + 1) for i in range(width)]
        # Always define the attribute so render_html never hits an
        # AttributeError when no highlight table was supplied.
        self.hl = args.get('hl')

    def render_html(self, fp):
        """
        Emit HTML5 markup representing an incidence matrix.

        fp: writable text stream receiving the markup.
        """
        # The prelude
        print('<table id="incidence_matrix">\n<caption></caption>\n', file=fp)
        # The header
        print('<thead class="im">\n<tr>\n<th></th>\n', file=fp)
        for label in self.column_labels:
            print('<th scope="col" class="im">{}</th>\n'.format(label), file=fp)
        print('</tr>\n</thead>\n', file=fp)
        # The body
        print('<tbody class="im">\n', file=fp)
        for ro, row in enumerate(self.body):
            print('<tr>\n<th scope="row" class="im">{}</th>\n'.format(
                self.row_labels[ro]), file=fp)
            for co, cell in enumerate(row):
                # Without a highlight table every cell gets the plain color.
                highlighted = self.hl is not None and self.hl[ro][co]
                color = "red" if highlighted else "white"
                print('<td class="im" id="{}" style="background:{}"></td>'.format(
                    cell, color), file=fp)
            print('</tr>\n', file=fp)
        # The trailer
        print(' </tbody>\n<tfoot class="im">\n</tfoot>\n</table>', file=fp)
# Unit test
if
__name__
==
"__main__"
:
import
random
import
sys
This diff is collapsed.
Click to expand it.
src/bdqc/scan.py
View file @
f248fc0c
...
...
@@ -29,7 +29,6 @@ import io
import
bdqc.plugin
import
bdqc.dir
from
bdqc.report
import
HTML
,
Plaintext
from
bdqc.analysis
import
Matrix
from
bdqc.statpath
import
selectors
...
...
@@ -308,7 +307,7 @@ class Executor(object):
return
missing
def
main
(
args
):
def
_
main
(
args
):
# Build lists of plugins...
...
...
@@ -376,14 +375,16 @@ def main( args ):
prog_fp
.
close
()
if
m
:
status
=
m
.
analyze
()
if
status
:
# ...is other than STATUS_NO_OUTLIERS
if
args
.
report
:
report
=
args
.
report
.
lower
()
if
report
.
start
swith
(
"
text
"
):
Plaintext
(
m
).
render
(
sys
.
stdout
)
el
if
report
.
startswith
(
"html"
)
:
HTML
(
m
).
render
(
sys
.
stdout
)
with
open
(
args
.
report
,
"w"
)
as
fp
:
if
args
.
report
.
lower
().
end
swith
(
"
html
"
):
m
.
summary
().
render_html
(
fp
)
el
se
:
m
.
summary
().
render_text
(
fp
)
if
missing
>
0
:
logging
.
warning
(
"{} file(s) were missing"
.
format
(
missing
)
)
...
...
@@ -527,6 +528,6 @@ if __name__=="__main__":
re
.
compile
(
_args
.
include
)
if
_args
.
exclude
:
re
.
compile
(
_args
.
exclude
)
main
(
_args
)
sys
.
exit
(
_
main
(
_args
)
)
This diff is collapsed.
Click to expand it.
src/bdqc/
report
.py
→
src/bdqc/
summary
.py
View file @
f248fc0c
import
pkgutil
class
HTML
(
object
):
"""
TODO: lots
"""
def
__init__
(
self
,
source
):
self
.
source
=
source
def
_render_incidence_matrix
(
self
,
fp
):
"""
Emit HTML5 markup representing an incidence matrix.
"""
assert
hasattr
(
self
.
source
,
"anom_col"
)
\
and
isinstance
(
self
.
source
.
anom_col
,
list
)
\
and
all
([
isinstance
(
i
,
str
)
for
i
in
self
.
source
.
anom_col
])
body
=
[
[
"c{}_{}"
.
format
(
r
,
c
)
for
c
in
range
(
len
(
self
.
source
.
anom_col
))
]
for
r
in
range
(
len
(
self
.
source
.
anom_row
))
]
hl
=
[
[
fi
in
self
.
source
.
column
[
k
].
outlier_indices
()
for
k
in
self
.
source
.
anom_col
]
for
fi
in
self
.
source
.
anom_row
]
row_labels
=
[
self
.
source
.
files
[
fi
]
for
fi
in
self
.
source
.
anom_row
]
# The prelude
print
(
'<table id="incidence_matrix">
\n
<caption></caption>
\n
'
,
file
=
fp
)
# The header
print
(
'<thead class="im">
\n
<tr>
\n
<th></th>
\n
'
,
file
=
fp
)
for
label
in
self
.
source
.
anom_col
:
print
(
'<th scope="col" class="im">{}</th>
\n
'
.
format
(
label
),
file
=
fp
)
print
(
'</tr>
\n
</thead>
\n
'
,
file
=
fp
)
# The body
print
(
'<tbody class="im">
\n
'
,
file
=
fp
)
for
ro
in
range
(
len
(
body
)):
print
(
'<tr>
\n
<th scope="row" class="im">{}</th>
\n
'
.
format
(
row_labels
[
ro
]
),
file
=
fp
)
row
=
body
[
ro
]
for
co
in
range
(
len
(
row
)):
color
=
"red"
if
hl
[
ro
][
co
]
else
"white"
print
(
'<td class="im" id="{}" style="background:{}"></td>'
.
format
(
body
[
ro
][
co
],
color
),
file
=
fp
)
print
(
'</tr>
\n
'
,
file
=
fp
)
# The trailer
print
(
' </tbody>
\n
<tfoot class="im">
\n
</tfoot>
\n
</table>'
,
file
=
fp
)
def
render
(
self
,
fp
):
assert
hasattr
(
self
.
source
,
"status"
)
class
Summary
(
object
):
def
__init__
(
self
,
status
,
msg
,
body
,
rows
,
cols
):
self
.
status
=
status
self
.
msg
=
msg
self
.
body
=
body
self
.
rows
=
rows
self
.
cols
=
cols
def
render_html
(
self
,
fp
):
print
(
"""<!DOCTYPE html>
<html>
<head>
...
...
@@ -108,8 +76,6 @@ class HTML(object):
<body onload="initDocument()">
"""
)
self
.
_render_incidence_matrix
(
fp
)
####################################################################
# ...allowing mouse hover to select plots for display
####################################################################
...
...
@@ -147,41 +113,43 @@ class HTML(object):
</html>
"""
)
class
Plaintext
(
object
):
"""
TODO: lots
"""
def
__init__
(
self
,
source
):
self
.
source
=
source
@
staticmethod
def
_render_im
(
im
,
fp
):
def
render_text
(
self
,
fp
):
"""
Render summary, including an incidence matrix, as ASCII.
"""
print
(
"Status:"
,
self
.
msg
,
file
=
fp
)
print
(
"Incidence matrix:"
,
file
=
fp
)
NR
=
len
(
im
[
'
body
'
]
)
NC
=
len
(
im
[
'
cols
'
]
)
assert
NR
==
len
(
im
[
'
rows
'
]
)
wid
=
max
([
len
(
r
)
for
r
in
im
[
'
rows
'
]
])
NR
=
len
(
self
.
body
)
NC
=
len
(
self
.
cols
)
assert
NR
==
len
(
self
.
rows
)
wid
=
max
([
len
(
r
)
for
r
in
self
.
rows
])
FMT
=
"{{0:>{}s}} {{1}}"
.
format
(
wid
)
# Print headers
# This will handle up to 9999 columns but only the 10's and 1's
# values will be printed as column headers.
CIX
=
[
"{:04d}"
.
format
(
i
+
1
)
for
i
in
range
(
0
,
NC
)
]
print
(
' '
*
wid
,
''
.
join
([
s
[
2
]
for
s
in
CIX
])
)
print
(
' '
*
wid
,
''
.
join
([
s
[
3
]
for
s
in
CIX
])
)
print
(
' '
*
wid
,
''
.
join
([
s
[
2
]
for
s
in
CIX
])
,
file
=
fp
)
print
(
' '
*
wid
,
''
.
join
([
s
[
3
]
for
s
in
CIX
])
,
file
=
fp
)
for
rn
in
range
(
NR
):
print
(
FMT
.
format
(
im
[
'
rows
'
]
[
rn
],
''
.
join
([
'+'
if
f
else
'-'
for
f
in
im
[
'
body
'
]
[
rn
]
])
),
self
.
rows
[
rn
],
''
.
join
([
'+'
if
f
else
'-'
for
f
in
self
.
body
[
rn
]
])
),
file
=
fp
)
print
(
"Column legend:"
,
file
=
fp
)
for
cn
in
range
(
NC
):
print
(
cn
+
1
,
im
[
'cols'
][
cn
],
sep
=
"
\t
"
,
file
=
fp
)
def
render
(
self
,
fp
):
print
(
"Status:"
,
self
.
source
.
status_msg
()
)
if
self
.
source
.
status
:
im
=
self
.
source
.
incidence_matrix
()
Plaintext
.
_render_im
(
im
,
fp
)
print
(
cn
+
1
,
self
.
cols
[
cn
],
sep
=
"
\t
"
,
file
=
fp
)
if __name__ == "__main__":
    import sys
    import bdqc.analysis

    # Self-test: render a small hand-built summary (4 rows x 3 columns)
    # as plain text on standard output.
    s = Summary(
        bdqc.analysis.STATUS_VALUE_OUTLIERS,
        "whatever",
        [
            [0, 1, 0],
            [0, 0, 1],
            [1, 0, 0],
            [0, 1, 1],
        ],
        ["r1", "r2", "r3", "r4"],
        ["c1", "c2", "c3"],
    )
    s.render_text(sys.stdout)
This diff is collapsed.
Click to expand it.
src/setup.py
View file @
f248fc0c
from
distutils.core
import
setup
,
Extension
setup
(
name
=
"bdqc"
,
version
=
"0.45.
1
"
,
version
=
"0.45.
2
"
,
description
=
"Framework for QC of
\"
Big Data
\"
"
,
long_description
=
"""
\
Framework for QC of
\"
Big Data
\"
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment