Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Open sidebar
Ian Bowman
Hippocampus Gene Expression Atlas -- CIC Data Integrated Software
Commits
923688ec
Commit
923688ec
authored
7 years ago
by
Ian Bowman
Browse files
Options
Download
Email Patches
Plain Diff
New multiscale k-means code.
parent
32acfae0
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
315 additions
and
109 deletions
+315
-109
src/char_clust.py
src/char_clust.py
+134
-0
src/hpf_kmeans.py
src/hpf_kmeans.py
+29
-3
src/hpf_utils.py
src/hpf_utils.py
+6
-0
src/ms_char_clust.sh
src/ms_char_clust.sh
+23
-0
src/ms_run_k-means_row_col_mat.sh
src/ms_run_k-means_row_col_mat.sh
+28
-0
src/qsub_char_clust.sh
src/qsub_char_clust.sh
+26
-0
src/qsub_run_k-means_row_col_mat.sh
src/qsub_run_k-means_row_col_mat.sh
+23
-0
src/run_k-means_row_col_mat.py
src/run_k-means_row_col_mat.py
+45
-105
tests/test_hpf_kmeans.py
tests/test_hpf_kmeans.py
+1
-1
No files found.
src/char_clust.py
0 → 100755
View file @
923688ec
#!/usr/bin/python
# Initial assumptions: N x N connectivity matrix (if not N x N will need to pad)
from
__future__
import
print_function
import
argparse
,
csv
,
os
import
numpy
as
np
import
hpf_utils
,
hpf_ms
import
sys
def main():
    """Characterize repeated k-means clustering runs stored in a CSV.

    Command line:
        -i/--input_csv  CSV of k-means runs (required), as written by
                        run_k-means_row_col_mat.py
        -H/--header     also emit a header row (typically on first write)
        -v/--verbose    print progress messages

    Computes multi-scale partition-similarity metrics (via hpf_ms) plus
    inertia summary statistics over all runs, then writes a single CSV
    row (optionally preceded by a header row) to STDOUT.
    """
    parser = argparse.ArgumentParser(description=
        "Characterize repeated runs of k-means clustering recorded in CSV")
    parser.add_argument('-i', '--input_csv',
        help='Input, CSV containing multiple runs of k-means clustering',
        required=True)
    parser.add_argument('-H', '--header',
        help='Include header info, typically used first write',
        action='store_true')
    parser.add_argument('-v', '--verbose',
        help='Print relevant but optional output',
        action='store_true')
    args = vars(parser.parse_args())
    input_csv_path = args['input_csv']
    assert os.path.isfile(input_csv_path),\
        "can't find input csv file {}".format(input_csv_path)
    display_header = args['header']
    verbose = args['verbose']
    #parse input csv into kmeans run arr dict, all values are strings
    # [ { 'run' : run
    #     'k' : k
    #     'inertia' : inertia,
    #     'partition' : k-means partition frozen set}, ... ]
    kmeans_run_dct_arr = hpf_utils.read_kmeans_run_dct_arr(input_csv_path)
    run_npa = np.array([x['run'] for x in kmeans_run_dct_arr])
    inertia_npa = np.array([x['inertia'] for x in kmeans_run_dct_arr])
    #FIND METRIC PREREQS
    # roi_name_lst: sorted ROI names, taken from the first partition seen
    # partition_lst: one partition (evaluated from its CSV string) per run
    roi_name_lst = []
    partition_lst = []
    for run in kmeans_run_dct_arr:
        # NOTE(review): eval of a CSV field -- only safe on trusted input;
        # the field presumably holds a frozenset literal (TODO confirm)
        partition = eval(run['partition']) #sorry God
        if len(roi_name_lst) == 0:
            roi_name_lst = sorted(hpf_utils.flatten(partition))
        partition_lst.append(partition)
    # assumes 'run' values number the runs 1..N, so the max is the run
    # count -- TODO confirm; per the note above they may still be strings
    num_runs = np.max(run_npa)
    #CALCULATE METRICS
    # #multi-scale stuff
    res_dct = {}
    if verbose:
        print("calculating std_dev_w_alpha_beta...")
    import time
    start = time.time()
    std_dev_w_alpha_beta = hpf_ms.calc_std_w_alpha_beta(
        roi_name_lst=roi_name_lst,
        cmt_str_lst_lst=partition_lst,
        res_dct=res_dct)
    if verbose:
        print("done in {}s".format(time.time()-start))
    if verbose:
        print("calculating mean_var_z_alpha_beta...")
    start = time.time()
    # mean_var: presumably (mean partition similarity, variance of
    # partition similarity) -- verify against hpf_ms implementation
    mean_var = hpf_ms.calc_mean_var_z_alpha_beta(
        roi_name_lst=roi_name_lst,
        std_w_alpha_beta=std_dev_w_alpha_beta,
        cmt_str_lst_lst=partition_lst,
        M=hpf_ms.n_choose_2(len(partition_lst)),
        res_dct=res_dct)
    if verbose:
        print("done in {}s".format(time.time()-start))
    # keep the raw (unevaluated) partition string of the first run whose
    # inertia equals the minimum of the collected values.
    # NOTE(review): if the inertias are still strings, np.min is a
    # lexicographic minimum -- confirm the reader converts to numbers
    min_inertia_partition = []
    for run in kmeans_run_dct_arr:
        if len(min_inertia_partition) == 0 and \
            run['inertia'] == np.min(inertia_npa):
            min_inertia_partition = run['partition']
    # #okay phew! now calculate consensus community structure
    # if verbose:
    #     print("calculating cons_cmt_str...")
    #     start = time.time()
    # cons_cmt_str = hpf_ms.calc_cons_cmt_str(roi_name_lst=roi_name_lst,
    #     cmt_str_lst_lst = cmt_str_lst_lst,
    #     gamma=louvain_run_arr_dict[0]['gamma'],
    #     runs=num_runs,
    #     tau=0.1)
    # if verbose:
    #     print("done in {}s".format(time.time()-start))
    assert len(kmeans_run_dct_arr) > 0 #reasonable assumption i hope
    # create print dict with these values
    # keys are (sort_order, column_name) tuples so sorted() below yields a
    # stable CSV column order
    print_dict = {}
    print_dict[(0, 'num runs')] = num_runs
    print_dict[(1, 'k')] = kmeans_run_dct_arr[0]['k']
    # print_dict[(1, 'max num com')] = np.max(num_com_npa)
    # print_dict[(2, 'mean num com')] = np.mean(num_com_npa)
    # print_dict[(3, 'std dev num com')]=np.std(num_com_npa)
    print_dict[(4, 'min i')] = np.min(inertia_npa)
    print_dict[(5, 'mean i')] = np.mean(inertia_npa)
    print_dict[(6, 'std i')] = np.std(inertia_npa)
    # print_dict[(7 ,'unique com str')] = "{}/{}".format(unique_cmt_str, num_runs)
    # print_dict[(8, 'com str mode count')]= "{}/{}".\
    #     format(cmt_str_mode_count.tolist(), num_runs)
    # print_dict[(9, 'com str var index')] = cmt_str_var_index
    print_dict[(10, 'mean part sim')] = mean_var[0]
    print_dict[(11, 'var part sim')] = mean_var[1]
    # print_dict[(11, 'consensus com str')] = cons_cmt_str
    print_dict[(12, 'min i part')] = min_inertia_partition
    #write to std out in csv file format
    if verbose:
        print("writing csv output to STDOUT")
    csvwriter = csv.writer(sys.stdout)
    if display_header:
        # header row: the human-readable name half of each key tuple
        row = []
        for key in sorted(print_dict.keys()):
            row.append(key[1])
        csvwriter.writerow(row)
    row = []
    for key in sorted(print_dict.keys()):
        row.append(print_dict[key])
    csvwriter.writerow(row)

if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
src/hpf_kmeans.py
View file @
923688ec
#k-means analysis methods
from
__future__
import
print_function
import
sys
from
sklearn.cluster
import
KMeans
#returns kmeans_lst: list of kmeans results
#see example here for more info:
#http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
def run_kmeans(row_col_mat_npa, k, runs, verbose=False):
    """Run k-means `runs` independent times on the row/column matrix.

    Args:
        row_col_mat_npa: 2-D numpy array fed to KMeans.fit (assumed
            samples-by-features -- TODO confirm against caller)
        k: number of clusters (n_clusters)
        runs: number of independent k-means runs to perform
        verbose: if True, print a carriage-return progress percentage

    Returns:
        list of fitted sklearn.cluster.KMeans objects, one per run
    """
    kmeans_lst = []
    # was: for idx, run in enumerate(xrange(runs)) -- redundant, since
    # idx == run and run was never used
    for idx in xrange(runs):
        if verbose:
            pct_str = "\r{0:0.2f}% complete... ".\
                format((float(idx) / float(runs)) * 100.0)
            print(pct_str, end='')
            sys.stdout.flush() #force the \r-overwritten line to display
        kmeans_lst.append(KMeans(n_clusters=k).fit(row_col_mat_npa))
    if verbose:
        pct_str = "\r{0:0.2f}% complete... ".format(100)
        print(pct_str)
    return kmeans_lst
#wraps the results of a single k-means run in a plain record dict
#returns
# {'run' : number of run (1-based)
#  'k' : k value
#  'partition' : partition of clusters found by k-means
#  'inertia' : final inertia of the fitted model}
def build_kmeans_run_dct(run_index, kmeans, k, partition):
    """Package one k-means run as a record dict; run number is 1-based."""
    record = {}
    record['run'] = run_index + 1
    record['k'] = k
    record['partition'] = partition
    record['inertia'] = kmeans.inertia_
    return record
This diff is collapsed.
Click to expand it.
src/hpf_utils.py
View file @
923688ec
...
...
@@ -206,6 +206,8 @@ def build_louvain_run_dict(run_index, q, community_structure_dict, gamma):
'gamma'
:
gamma
,
'community_structure'
:
community_structure_dict
}
def
parse_louvain_run_dict
(
key_index_arr
,
vals_row
):
assert
len
(
key_index_arr
)
==
len
(
vals_row
),
"length of {}
\n
({}) !=
\n
length of {} ({})"
.
format
(
key_index_arr
,
len
(
key_index_arr
),
vals_row
,
len
(
vals_row
))
...
...
@@ -237,6 +239,10 @@ def calc_cmt_str_variability(cmt_cnt_lst, num_runs):
assert
assert_runs
==
num_runs
return
var_index
#convenience method that does the same thing as read louvain, so just call that
#thin alias: k-means runs are stored in the same CSV layout as louvain
#runs, so parsing is delegated to the louvain reader
def read_kmeans_run_dct_arr(input_csv_path):
    """Parse a k-means runs CSV (same format as the louvain runs CSV)."""
    return read_louvain_run_arr_dict(input_csv_path)
def
read_louvain_run_arr_dict
(
input_csv_path
):
louvain_run_arr_dict
=
[]
...
...
This diff is collapsed.
Click to expand it.
src/ms_char_clust.sh
0 → 100755
View file @
923688ec
#!/bin/bash
# Submit one qsub characterization job per file matched by <wildcard>.
# Usage: ms_char_clust.sh <wildcard> (<python>)
if [ -z "$1" ]
then
    echo "Usage: $(basename $0) <wildcard> (<python>)"
    exit 1
fi

files=$1

#read in python that works e.g. /usr/local/python-2.7.6/bin/python
python='python'
if [ -n "$2" ]
then
    python=$2
fi

# one cluster job per matched file
for file in $files
do
    qsub -cwd -q compute.q -l hostslots=1 src/qsub_char_clust.sh $file $python
done
This diff is collapsed.
Click to expand it.
src/ms_run_k-means_row_col_mat.sh
0 → 100755
View file @
923688ec
#!/bin/bash
# Sweep k from <min-k> to <max-k> in steps of <k-inc>, submitting one qsub
# k-means job per k value. Arithmetic goes through bc, so non-integer k
# values and increments are accepted.
if [ ! -n "$5" ]
then
    echo "Usage: `basename $0` <row-column-input-csv> <min-k> <max-k> <k-inc> <num-runs> (<python>)"
    exit 1
fi
row_column_input_csv=$1;
min_k=$2;
max_k=$3;
k_inc=$4;
num_runs=$5;
#read in python that works e.g. /usr/local/python-2.7.6/bin/python
python='python'
if [ -n "$6" ]
then
    python=$6;
fi
k=$min_k;
# bc -l prints 1 while k <= max_k, which (( )) treats as true
while (( $(echo "$k <= $max_k" | bc -l) )); do
    qsub -cwd -q compute.q -l hostslots=1 src/qsub_run_k-means_row_col_mat.sh $row_column_input_csv $k $num_runs $python
    echo $k
    # advance k via bc so fractional increments work
    k=$(echo "$k + $k_inc" | bc);
done;
This diff is collapsed.
Click to expand it.
src/qsub_char_clust.sh
0 → 100755
View file @
923688ec
#!/bin/bash
# qsub worker: characterize one k-means results CSV with char_clust.py,
# writing char_clust_<basename> next to the input. Idempotent: skips files
# whose output already exists.
# Usage: qsub_char_clust.sh <file> (<python>)
# (shebang added for consistency with the sibling ms_*/qsub_* scripts)
if [ ! -n "$1" ]
then
    echo "Usage: `basename $0` <file> (<python>)"
    exit 1
fi
file=$1
python="python"
if [ -n "$2" ]
then
    python=$2;
else
    echo "using default python"
fi
source venv/bin/activate
basepath=`basename "$file"`
# renamed from 'dirname' to avoid confusion with the dirname command
dir_name=`dirname "$file"`
out_file="$dir_name/char_clust_$basepath"
if [ ! -f "$out_file" ]
then
    # invoke directly (quoted) instead of expanding an unquoted $cmd
    # string, which breaks on paths containing whitespace
    "$python" src/char_clust.py -v -i "$file" -H > "$out_file"
fi
This diff is collapsed.
Click to expand it.
src/qsub_run_k-means_row_col_mat.sh
0 → 100755
View file @
923688ec
#!/bin/bash
# qsub worker: run k-means clustering once for a given k and run count.
# Usage: qsub_run_k-means_row_col_mat.sh <row-column-input-csv> <k> <num-runs> (<python>)
if [ -z "$3" ]
then
    echo "Usage: $(basename $0) <row-column-input-csv> <k> <num-runs> (<python>)"
    exit 1
fi

row_column_input_csv=$1
k=$2
num_runs=$3

python="python"
if [ -n "$4" ]
then
    python=$4
else
    echo "using default python"
fi

source venv/bin/activate
$python src/run_k-means_row_col_mat.py -i $row_column_input_csv -k $k -r $num_runs
This diff is collapsed.
Click to expand it.
src/run_k-means_row_col_mat.py
View file @
923688ec
...
...
@@ -3,8 +3,9 @@
from
__future__
import
print_function
import
argparse
,
os
import
hpf_utils
#import csv, bct, sys
#import cPickle as pickle
import
hpf_kmeans
import
csv
import
cPickle
as
pickle
def
main
():
parser
=
argparse
.
ArgumentParser
(
description
=
"Runs kmeans clustering on NxM (square or rectangular) row source, column destination format CSV"
)
...
...
@@ -25,12 +26,12 @@ def main():
args
=
vars
(
parser
.
parse_args
())
#get output path and related args first in case just checking for that
k
=
floa
t
(
args
[
'k'
])
k
=
in
t
(
args
[
'k'
])
runs
=
int
(
args
[
'runs'
])
verbose
=
args
[
'verbose'
]
input_csv_path
=
args
[
'input_csv'
]
name_of_study
=
input_csv_path
.
replace
(
".csv"
,
""
)
output_csv_path
=
"{}_k-{:
.2f
}_runs-{:04d}.csv"
.
\
output_csv_path
=
"{}_k-{:
03d
}_runs-{:04d}.csv"
.
\
format
(
name_of_study
,
k
,
runs
)
print_output_path
=
args
[
'print_output_path'
]
if
print_output_path
:
...
...
@@ -42,112 +43,51 @@ def main():
"can't find input csv file {}"
.
format
(
input_csv_path
)
#OPEN, READ INPUT CSV
(
row_roi_name_npa
,
col_roi_name_npa
,
ctx
_mat_npa
)
=
\
hpf_utils
.
read_ctx_mat
(
input_csv_path
)
(
row_roi_name_npa
,
col_roi_name_npa
,
row_col
_mat_npa
)
=
\
hpf_utils
.
read_ctx_mat
(
input_csv_path
)
#double check formatting, shape of array
# if hpf_utils.is_sq(row_roi_name_npa=row_roi_name_npa,\
# col_roi_name_npa=col_roi_name_npa,\
# ctx_mat_npa=ctx_mat_npa):
# roi_name_npa = row_roi_name_npa
# sq_ctx_mat_npa = ctx_mat_npa
# else: #else if not square, then either pad or convert
# if verbose:
# shape = ctx_mat_npa.shape
# print("Matrix in {} is not square\nrows {}, columns {}".\
# format(input_csv_path, shape[0], shape[1]))
# print("and/or row ROIs \n{}\n don't match col ROIs\n{}".format(row_roi_name_npa, col_roi_name_npa))
# if pad_to_sq:
# if verbose:
# print("padding to square...")
# (pad_row_roi_name_npa, pad_col_roi_name_npa, sq_ctx_mat_npa) = \
# hpf_utils.pad_rect_ctx_mat_to_sq(row_roi_name_npa,
# col_roi_name_npa, ctx_mat_npa)
# roi_name_npa = \
# pad_col_roi_name_npa #CONVENTION:only use the col name rois
# else: # conv to square (duplicate nodes on left and top)
# if verbose:
# print("converting to square...")
# (sq_roi_name_npa, sq_ctx_mat_npa) = \
# hpf_utils.conv_rect_ctx_mat_to_sq(row_roi_name_npa,
# col_roi_name_npa, ctx_mat_npa)
# roi_name_npa = sq_roi_name_npa
# if verbose:
# shape = sq_ctx_mat_npa.shape
# print("with rows/cols {}/{} done".format(shape[0], shape[1]))
# connectivity_matrix_npa = sq_ctx_mat_npa
# #check for capitlization/duplicates
# hpf_utils.dup_check_container(dup_check_roi_container=roi_name_npa,
# input_csv_path = input_csv_path)
if
verbose
:
print
(
"Running k-means"
.
format
(
runs
))
# #CALL LOUVAIN
# #generate louvain run arr dict, format
# # [ { 'run' : run_index + 1,
# # 'num_communities' : len(community_structure_dict.keys(),
# # 'q' : q,
# # 'gamma' : gamma # redundant but that's better than the alternative
# # 'community_structure' : community_structure_dict}, ... ]
# louvain_run_arr_dict = []
# for run_index in xrange(runs):
# pct_str ="\r{0:0.2f}% complete... ".\
# format((float(run_index)/float(runs))*100.0)
# print(pct_str ,end='')
# sys.stdout.flush()
# (ci, q) = bct.modularity_louvain_dir(W=connectivity_matrix_npa,\
# gamma=gamma)
# assert len(ci) == roi_name_npa.size,\
# "Uh-oh, found commmunities don't make sense"
# community_structure_dict = hpf_utils.build_community_structure_dict(\
# ci = ci,
# roi_name_npa =roi_name_npa)
# #create wrapper dict for community structure dict
# louvain_run_dict = hpf_utils.build_louvain_run_dict(
# run_index=run_index,
# q=q,
# community_structure_dict = community_structure_dict,
# gamma = gamma)
# louvain_run_arr_dict.append(louvain_run_dict)
# pct_str ="\r{0:0.2f}% complete... ".format(100)
# print(pct_str)
# #WRITE LOUVAIN TO OUTPUT CSV
# #write louvain_run_arr_dict to output_csv_path
# assert len(louvain_run_arr_dict) > 0,\
# "Your louvain run ain't got no results bro" #reasonable assumption
# with open(output_csv_path, 'wb') as csvfile:
# csvwriter = csv.writer(csvfile)
# #create key index automatically
# key_index_arr = sorted(louvain_run_arr_dict[0].keys(), reverse=True)
# csvwriter.writerow(key_index_arr)
# #RUN K-MEANS AND BUILD KMEANS RUN ARR DICT
kmeans_run_dct_arr
=
[]
kmeans_lst
=
hpf_kmeans
.
run_kmeans
(
row_col_mat_npa
=
row_col_mat_npa
,
k
=
k
,
runs
=
runs
,
verbose
=
verbose
)
for
idx
,
kmeans
in
enumerate
(
kmeans_lst
):
kmeans_run_dct_arr
.
append
(
hpf_kmeans
.
build_kmeans_run_dct
(
run_index
=
idx
,
kmeans
=
kmeans
,
k
=
k
,
partition
=
hpf_utils
.
partition
(
ci
=
kmeans
.
labels_
.
tolist
(),
roi_name_arr
=
row_roi_name_npa
.
tolist
())))
#WRITE KMEANS OUTPUT TO CSV
#write partitions_lst to output_csv_path
assert
len
(
kmeans_run_dct_arr
)
>
0
,
\
"Your kmeans run ain't got no results bro"
#reasonable assumption
with
open
(
output_csv_path
,
'wb'
)
as
csvfile
:
csvwriter
=
csv
.
writer
(
csvfile
)
#create key index automatically
key_index_arr
=
sorted
(
kmeans_run_dct_arr
[
0
].
keys
(),
reverse
=
True
)
csvwriter
.
writerow
(
key_index_arr
)
#
for m in
louvain_run_arr
_d
i
ct:
#
map_val_arr = []
#
#follow keys defined in key_index_arr
#
for key in key_index_arr:
#
map_val_arr.append(m[key])
#
csvwriter.writerow(map_val_arr)
#
print("Wrote
Louvain
results to {}".format(output_csv_path))
#
output_pickle_path = hpf_utils.pickle_path(output_csv_path)
#
if verbose:
#
print("Wrote pickle args to {}".format(output_pickle_path))
#
pickle.dump(args, open(output_pickle_path, "wb"))
for
m
in
kmeans_run
_dct
_arr
:
map_val_arr
=
[]
#follow keys defined in key_index_arr
for
key
in
key_index_arr
:
map_val_arr
.
append
(
m
[
key
])
csvwriter
.
writerow
(
map_val_arr
)
if
verbose
:
print
(
"Wrote
kmeans
results to {}"
.
format
(
output_csv_path
))
output_pickle_path
=
hpf_utils
.
pickle_path
(
output_csv_path
)
if
verbose
:
print
(
"Wrote pickle args to {}"
.
format
(
output_pickle_path
))
pickle
.
dump
(
args
,
open
(
output_pickle_path
,
"wb"
))
if
__name__
==
"__main__"
:
main
()
...
...
This diff is collapsed.
Click to expand it.
tests/test_hpf_kmeans.py
View file @
923688ec
...
...
@@ -22,7 +22,7 @@ class TestHpfKmeans(unittest.TestCase):
frozenset
([
'2'
,
'4'
,
'6'
])])]
kmeans_lst
=
hpf_kmeans
.
run_kmeans
(
row_col_mat_npa
=
row_col_mat_npa
,
k
=
k
,
runs
=
runs
)
k
=
k
,
runs
=
runs
)
kmean_cis
=
[
kmeans
.
labels_
.
tolist
()
for
kmeans
in
kmeans_lst
]
kmean_partitions
=
[]
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment