Commit 923688ec authored by Ian Bowman

New multiscale k-means code.

parent 32acfae0
#!/usr/bin/python
# Initial assumptions: N x N connectivity matrix (if not N x N, it will need to be padded)
from __future__ import print_function
import argparse, csv, os
import numpy as np
import hpf_utils, hpf_ms
import sys
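#example invocation (input csv name illustrative; cf. qsub_char_clust.sh below):
#   python src/char_clust.py -v -i study_k-010_runs-0100.csv -H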
def main():
parser = argparse.ArgumentParser(description="Characterize repeated runs of k-means clustering recorded in CSV")
parser.add_argument('-i','--input_csv',
help='Input, CSV containing multiple runs of k-means clustering',
required=True)
parser.add_argument('-H', '--header',
        help='Include header info, typically used on first write',
action='store_true')
parser.add_argument('-v', '--verbose',
help='Print relevant but optional output',
action='store_true')
args = vars(parser.parse_args())
input_csv_path = args['input_csv']
assert os.path.isfile(input_csv_path),\
"can't find input csv file {}".format(input_csv_path)
display_header = args['header']
verbose = args['verbose']
    #parse input csv into kmeans run dct arr; all values are strings
    # [ { 'run' : run,
    #     'k' : k,
    #     'inertia' : inertia,
    #     'partition' : k-means partition frozenset }, ... ]
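    # e.g., one entry (illustrative values only):
    # { 'run' : '1', 'k' : '10', 'inertia' : '1234.5',
    #   'partition' : "frozenset([frozenset(['roi_a', 'roi_b']), ...])" }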
kmeans_run_dct_arr = hpf_utils.read_kmeans_run_dct_arr(input_csv_path)
    #csv values are strings (see above); convert so the numeric ops below work
    run_npa = np.array([int(x['run']) for x in kmeans_run_dct_arr])
    inertia_npa = np.array([float(x['inertia']) for x in kmeans_run_dct_arr])
#FIND METRIC PREREQS
roi_name_lst = []
partition_lst = []
for run in kmeans_run_dct_arr:
        partition = eval(run['partition']) #sorry God; ast.literal_eval can't parse frozenset reprs
if len(roi_name_lst) == 0:
roi_name_lst = sorted(
hpf_utils.flatten(partition))
partition_lst.append(partition)
num_runs = np.max(run_npa)
#CALCULATE METRICS
# #multi-scale stuff
res_dct = {}
if verbose:
print("calculating std_dev_w_alpha_beta...")
import time
start = time.time()
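    #per the hpf_ms naming, presumably the standard deviation of the pairwise
    #(alpha, beta) partition-agreement terms across all runs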
std_dev_w_alpha_beta = hpf_ms.calc_std_w_alpha_beta(
roi_name_lst=roi_name_lst,
cmt_str_lst_lst=partition_lst,
res_dct=res_dct)
if verbose:
print("done in {}s".format(time.time()-start))
if verbose:
print("calculating mean_var_z_alpha_beta...")
start = time.time()
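    #mean and variance of the z-scored (alpha, beta) partition similarity over
    #all M = C(num_partitions, 2) run pairs; returns a (mean, var) pair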
mean_var = hpf_ms.calc_mean_var_z_alpha_beta(
roi_name_lst=roi_name_lst,
std_w_alpha_beta=std_dev_w_alpha_beta,
cmt_str_lst_lst=partition_lst,
M=hpf_ms.n_choose_2(len(partition_lst)),
res_dct=res_dct)
if verbose:
print("done in {}s".format(time.time()-start))
    min_inertia_partition = []
    for run in kmeans_run_dct_arr:
        #inertia is a string in the csv, so compare as float
        if len(min_inertia_partition) == 0 and \
           float(run['inertia']) == np.min(inertia_npa):
            min_inertia_partition = run['partition']
# #okay phew! now calculate consensus community structure
# if verbose:
# print("calculating cons_cmt_str...")
# start = time.time()
# cons_cmt_str = hpf_ms.calc_cons_cmt_str(roi_name_lst=roi_name_lst,
# cmt_str_lst_lst = cmt_str_lst_lst,
# gamma=louvain_run_arr_dict[0]['gamma'],
# runs=num_runs,
# tau=0.1)
# if verbose:
# print("done in {}s".format(time.time()-start))
assert len(kmeans_run_dct_arr) > 0 #reasonable assumption i hope
# create print dict with these values
print_dict = {}
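    #keys are (sort_index, label) tuples so the csv columns come out in a
    #stable order; gaps in the indices line up with the commented-out metrics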
print_dict[(0, 'num runs')] = num_runs
print_dict[(1, 'k')] = kmeans_run_dct_arr[0]['k']
# print_dict[(1, 'max num com')] = np.max(num_com_npa)
# print_dict[(2, 'mean num com')] = np.mean(num_com_npa)
# print_dict[(3, 'std dev num com')]=np.std(num_com_npa)
print_dict[(4, 'min i')] = np.min(inertia_npa)
print_dict[(5, 'mean i')] = np.mean(inertia_npa)
print_dict[(6, 'std i')] = np.std(inertia_npa)
# print_dict[(7 ,'unique com str')] = "{}/{}".format(unique_cmt_str, num_runs)
# print_dict[(8, 'com str mode count')]= "{}/{}".\
# format(cmt_str_mode_count.tolist(), num_runs)
# print_dict[(9, 'com str var index')] = cmt_str_var_index
print_dict[(10, 'mean part sim')] = mean_var[0]
print_dict[(11, 'var part sim')] = mean_var[1]
# print_dict[(11, 'consensus com str')] = cons_cmt_str
print_dict[(12, 'min i part')] = min_inertia_partition
#write to std out in csv file format
if verbose:
print("writing csv output to STDOUT")
csvwriter = csv.writer(sys.stdout)
if display_header:
row = []
for key in sorted(print_dict.keys()):
row.append(key[1])
csvwriter.writerow(row)
row = []
for key in sorted(print_dict.keys()):
row.append(print_dict[key])
csvwriter.writerow(row)
if __name__=="__main__":
main()
#k-means analysis methods
from __future__ import print_function
import sys
from sklearn.cluster import KMeans
#returns kmeans_lst: list of fitted sklearn KMeans objects
#see example here for more info:
#http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
def run_kmeans(row_col_mat_npa, k, runs, verbose=False):
    kmeans_lst = []
    for idx in xrange(runs):
if verbose:
pct_str ="\r{0:0.2f}% complete... ".\
format((float(idx)/float(runs))*100.0)
print(pct_str ,end='')
sys.stdout.flush()
kmeans_lst.append(KMeans(n_clusters=k).fit(row_col_mat_npa))
if verbose:
pct_str ="\r{0:0.2f}% complete... ".format(100)
print(pct_str)
return kmeans_lst
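#example usage (hypothetical data; k and runs chosen for illustration):
#   import numpy as np
#   mat = np.random.rand(20, 20)
#   fits = run_kmeans(row_col_mat_npa=mat, k=3, runs=5, verbose=True)
#   best = min(fits, key=lambda km: km.inertia_) #lowest within-cluster SSE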
#returns
# {'run' : 1-based number of the run,
#  'k' : k value,
#  'inertia' : within-cluster sum of squares for this fit,
#  'partition' : partition of clusters found by k-means}
def build_kmeans_run_dct(run_index, kmeans, k, partition):
return {
'run': run_index + 1,
'k' : k,
'partition' : partition,
'inertia' : kmeans.inertia_}
@@ -206,6 +206,8 @@ def build_louvain_run_dict(run_index, q, community_structure_dict, gamma):
'gamma' : gamma,
'community_structure' : community_structure_dict}
def parse_louvain_run_dict(key_index_arr, vals_row):
assert len(key_index_arr) == len(vals_row), "length of {}\n ({}) != \nlength of {} ({})".format(key_index_arr, len(key_index_arr), vals_row, len(vals_row))
@@ -237,6 +239,10 @@ def calc_cmt_str_variability(cmt_cnt_lst, num_runs):
assert assert_runs == num_runs
return var_index
#convenience method that does the same thing as read louvain, so just call that
def read_kmeans_run_dct_arr(input_csv_path):
return read_louvain_run_arr_dict(input_csv_path=input_csv_path)
def read_louvain_run_arr_dict(input_csv_path):
louvain_run_arr_dict = []
......
#!/bin/bash
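#submits one char_clust job (qsub_char_clust.sh) per file matching <wildcard>
#to the SGE compute.q queue; quote the wildcard so this script expands it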
if [ ! -n "$1" ]
then
echo "Usage: `basename $0` <wildcard> (<python>)"
exit 1
fi
files=$1
#optional second arg: a python interpreter that works, e.g. /usr/local/python-2.7.6/bin/python
python='python'
if [ -n "$2" ]
then
python=$2;
fi
for file in $files
do
qsub -cwd -q compute.q -l hostslots=1 src/qsub_char_clust.sh $file $python
done
#!/bin/bash
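#sweeps k from <min-k> to <max-k> in steps of <k-inc> (bc allows non-integer
#increments), submitting one qsub_run_k-means_row_col_mat.sh job per k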
if [ ! -n "$5" ]
then
echo "Usage: `basename $0` <row-column-input-csv> <min-k> <max-k> <k-inc> <num-runs> (<python>)"
exit 1
fi
row_column_input_csv=$1;
min_k=$2;
max_k=$3;
k_inc=$4;
num_runs=$5;
#optional sixth arg: a python interpreter that works, e.g. /usr/local/python-2.7.6/bin/python
python='python'
if [ -n "$6" ]
then
python=$6;
fi
k=$min_k;
while (( $(echo "$k <= $max_k" | bc -l) )); do
qsub -cwd -q compute.q -l hostslots=1 src/qsub_run_k-means_row_col_mat.sh $row_column_input_csv $k $num_runs $python
echo $k
k=$(echo "$k + $k_inc" | bc);
done;
#!/bin/bash
if [ ! -n "$1" ]
then
echo "Usage: `basename $0` <file> (<python>)"
exit 1
fi
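#job payload: runs char_clust.py on <file> and writes the summary next to it
#as char_clust_<basename>; skipped if that output file already exists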
file=$1
python="python"
if [ -n "$2" ]
then
python=$2;
else
echo "using default python"
fi
source venv/bin/activate
basepath=`basename $file`
dirname=`dirname $file`
out_file="$dirname/char_clust_$basepath"
if [ ! -f "$out_file" ]
then
cmd="$python src/char_clust.py -v -i $file -H"
$cmd > $out_file
fi
#!/bin/bash
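#job payload: runs k-means (run_k-means_row_col_mat.py) on the given CSV with
#the given k and number of runs, inside the project venv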
if [ ! -n "$3" ]
then
echo "Usage: `basename $0` <row-column-input-csv> <k> <num-runs> (<python>)"
exit 1
fi
row_column_input_csv=$1;
k=$2;
num_runs=$3;
python="python"
if [ -n "$4" ]
then
python=$4;
else
echo "using default python"
fi
source venv/bin/activate
$python src/run_k-means_row_col_mat.py -i $row_column_input_csv -k $k -r $num_runs
@@ -3,8 +3,9 @@
from __future__ import print_function
import argparse, os
import hpf_utils
import hpf_kmeans
import csv
import cPickle as pickle
def main():
    parser = argparse.ArgumentParser(description="Runs k-means clustering on an NxM (square or rectangular) CSV in row=source, column=destination format")
@@ -25,12 +26,12 @@ def main():
args = vars(parser.parse_args())
#get output path and related args first in case just checking for that
    k = int(args['k'])
runs = int(args['runs'])
verbose = args['verbose']
input_csv_path = args['input_csv']
name_of_study = input_csv_path.replace(".csv", "")
output_csv_path = "{}_k-{:.2f}_runs-{:04d}.csv".\
output_csv_path = "{}_k-{:03d}_runs-{:04d}.csv".\
format(name_of_study, k, runs)
print_output_path = args['print_output_path']
if print_output_path:
@@ -42,112 +43,51 @@ def main():
"can't find input csv file {}".format(input_csv_path)
#OPEN, READ INPUT CSV
    (row_roi_name_npa, col_roi_name_npa, row_col_mat_npa) = \
        hpf_utils.read_ctx_mat(input_csv_path)
#double check formatting, shape of array
# if hpf_utils.is_sq(row_roi_name_npa=row_roi_name_npa,\
# col_roi_name_npa=col_roi_name_npa,\
# ctx_mat_npa=ctx_mat_npa):
# roi_name_npa = row_roi_name_npa
# sq_ctx_mat_npa = ctx_mat_npa
# else: #else if not square, then either pad or convert
# if verbose:
# shape = ctx_mat_npa.shape
# print("Matrix in {} is not square\nrows {}, columns {}".\
# format(input_csv_path, shape[0], shape[1]))
# print("and/or row ROIs \n{}\n don't match col ROIs\n{}".format(row_roi_name_npa, col_roi_name_npa))
# if pad_to_sq:
# if verbose:
# print("padding to square...")
# (pad_row_roi_name_npa, pad_col_roi_name_npa, sq_ctx_mat_npa) = \
# hpf_utils.pad_rect_ctx_mat_to_sq(row_roi_name_npa,
# col_roi_name_npa, ctx_mat_npa)
# roi_name_npa = \
# pad_col_roi_name_npa #CONVENTION:only use the col name rois
# else: # conv to square (duplicate nodes on left and top)
# if verbose:
# print("converting to square...")
# (sq_roi_name_npa, sq_ctx_mat_npa) = \
# hpf_utils.conv_rect_ctx_mat_to_sq(row_roi_name_npa,
# col_roi_name_npa, ctx_mat_npa)
# roi_name_npa = sq_roi_name_npa
# if verbose:
# shape = sq_ctx_mat_npa.shape
# print("with rows/cols {}/{} done".format(shape[0], shape[1]))
# connectivity_matrix_npa = sq_ctx_mat_npa
# #check for capitalization/duplicates
# hpf_utils.dup_check_container(dup_check_roi_container=roi_name_npa,
# input_csv_path = input_csv_path)
    if verbose:
        print("Running k-means, {} runs".format(runs))
# #CALL LOUVAIN
# #generate louvain run arr dict, format
# # [ { 'run' : run_index + 1,
# # 'num_communities' : len(community_structure_dict.keys(),
# # 'q' : q,
# # 'gamma' : gamma # redundant but that's better than the alternative
# # 'community_structure' : community_structure_dict}, ... ]
# louvain_run_arr_dict = []
# for run_index in xrange(runs):
# pct_str ="\r{0:0.2f}% complete... ".\
# format((float(run_index)/float(runs))*100.0)
# print(pct_str ,end='')
# sys.stdout.flush()
# (ci, q) = bct.modularity_louvain_dir(W=connectivity_matrix_npa,\
# gamma=gamma)
# assert len(ci) == roi_name_npa.size,\
# "Uh-oh, found commmunities don't make sense"
# community_structure_dict = hpf_utils.build_community_structure_dict(\
# ci = ci,
# roi_name_npa =roi_name_npa)
# #create wrapper dict for community structure dict
# louvain_run_dict = hpf_utils.build_louvain_run_dict(
# run_index=run_index,
# q=q,
# community_structure_dict = community_structure_dict,
# gamma = gamma)
# louvain_run_arr_dict.append(louvain_run_dict)
# pct_str ="\r{0:0.2f}% complete... ".format(100)
# print(pct_str)
# #WRITE LOUVAIN TO OUTPUT CSV
# #write louvain_run_arr_dict to output_csv_path
# assert len(louvain_run_arr_dict) > 0,\
# "Your louvain run ain't got no results bro" #reasonable assumption
# with open(output_csv_path, 'wb') as csvfile:
# csvwriter = csv.writer(csvfile)
# #create key index automatically
# key_index_arr = sorted(louvain_run_arr_dict[0].keys(), reverse=True)
# csvwriter.writerow(key_index_arr)
#RUN K-MEANS AND BUILD KMEANS RUN ARR DICT
kmeans_run_dct_arr = []
kmeans_lst = hpf_kmeans.run_kmeans(row_col_mat_npa=row_col_mat_npa,
k=k, runs=runs, verbose=verbose)
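    #each partition is a frozenset of frozensets of ROI names, one inner set
    #per cluster (cf. the frozenset partitions in the unit test below)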
for idx, kmeans in enumerate(kmeans_lst):
kmeans_run_dct_arr.append(
hpf_kmeans.build_kmeans_run_dct(
run_index=idx,
kmeans=kmeans,
k=k,
partition= hpf_utils.partition(
ci=kmeans.labels_.tolist(),
roi_name_arr=row_roi_name_npa.tolist())))
#WRITE KMEANS OUTPUT TO CSV
#write partitions_lst to output_csv_path
assert len(kmeans_run_dct_arr) > 0,\
"Your kmeans run ain't got no results bro" #reasonable assumption
with open(output_csv_path, 'wb') as csvfile:
csvwriter = csv.writer(csvfile)
#create key index automatically
key_index_arr = sorted(kmeans_run_dct_arr[0].keys(), reverse=True)
csvwriter.writerow(key_index_arr)
# for m in louvain_run_arr_dict:
# map_val_arr = []
# #follow keys defined in key_index_arr
# for key in key_index_arr:
# map_val_arr.append(m[key])
# csvwriter.writerow(map_val_arr)
# print("Wrote Louvain results to {}".format(output_csv_path))
# output_pickle_path = hpf_utils.pickle_path(output_csv_path)
# if verbose:
# print("Wrote pickle args to {}".format(output_pickle_path))
# pickle.dump(args, open(output_pickle_path, "wb"))
for m in kmeans_run_dct_arr:
map_val_arr = []
#follow keys defined in key_index_arr
for key in key_index_arr:
map_val_arr.append(m[key])
csvwriter.writerow(map_val_arr)
if verbose:
print("Wrote kmeans results to {}".format(output_csv_path))
output_pickle_path = hpf_utils.pickle_path(output_csv_path)
if verbose:
print("Wrote pickle args to {}".format(output_pickle_path))
pickle.dump(args, open(output_pickle_path, "wb"))
if __name__ == "__main__":
main()
......
@@ -22,7 +22,7 @@ class TestHpfKmeans(unittest.TestCase):
frozenset(['2', '4', '6'])])]
        kmeans_lst = hpf_kmeans.run_kmeans(row_col_mat_npa=row_col_mat_npa,
                                           k=k, runs=runs)
kmean_cis = [kmeans.labels_.tolist() for kmeans in kmeans_lst]
kmean_partitions = []
......