-
Notifications
You must be signed in to change notification settings - Fork 0
/
mod_run_lefse.py
executable file
·141 lines (130 loc) · 7.78 KB
/
mod_run_lefse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python
import os
import sys
import math
import pickle
from lefse import *
# import pdb
import pandas as pd
def read_params(args):
parser = argparse.ArgumentParser(description='LEfSe 1.0')
parser.add_argument('input_file', metavar='INPUT_FILE', type=str, help="the input file")
parser.add_argument('output_file', metavar='OUTPUT_FILE', type=str,
help="the output file containing the data for the visualization module")
# too many feature will cause R RRuntimeWarning: protect()
parser.add_argument('--top', dest="top", metavar='int', type=int, default=6000,
help="max number of features to keep before internal wilcoxon")
parser.add_argument('-o', dest="out_text_file", metavar='str', type=str, default="",
help="set the file for exporting the result (only concise textual form)")
parser.add_argument('-a', dest="anova_alpha", metavar='float', type=float, default=0.05,
help="set the alpha value for the Anova test (default 0.05)")
parser.add_argument('-w', dest="wilcoxon_alpha", metavar='float', type=float, default=0.05,
help="set the alpha value for the Wilcoxon test (default 0.05)")
parser.add_argument('-l', dest="lda_abs_th", metavar='float', type=float, default=2.0,
help="set the threshold on the absolute value of the logarithmic LDA score (default 2.0)")
parser.add_argument('--nlogs', dest="nlogs", metavar='int', type=int, default=3,
help="max log ingluence of LDA coeff")
parser.add_argument('--verbose', dest="verbose", metavar='int', choices=[0, 1], type=int, default=0,
help="verbose execution (default 0)")
parser.add_argument('--wilc', dest="wilc", metavar='int', choices=[0, 1], type=int, default=1,
help="wheter to perform the Wicoxon step (default 1)")
parser.add_argument('-r', dest="rank_tec", metavar='str', choices=['lda', 'svm'], type=str, default='lda',
help="select LDA or SVM for effect size (default LDA)")
parser.add_argument('--svm_norm', dest="svm_norm", metavar='int', choices=[0, 1], type=int, default=1,
help="whether to normalize the data in [0,1] for SVM feature waiting (default 1 strongly suggested)")
parser.add_argument('-b', dest="n_boots", metavar='int', type=int, default=30,
help="set the number of bootstrap iteration for LDA (default 30)")
parser.add_argument('-e', dest="only_same_subcl", metavar='int', type=int, default=0,
help="set whether perform the wilcoxon test only among the subclasses with the same name (default 0)")
parser.add_argument('-c', dest="curv", metavar='int', type=int, default=0,
help="set whether perform the wilcoxon test ing the Curtis's approach [BETA VERSION] (default 0)")
parser.add_argument('-f', dest="f_boots", metavar='float', type=float, default=0.67,
help="set the subsampling fraction value for each bootstrap iteration (default 0.66666)")
parser.add_argument('-s', dest="strict", choices=[0, 1, 2], type=int, default=0,
help="set the multiple testing correction options. 0 no correction (more strict, default), 1 correction for independent comparisons, 2 correction for independent comparison")
# parser.add_argument('-m',dest="m_boots", type=int, default=5,
# help="minimum cardinality of classes in each bootstrapping iteration (default 5)")
parser.add_argument('--min_c', dest="min_c", metavar='int', type=int, default=10,
help="minimum number of samples per subclass for performing wilcoxon test (default 10)")
parser.add_argument('-t', dest="title", metavar='str', type=str, default="",
help="set the title of the analysis (default input file without extension)")
parser.add_argument('-y', dest="multiclass_strat", choices=[0, 1], type=int, default=0,
help="(for multiclass tasks) set whether the test is performed in a one-against-one ( 1 - more strict!) or in a one-against-all setting ( 0 - less strict) (default 0)")
args = parser.parse_args()
params = vars(args)
if params['title'] == "":
params['title'] = params['input_file'].split("/")[-1].split('.')[0]
return params
if __name__ == '__main__':
init()
params = read_params(sys.argv)
feats, cls, class_sl, subclass_sl, class_hierarchy = load_data(params['input_file'])
kord, cls_means = get_class_means(class_sl, feats)
wilcoxon_res = {}
kw_n_ok = 0
nf = 0
feats_pv = pd.Series()
for feat_name, feat_values in feats.items():
if params['verbose']:
print "Testing feature", str(nf), ": ", feat_name,
nf += 1
kw_ok, pv = test_kw_r(cls, feat_values, params['anova_alpha'], sorted(cls.keys()))
# pdb.set_trace()
if not kw_ok:
if params['verbose']:
print "\tkw ko"
del feats[feat_name]
wilcoxon_res[feat_name] = "-"
continue
# Select top significant features
if len(feats_pv) < params['top']:
kw_n_ok += 1
feats_pv[feat_name] = pv
elif pv < feats_pv.max():
index_max = feats_pv.idxmax()
del feats[index_max], feats_pv[index_max]
# wilcoxon_res[index_max] = "-"
feats_pv[feat_name] = pv
else:
del feats[feat_name]
continue
if params['verbose']:
print "\tkw ok\t",
if not params['wilc']:
continue
# kw_n_ok += 1
res_wilcoxon_rep = test_rep_wilcoxon_r(subclass_sl, class_hierarchy, feat_values, params['wilcoxon_alpha'], params[
'multiclass_strat'], params['strict'], feat_name, params['min_c'], params['only_same_subcl'], params['curv'])
wilcoxon_res[feat_name] = str(pv) if res_wilcoxon_rep else "-"
if not res_wilcoxon_rep:
if params['verbose']:
print "wilc ko"
del feats[feat_name], feats_pv[feat_name]
elif params['verbose']:
print "wilc ok\t"
if len(feats) > 0:
print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
if params['lda_abs_th'] < 0.0:
lda_res, lda_res_th = dict([(k, 0.0) for k, v in feats.items()]), dict([(k, v) for k, v in feats.items()])
else:
if params['rank_tec'] == 'lda':
lda_res, lda_res_th = test_lda_r(
cls, feats, class_sl, params['n_boots'], params['f_boots'], params['lda_abs_th'], 0.0000000001, params['nlogs'])
elif params['rank_tec'] == 'svm':
lda_res, lda_res_th = test_svm(
cls, feats, class_sl, params['n_boots'], params['f_boots'], params['lda_abs_th'], 0.0, params['svm_norm'])
else:
lda_res, lda_res_th = dict([(k, 0.0) for k, v in feats.items()]), dict([(k, v)
for k, v in feats.items()])
else:
print "Number of significantly discriminative features:", len(feats), "(", kw_n_ok, ") before internal wilcoxon"
print "No features with significant differences between the two classes"
lda_res, lda_res_th = {}, {}
outres = {}
outres['lda_res_th'] = lda_res_th
outres['lda_res'] = lda_res
outres['cls_means'] = cls_means
outres['cls_means_kord'] = kord
outres['wilcox_res'] = wilcoxon_res
print "Number of discriminative features with abs LDA score >", params['lda_abs_th'], ":", len(lda_res_th)
save_res(outres, params["output_file"])