-
Notifications
You must be signed in to change notification settings - Fork 3
/
ClassFactorization.py
68 lines (48 loc) · 1.75 KB
/
ClassFactorization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""ClassFactorization.py
Using a user defined dictionary, will convert text labels of a class into an "int" value for more convenience
when running ML algorithms. e.g. 'Y' to 1, 'N' to 0
"""
import pandas as pd
from pathlib import Path
protected_key_words = ['train', 'test']
conversion = {'Y': 1, 'N': 0, 'Q': 2, 'U': 3}
def _data_multi_factorize(*args):
"""Converts text labels in gold standard to integers based on global conversion dictionary
e.g. 'Y' to 1
Converts anything that is not an integer and not in dictionary to -1 (which will be ignored by classifers later on)
"""
x = args[0]
global conversion, protected_key_words
if x in conversion:
return conversion[x]
elif isinstance(x, int):
pass
else:
return -1
def main(gold_csv = None, conversion_dict = None, work_dir = None):
while gold_csv is None or Path(gold_csv).exists() is False:
gold_csv = input("Please enter path for gold csv or q to quit:")
if gold_csv == 'q':
exit()
gold_csv = Path(gold_csv)
if conversion_dict is not None:
global conversion
conversion = conversion_dict
df = pd.read_csv(gold_csv, index_col=0)
print("Columns: ")
print(list(df.columns))
for task in df.columns:
if task in protected_key_words:
continue
df[task] = df[task].apply(_data_multi_factorize)
print(df.head())
if work_dir is None:
path_out = gold_csv.parent / (gold_csv.stem + '_multiclass.csv')
else:
path_out= Path(work_dir) / (gold_csv.stem + '_multiclass.csv')
df.to_csv(path_out)
print("Factorized Gold Standard saved to: ", str(path_out))
if __name__ == '__main__':
main()