-
Notifications
You must be signed in to change notification settings - Fork 0
/
MyParser.py
127 lines (109 loc) · 3.58 KB
/
MyParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from itertools import chain
class MyParser:
    """Load a tagged corpus file and expose feature-candidate listings.

    The file is read line by line; every line is split on whitespace and
    every resulting token is split into a ``(word, tag)`` pair on its last
    underscore (``word_TAG``).

    Attributes:
        fileName: Path handed to the constructor.
        lines: One list of raw ``word_TAG`` tokens per line of the file
            (blank lines yield an empty list).
        splitted: Same shape as ``lines`` but holding ``(word, tag)`` tuples.
    """

    fileName = None
    lines = None
    splitted = None

    def __init__(self, fileName):
        """Read and tokenize ``fileName``.

        Raises:
            OSError: If the file cannot be opened.
            IndexError: If a token contains no underscore.
        """
        super().__init__()
        self.fileName = fileName
        # Context manager so the handle is closed even if parsing fails.
        # str.split() with no argument already discards the trailing newline.
        with open(self.fileName) as corpus:
            self.lines = [line.split() for line in corpus]
        self.splitted = [
            [self.createTuple(token) for token in line] for line in self.lines
        ]

    def createTuple(self, x):
        """Split a ``word_TAG`` token into a ``(word, tag)`` tuple.

        The split happens on the *last* underscore so that words which
        themselves contain underscores keep their full text and real tag.

        Raises:
            IndexError: If ``x`` contains no underscore.
        """
        parts = x.rsplit('_', 1)
        return (parts[0], parts[1])

    def getWordsWithTag(self):
        """Return the sorted list of distinct ``(word, tag)`` pairs."""
        return sorted(set(chain.from_iterable(self.splitted)))

    def getAllThreeTagsCombinations(self):
        """Return the sorted distinct tag trigrams ``(t1, t2, t3)``.

        Sentence starts are padded with ``"*"``: every sentence contributes
        ``("*", "*", first_tag)`` and, when it has at least two tokens,
        ``("*", first_tag, second_tag)``.
        """
        trigrams = set()
        for line in self.splitted:
            if not line:
                # Blank line in the corpus: nothing to index.
                continue
            for (_, t1), (_, t2), (_, t3) in zip(line, line[1:], line[2:]):
                trigrams.add((t1, t2, t3))
            trigrams.add(("*", "*", line[0][1]))
            if len(line) > 1:
                trigrams.add(("*", line[0][1], line[1][1]))
        return sorted(trigrams)

    def getAllPrevWordTagCombinations(self):
        """Return the sorted distinct ``(previous_word, current_tag)`` pairs.

        The first token of each sentence is paired with ``"*"`` as its
        previous word.
        """
        pairs = set()
        for line in self.splitted:
            if not line:
                continue
            for (w_prev, _), (_, t_curr) in zip(line, line[1:]):
                pairs.add((w_prev, t_curr))
            pairs.add(("*", line[0][1]))
        return sorted(pairs)

    def getAllNextWordTagCombinations(self):
        """Return the sorted distinct ``(next_word, current_tag)`` pairs.

        The last token of each sentence is paired with ``"SEN-END"`` as its
        next word.
        """
        pairs = set()
        for line in self.splitted:
            if not line:
                continue
            for (_, t_curr), (w_next, _) in zip(line, line[1:]):
                pairs.add((w_next, t_curr))
            pairs.add(("SEN-END", line[-1][1]))
        return sorted(pairs)

    def getAllPairTagsCombinations(self):
        """Return the sorted distinct tag bigrams ``(t1, t2)``.

        Every sentence also contributes ``("*", first_tag)``.
        """
        bigrams = set()
        for line in self.splitted:
            if not line:
                continue
            for (_, t1), (_, t2) in zip(line, line[1:]):
                bigrams.add((t1, t2))
            bigrams.add(("*", line[0][1]))
        return sorted(bigrams)

    def getUniqueTags(self):
        """Return the sorted list of distinct tags seen in the corpus."""
        return sorted({tag for _, tag in self.getWordsWithTag()})

    def getSeenWordsToTagsDict(self):
        """Return a dict mapping each word to the list of tags it was seen with.

        The pairs come from ``getWordsWithTag`` and are therefore already
        unique and sorted, so each per-word tag list is duplicate-free and in
        sorted order.
        """
        tags_by_word = {}
        for word, tag in self.getWordsWithTag():
            tags_by_word.setdefault(word, []).append(tag)
        return tags_by_word

    def getAllTagsForPrefix(self, prefixes):
        """Return sorted distinct ``(word, prefix, tag)`` triples.

        A triple is produced for every distinct ``(word, tag)`` pair and every
        entry of ``prefixes`` that the word starts with.
        """
        matches = {
            (word, prefix, tag)
            for word, tag in self.getWordsWithTag()
            for prefix in prefixes
            if word.startswith(prefix)
        }
        return sorted(matches)

    def getAllTagsForSuffix(self, suffixes):
        """Return sorted distinct ``(word, suffix, tag)`` triples.

        A triple is produced for every distinct ``(word, tag)`` pair and every
        entry of ``suffixes`` that the word ends with.
        """
        matches = {
            (word, suffix, tag)
            for word, tag in self.getWordsWithTag()
            for suffix in suffixes
            if word.endswith(suffix)
        }
        return sorted(matches)

    def getAllTagsForLettersNumbers(self, digits):
        """Return ``(word, leading, tag)`` triples for words starting with an entry of ``digits``.

        Results follow the order of ``getWordsWithTag`` and, within a word,
        the order of ``digits``; they are not de-duplicated.
        """
        return [
            (word, leading, tag)
            for word, tag in self.getWordsWithTag()
            for leading in digits
            if str(word).startswith(leading)
        ]

    def getAllTagsForCaps(self):
        """Return the ``(word, tag)`` pairs whose word is not all-lowercase.

        The filter is ``not str.islower()``, so words without any cased
        character (for example pure numbers or punctuation) are included too.
        """
        return [
            (word, tag)
            for word, tag in self.getWordsWithTag()
            if not str(word).islower()
        ]

    def getAllTagsForDigitLetters(self):
        """Return the ``(word, tag)`` pairs whose word contains at least one digit."""
        return [
            (word, tag)
            for word, tag in self.getWordsWithTag()
            if any(ch.isdigit() for ch in str(word))
        ]