-
Notifications
You must be signed in to change notification settings - Fork 0
/
vocabcreater.py
117 lines (93 loc) · 2.67 KB
/
vocabcreater.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import string
import re
from collections import Counter
import json
import nltk
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag, map_tag
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import math
def createTokens(content):
    """Tokenize *content* into a sorted list of lower-case word tokens.

    Decodes bytes input as UTF-8 (ignoring errors), lower-cases, splits with
    NLTK's wordpunct tokenizer, and keeps only tokens that are not in the
    module-level ``stop`` set and that start with at least two ASCII letters.

    Fix: the original called ``str.decode``/``unicode`` unconditionally,
    which only works on Python 2 and round-tripped every token through
    encode/decode for no effect. Decoding once up front is equivalent for
    UTF-8 input and also runs on Python 3.

    :param content: raw document text (``bytes`` or text).
    :return: sorted list of accepted tokens (text strings).
    """
    if isinstance(content, bytes):
        content = content.decode('utf-8', 'ignore')
    content = content.lower()
    tokens = []
    # sort first so the returned token list is in lexicographic order,
    # matching the original behavior (tok.sort() before filtering)
    for word in sorted(wordpunct_tokenize(content)):
        if word not in stop and re.match(r'[a-z]{2}', word):
            tokens.append(word)
    return tokens
def get_wordnet_pos(treebank_tag):
    """Translate a POS-tag prefix into the corresponding WordNet POS code.

    Despite the parameter name, the callers in this file pass *universal*
    tagset tags ('ADJ', 'VERB', 'NOUN', 'ADV'); the prefix checks below
    match those. Returns '' when no prefix matches, which callers treat as
    "do not lemmatize".
    """
    # Checked in the original's order; 'ADJ' must precede 'ADV' lookups
    # conceptually, though their prefixes never collide.
    prefix_to_pos = (
        ('ADJ', 'a'),   # adjective
        ('V',   'v'),   # verb (matches 'VERB')
        ('NO',  'n'),   # noun (matches 'NOUN')
        ('ADV', 'r'),   # adverb
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return ''
def lem(tokens):
    """Lemmatize *tokens* using POS-aware WordNet lemmatization.

    Each token is POS-tagged, the Penn Treebank tag is mapped to the
    universal tagset, and tokens whose universal tag corresponds to a
    WordNet POS (via get_wordnet_pos) are lemmatized with that POS; all
    other tokens pass through unchanged.

    :param tokens: list of word strings.
    :return: list of lemmatized (or original) words, same order/length.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, treebank_tag in nltk.pos_tag(tokens):
        universal_tag = map_tag('en-ptb', 'universal', treebank_tag)
        wn_pos = get_wordnet_pos(universal_tag)
        lemmas.append(lemmatizer.lemmatize(word, pos=wn_pos) if wn_pos else word)
    return lemmas
def createVocabulary(content):
    """Turn raw document text into a list of lemmatized tokens.

    Pipeline: tokenize + filter stopwords (createTokens), then
    POS-aware lemmatization (lem).
    """
    return lem(createTokens(content))
# Stopword set read by createTokens (module-level on purpose).
stop = set(stopwords.words('english'))


def _collect_vocabulary(texts):
    """Build a de-duplicated vocabulary from an iterable of raw documents.

    Keeps tokens longer than 3 characters that do not contain the
    anonymization marker "xxx", preserving first-seen order. Accepted
    words are stored UTF-8 encoded, matching the original script.

    Fixes over the inlined original loops: the O(n^2) ``word not in
    vocabulary`` list scan is replaced by a ``seen`` set keyed on the
    pre-encoded word (equivalent result, and dedup also works on
    Python 3 where str != bytes), and the identical training/test
    loops are shared.
    """
    vocab = []
    seen = set()
    for content in texts:
        for word in createVocabulary(content):
            if word in seen:
                continue
            if len(word) > 3 and "xxx" not in word:
                seen.add(word)
                vocab.append(word.encode("utf-8"))
    return vocab


# --- training vocabulary -------------------------------------------------
d = pd.read_csv('SampleForNaive.csv')
Id = list(d.id.unique())
data = d['data'].tolist()
category = d['class'].tolist()
vocabulary = _collect_vocabulary(data)

# --- test vocabulary -----------------------------------------------------
d2 = pd.read_csv('test.csv')
# BUG FIX: the original re-read ids from the training frame `d` here;
# the test frame `d2` was clearly intended.
Id = list(d2.id.unique())
data1 = d2['data'].tolist()
category1 = d2['category'].tolist()
vocabularyfortest = _collect_vocabulary(data1)