-
Notifications
You must be signed in to change notification settings - Fork 0
/
FeatureWords.py
91 lines (83 loc) Β· 3.03 KB
/
FeatureWords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import csv
from nltk import bigrams
from nltk.corpus import stopwords
import re
from PreProcessTweets import *
import pickle
__author__ = 'Ritvika'
#start getfeatureVector
def getFeatureVector(tweet_tokens):
    """Return a list of lowercased unigram features from tweet tokens.

    Each token is stripped of surrounding quote/punctuation characters,
    then discarded if it is an English stop word, one of the AT_USER/URL
    placeholders, a single lowercase letter, or does not start with an
    alphabetic character.
    """
    featureVector = []
    # Hoist loop invariants: stopwords.words() re-reads the corpus file on
    # every call, so build the set once; likewise compile the regexes once
    # instead of re-parsing them per token.
    stop_words = set(stopwords.words('english'))
    word_pattern = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
    single_lower = re.compile(r'^[a-z]$')
    for w in tweet_tokens:
        # strip punctuation
        w = w.strip('\'"?,.')
        # NOTE: stop-word test is case-sensitive (pre-lowering), matching
        # the original behavior — e.g. 'The' is NOT filtered, 'the' is.
        if (w in stop_words or w == 'AT_USER' or w == 'URL'
                or single_lower.search(w) or word_pattern.search(w) is None):
            continue
        featureVector.append(w.lower())
    return featureVector
#end
def getBigramFeatureVector(tweet_tokens):
    """Return a list of bigram features (pairs of adjacent kept tokens).

    Tokens are filtered exactly as in getFeatureVector — note the strip
    set here additionally includes '!' — then paired with nltk.bigrams.
    """
    temp = []
    # Hoist loop invariants: building the stop-word set and compiling the
    # regexes once avoids re-reading the corpus / re-parsing per token.
    stop_words = set(stopwords.words('english'))
    word_pattern = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
    single_lower = re.compile(r'^[a-z]$')
    for w in tweet_tokens:
        w = w.strip('\'"!?,.')
        if (w in stop_words or w == 'AT_USER' or w == 'URL'
                or single_lower.search(w) or word_pattern.search(w) is None):
            continue
        temp.append(w.lower())
    # bigrams() yields a generator of adjacent pairs; materialize it
    featureVector = list(bigrams(temp))
    return featureVector
def getFeatureVectorTrain():
#print " \t getFeatureVector called "
featureVector = []
print "inside getFeatureVectorTrain \n"
with open("ProcessedTweetTokens.txt",'r') as pt:
fp = open('FeatureVectors.txt','a')
print "ProcessedTweetTokens file opened \n"
tweet_tokens = pickle.load(pt)
for w in tweet_tokens:
w = w.strip('\'"?,.')
#check if the word stats with an alphabet
val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
#ignore if it is a stop word
if(w in stopwords.words('english') or w == 'AT_USER' or w == 'URL' or re.search(r'^[a-z]$',w) or val is None):
continue
else:
featureVector.append(w.lower())
#print type(featureVector)
pickle.dump(featureVector,fp)
print "feature vector written \n"
#Read the tweets one by one and process it
#getFeatureVectorTrain()
def printFeatureVector():
    """Disabled demo driver; currently a no-op that returns None.

    The commented examples below show two ways this module was exercised:
    reading raw tweets line-by-line from a text file, or pulling them out
    of a CSV corpus, then printing each tweet's feature vector.
    """
    # Example 1: line-by-line from a plain text file.
    # fp = open('sample_data.txt', 'r')
    # line = fp.readline()
    # print "printing feature vector"
    # while line:
    #     processedTweet = processTweet(line)
    #     featureVector = getFeatureVector(processedTweet)
    #     print featureVector
    #     line = fp.readline()
    # #end loop
    # fp.close()
    #
    # Example 2: CSV corpus with the tweet text in column 1.
    # fp = csv.reader(open('corpus.csv', 'r'), delimiter=',')
    # for row in fp:
    #     tweet = row[1]
    #     processedTweet = processTweet(tweet)
    #     featureVector = getFeatureVector(processedTweet)
    #     print featureVector
#print getBigramFeatureVector(processTweet('Yeah! So excited ! Just updated my @Flipkart for #TheBigBillionDays. https://t.co/cApJf5DCnC'))
#printFeatureVector()