-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment_analysis.py
211 lines (180 loc) · 6.48 KB
/
sentiment_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import base64
import io
import multiprocessing as mp
import re
import string
from typing import Dict, List
from urllib import parse
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
from django.conf import settings
from nltk import FreqDist, tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from plotly.offline import plot
from textblob import Sentence, TextBlob
# One result queue per report component. Each builder function in this module
# puts its rendered output on its own queue, and generate_report() reads the
# queues back when the components are built in separate processes.
# Fed by create_polarity_distribution_bar_plot.
PROCESS_QUEUE_1: mp.Queue = mp.Queue()
# Fed by create_polarity_distribution_scatter_plot.
PROCESS_QUEUE_2: mp.Queue = mp.Queue()
# Fed by generate_frequency_distribution_report.
PROCESS_QUEUE_3: mp.Queue = mp.Queue()
# Fed by create_wordcloud.
PROCESS_QUEUE_4: mp.Queue = mp.Queue()
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token of every token list, preserving order."""
    for token_group in cleaned_tokens_list:
        for word in token_group:
            yield word
def remove_noise(tweet_tokens, stop_words=()):
    """Clean and lemmatize a list of tokens.

    Each token is POS-tagged, stripped of URL and ``@mention`` text,
    lemmatized as a noun, verb or adjective according to its tag, and kept
    only if the result is longer than two characters, is not a run of
    punctuation and is not a stop word.

    Args:
        tweet_tokens: Tokens of a single text, as accepted by ``pos_tag``.
        stop_words: Iterable of lowercase words to drop. It is copied into a
            frozenset once so that each membership test is constant time.

    Returns:
        The surviving tokens, lower-cased, in their original order.
    """
    # Everything below is loop-invariant, so build it once per call rather
    # than once per token.
    url_pattern = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*(),]|"
        r"(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    mention_pattern = re.compile(r"(@[A-Za-z0-9_]+)")
    lemmatizer = WordNetLemmatizer()
    stop_word_set = frozenset(stop_words)

    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub("", token)
        token = mention_pattern.sub("", token)

        # Map the tagger's tag onto the part-of-speech codes the lemmatizer
        # takes; anything that is not a noun or verb is treated as an adjective.
        if tag.startswith("NN"):
            pos = "n"
        elif tag.startswith("VB"):
            pos = "v"
        else:
            pos = "a"
        token = lemmatizer.lemmatize(token, pos)

        lowered = token.lower()
        if (
            len(token) > 2
            and token not in string.punctuation
            and lowered not in stop_word_set
        ):
            cleaned_tokens.append(lowered)
    return cleaned_tokens
def get_polarity(text: str) -> float:
    """Return the polarity component of TextBlob's sentiment for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def get_subjectivity(text: str) -> float:
    """Return the subjectivity component of TextBlob's sentiment for *text*."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def get_analysis(polarity: float) -> str:
    """Classify a polarity score by its sign.

    Args:
        polarity: Sentiment polarity score (a float, as produced by
            ``get_polarity``).

    Returns:
        ``"negative"`` for scores below zero, ``"neutral"`` for exactly zero
        and ``"positive"`` otherwise.
    """
    if polarity < 0:
        return "negative"
    if polarity == 0:
        return "neutral"
    return "positive"
def parse_text(text: str) -> List[Sentence]:
    """Return the sentences TextBlob finds in *text*."""
    blob = TextBlob(text)
    return blob.sentences
def create_dataframe(sentences: List[Sentence]) -> pd.DataFrame:
    """Build the per-sentence sentiment table.

    The frame has one row per sentence with the columns "Sentence Index"
    (1-based position), "Text", "Polarity" and "Subjectivity", and is
    returned sorted by ascending polarity.
    """
    texts = [str(sentence) for sentence in sentences]
    columns: Dict = {
        "Sentence Index": list(range(1, len(texts) + 1)),
        "Text": texts,
    }
    frame = pd.DataFrame(data=columns)

    text_column = frame["Text"]
    frame["Polarity"] = text_column.apply(get_polarity)
    frame["Subjectivity"] = text_column.apply(get_subjectivity)

    ordered = frame.sort_values(by=["Polarity"])
    return ordered
def create_polarity_distribution_bar_plot(df: pd.DataFrame) -> str:
    """Render a bar chart of how many sentences fall in each polarity class.

    Side effects:
        * Adds an "Analysis" column (the ``get_analysis`` label of each
          row's "Polarity") to *df* in place.
        * When running inside a multiprocessing child process, also puts the
          rendered chart on ``PROCESS_QUEUE_1`` for the parent to collect.

    Args:
        df: Sentence table containing a "Polarity" column.

    Returns:
        The Plotly chart rendered as an HTML ``div`` string.
    """
    df["Analysis"] = df["Polarity"].apply(get_analysis)
    category_counts = df["Analysis"].value_counts()

    figure = px.bar(
        category_counts,
        y="Analysis",
        labels={"index": "Polarity", "Analysis": "Number of occurrences"},
        title="Polarity occurrences in bar graph",
    )
    rendered = plot(figure, output_type="div", auto_open=False)

    # Only hand the result over via the queue when a parent process is
    # waiting for it. multiprocessing names the initial process
    # "MainProcess"; on a direct (synchronous) call nothing ever reads the
    # queue, so putting there would just accumulate one chart per call.
    if mp.current_process().name != "MainProcess":
        PROCESS_QUEUE_1.put(rendered)
    return rendered
def create_polarity_distribution_scatter_plot(df: pd.DataFrame) -> str:
    """Render a scatter plot of polarity against subjectivity.

    When running inside a multiprocessing child process, the rendered chart
    is also put on ``PROCESS_QUEUE_2`` for the parent to collect.

    Args:
        df: Sentence table containing "Polarity" and "Subjectivity" columns.

    Returns:
        The Plotly chart rendered as an HTML ``div`` string.
    """
    figure = px.scatter(
        df,
        x="Polarity",
        y="Subjectivity",
        title="Polarity versus Subjectivity",
    )
    rendered = plot(figure, output_type="div", auto_open=False)

    # Only hand the result over via the queue when a parent process is
    # waiting for it. multiprocessing names the initial process
    # "MainProcess"; on a direct (synchronous) call nothing ever reads the
    # queue, so putting there would just accumulate one chart per call.
    if mp.current_process().name != "MainProcess":
        PROCESS_QUEUE_2.put(rendered)
    return rendered
def create_wordcloud(df: pd.DataFrame) -> str:
    """Render a word cloud of all sentence text as an inline PNG.

    When running inside a multiprocessing child process, the result is also
    put on ``PROCESS_QUEUE_4`` for the parent to collect.

    Args:
        df: Sentence table containing a "Text" column of strings.

    Returns:
        A ``data:image/png;base64,...`` URI holding the rendered image.
    """
    # Imported lazily, as in the original, so the module can be loaded
    # without the wordcloud package until a cloud is actually requested.
    from wordcloud import STOPWORDS, WordCloud

    corpus = " ".join(df["Text"])
    word_cloud = WordCloud(
        height=500, width=500, stopwords=STOPWORDS, collocations=False
    ).generate(corpus)

    # Draw on a dedicated figure and always close it: pyplot keeps every
    # figure it creates alive, so drawing on the implicit global figure would
    # pile up images (and memory) across calls.
    image = io.BytesIO()
    figure, axes = plt.subplots()
    try:
        axes.imshow(word_cloud, interpolation="bilinear")
        axes.axis("off")
        figure.tight_layout(pad=0)
        figure.savefig(image, format="png", facecolor="k", bbox_inches="tight")
    finally:
        plt.close(figure)

    encoded = base64.b64encode(image.getvalue())
    data = "data:image/png;base64," + parse.quote(encoded)

    # Only hand the result over via the queue when a parent process is
    # waiting for it. multiprocessing names the initial process
    # "MainProcess"; on a direct (synchronous) call nothing ever reads the
    # queue, so putting there would just accumulate one image per call.
    if mp.current_process().name != "MainProcess":
        PROCESS_QUEUE_4.put(data)
    return data
def generate_frequency_distribution_report(df: pd.DataFrame) -> str:
    """Render a bar chart of the ten most frequent cleaned words.

    Each sentence is tokenized, cleaned with ``remove_noise`` using NLTK's
    English stop words, and the resulting tokens are counted.

    When running inside a multiprocessing child process, the rendered chart
    is also put on ``PROCESS_QUEUE_3`` for the parent to collect.

    Args:
        df: Sentence table containing a "Text" column of strings.

    Returns:
        The Plotly chart rendered as an HTML ``div`` string.
    """
    # A set makes the per-token stop-word lookup constant time.
    stop_words = set(stopwords.words("english"))

    tokenized_sentences = [
        tokenize.word_tokenize(sentence) for sentence in df["Text"]
    ]
    cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in tokenized_sentences
    ]
    frequencies = FreqDist(get_all_words(cleaned_tokens_list))

    top_words = frequencies.most_common(10)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    figure = px.bar(
        y=counts,
        x=words,
        labels={"x": "Word", "y": "Number of occurrences"},
        title="Top 10 appearing words",
    )
    rendered = plot(figure, output_type="div", auto_open=False)

    # Only hand the result over via the queue when a parent process is
    # waiting for it. multiprocessing names the initial process
    # "MainProcess"; on a direct (synchronous) call nothing ever reads the
    # queue, so putting there would just accumulate one chart per call.
    if mp.current_process().name != "MainProcess":
        PROCESS_QUEUE_3.put(rendered)
    return rendered
def _await_worker_result(process, result_queue, poll_interval=0.5):
    """Return the item a worker put on its queue, or raise if it died first."""
    from queue import Empty

    while True:
        try:
            return result_queue.get(timeout=poll_interval)
        except Empty:
            if process.is_alive():
                continue
            # The worker may have flushed its result just before exiting, so
            # look once more before declaring it lost.
            try:
                return result_queue.get(timeout=poll_interval)
            except Empty:
                raise RuntimeError(
                    f"Report worker {process.name!r} exited with code "
                    f"{process.exitcode} without producing a result"
                ) from None


def generate_report(text: str) -> Dict:
    """Build the full sentiment report for *text*.

    The text is split into sentences, scored into a table, and four
    components are rendered from that table: a polarity bar plot, a
    polarity/subjectivity scatter plot, a top-10 word frequency plot and a
    word cloud. When ``settings.INSCRIBE_MULTIPROCESSING`` is truthy each
    component is rendered in its own process and collected through the
    module-level ``PROCESS_QUEUE_*`` queues; otherwise they are rendered
    one after another in the calling process.

    Using multiprocessing here gives a performance boost of ~= 2 seconds.

    NOTE(review): create_polarity_distribution_bar_plot adds an "Analysis"
    column to the table in place. In the synchronous path that column ends
    up in the returned HTML table; in the multiprocessing path the mutation
    happens in the child process and the table rendered here does not have
    it. Confirm which output is intended.

    Args:
        text: The raw text to analyse.

    Returns:
        A dict with the keys "report" (the sentence table as HTML),
        "polarity_distribution_bar_plot",
        "polarity_distribution_scatter_plot", "wordcloud" and
        "frequency_distribution_report".

    Raises:
        RuntimeError: In multiprocessing mode, if a worker process exits
            without delivering its result (previously this blocked forever).
    """
    report = create_dataframe(parse_text(text))

    if not settings.INSCRIBE_MULTIPROCESSING:
        polarity_distribution_bar_plot = create_polarity_distribution_bar_plot(report)
        polarity_distribution_scatter_plot = create_polarity_distribution_scatter_plot(
            report
        )
        frequency_distribution_report = generate_frequency_distribution_report(report)
        wordcloud = create_wordcloud(report)
    else:
        # Pair every worker with the queue its target function writes to.
        workers = [
            (
                mp.Process(
                    target=create_polarity_distribution_bar_plot, args=(report,)
                ),
                PROCESS_QUEUE_1,
            ),
            (
                mp.Process(
                    target=create_polarity_distribution_scatter_plot, args=(report,)
                ),
                PROCESS_QUEUE_2,
            ),
            (
                mp.Process(
                    target=generate_frequency_distribution_report, args=(report,)
                ),
                PROCESS_QUEUE_3,
            ),
            (mp.Process(target=create_wordcloud, args=(report,)), PROCESS_QUEUE_4),
        ]
        for process, _ in workers:
            process.start()

        # Read every queue before joining: a worker cannot exit until its
        # (large) result has been consumed. Keep collecting after a failure
        # so the remaining workers are drained and can be joined cleanly.
        results = []
        failures = []
        for process, result_queue in workers:
            try:
                results.append(_await_worker_result(process, result_queue))
            except RuntimeError as error:
                results.append(None)
                failures.append(error)

        for process, _ in workers:
            process.join()

        if failures:
            raise failures[0]

        (
            polarity_distribution_bar_plot,
            polarity_distribution_scatter_plot,
            frequency_distribution_report,
            wordcloud,
        ) = results

    return {
        "report": report.to_html(
            classes=[
                "table",
                "table-dark",
                "rounded-lg",
                "table-striped",
                "table-responsive-xl",
            ],
            index=False,
            justify="left",
            show_dimensions=True,
        ),
        "polarity_distribution_bar_plot": polarity_distribution_bar_plot,
        "polarity_distribution_scatter_plot": polarity_distribution_scatter_plot,
        "wordcloud": wordcloud,
        "frequency_distribution_report": frequency_distribution_report,
    }