/
_filter.py
49 lines (38 loc) · 1.55 KB
/
_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import logging
import pandas as pd
import pycld2 as cld2
logger = logging.getLogger(__name__)
def filter(dataframe):
    """Filter a dataframe of comments by text length and language.

    The following rules are applied, in this order:

    1. Rows whose comment is null (``None`` / ``NaN``) are dropped.
    2. Only comments with more than 20 characters are retained. The length
       is measured before unprintable characters are removed.
    3. All unprintable unicode characters are removed from the retained
       comments.
    4. Only comments whose detected language code is "en" (English) or
       "un" (unknown) are retained.

    The input dataframe is not modified, and no auxiliary columns are
    written to it, so input columns that happen to be called "cl" or
    "lang" are passed through untouched.

    :param dataframe: dataframe to be filtered. Must have column "Comments"
    :type dataframe: pd.DataFrame
    :return: Filtered dataframe, re-indexed from 0
    :rtype: pd.DataFrame
    """
    # Comments must be strictly longer than this many characters.
    min_length = 20

    # Null comments would lead to crashes later (len() and iteration).
    result = dataframe[dataframe["Comments"].notna()].reset_index(drop=True)

    # Filter on comment length. The lengths live in a standalone Series
    # rather than a helper column, so nothing is written into a filtered
    # frame (the source of pandas' "copy of a slice" warning).
    is_long_enough = result["Comments"].map(len) > min_length
    result = result[is_long_enough]

    # Remove problematic unicode characters before language detection.
    cleaned = result["Comments"].map(_strip_unprintable)

    # Detect language and keep English. If it's 'unknown' it's probably
    # still English.
    languages = cleaned.map(_detect_language)
    is_english = languages.isin(["en", "un"])

    # assign() returns a new frame, so the cleaned text replaces the raw
    # comments without mutating a slice of another frame.
    result = result.assign(Comments=cleaned)[is_english]
    return result.reset_index(drop=True)


def _strip_unprintable(text):
    """Return ``text`` without the characters ``str.isprintable`` rejects."""
    return "".join(char for char in text if char.isprintable())


def _detect_language(text):
    """Return the language code cld2 reports first for ``text``."""
    # cld2.detect() returns a tuple whose third element holds the detection
    # details; index [0][1] is the code of the first entry (presumably the
    # best guess -- see the pycld2 documentation).
    return cld2.detect(text)[2][0][1]