# reddit_retriever.py — 151 lines (116 loc) · 4.75 KB
import datetime
import calendar
import json
import pandas
from psaw import PushshiftAPI
################## START PARAMETERS ##################
# Subreddit to retrieve (name only, without the "r/" prefix).
SUBREDDIT = "history"
# Date in ISO format "YYYY-MM-DD" (ex: "2016-06-12")
START_DATE = "2016-06-12"
END_DATE = "2016-06-15"
# Number of days after end date to request comments
# (replies can keep arriving after the submission window closes).
EXTRA_DAYS_FOR_COMMENTS = 4
# Change to True to also create a csv file
CREATE_CSV = False
# Comments sorted by field: "created_utc" or "score"
COMMENTS_SORT = "score"
# Maximum number of comments in a submission (None for no maximum)
MAX_COMMENTS = None
# OPTIONAL query string (only searches submissions)
QUERY = None
################### END PARAMETERS ###################
def date_to_epoch(date_iso, delta_days=None):
    """Convert an ISO date string ("YYYY-MM-DD") to a UTC epoch timestamp.

    Args:
        date_iso: Date in ISO format, or a falsy value.
        delta_days: Optional number of days to add to the date before
            converting (used to extend the comment-retrieval window).

    Returns:
        Seconds since the Unix epoch at UTC midnight of the (shifted)
        date, or None when date_iso is falsy.
    """
    if not date_iso:
        return None
    dt = datetime.date.fromisoformat(date_iso)
    # `is not None` instead of truthiness: an explicit delta of 0 is a
    # valid (no-op) shift and should not be silently special-cased.
    if delta_days is not None:
        dt += datetime.timedelta(days=delta_days)
    # calendar.timegm interprets the tuple as UTC (unlike time.mktime,
    # which would use the local timezone).
    return calendar.timegm(dt.timetuple())
def print_response(response):
    """Pretty-print the JSON payload of an HTTP response object.

    Prints an error message instead of raising when the body cannot be
    parsed or serialized as JSON.
    """
    try:
        pretty = json.dumps(response.json(), indent=4, sort_keys=True)
        print(pretty)
    except Exception as err:
        print("Did not receive parsable json.\nError:", err)
def _gen_field_str(FIELD, first=False):
if not FIELD:
return ""
return FIELD if first == True else "_" + FIELD
def generate_excel(result, create_csv=False):
    """Write the collected records to an .xlsx file (optionally .csv too).

    The file name is assembled from the module-level SUBREDDIT,
    START_DATE, QUERY and END_DATE parameters plus a generation
    timestamp. Any failure (e.g. a missing Excel writer backend) is
    reported on stdout instead of raising.
    """
    try:
        frame = pandas.DataFrame(result)
        timestamp = datetime.datetime.now().strftime("%m-%d-%Y_%H-%M-%S")
        # Same segment order as before: subreddit, start, query,
        # "_to" + end, "_on" + timestamp.
        name_parts = [
            _gen_field_str(SUBREDDIT, first=True),
            _gen_field_str(START_DATE),
            _gen_field_str(QUERY),
            "_to",
            _gen_field_str(END_DATE),
            "_on",
            timestamp,
        ]
        base_name = "".join(name_parts)
        frame.to_excel(base_name + ".xlsx")
        if create_csv:
            frame.to_csv(base_name + ".csv")
    except Exception as e:
        print("Could not generate file.\nError:", e)
def get_date(epoch):
datetime_date = datetime.datetime.fromtimestamp(epoch)
return datetime_date.strftime("%m-%d-%Y_%H-%M-%S")
def full_name_to_id(name):
    """Strip a Reddit "fullname" type prefix (e.g. "t3_abc" -> "abc").

    Names containing no underscore are returned unchanged.
    """
    if "_" not in name:
        return name
    # Keep the second underscore-delimited segment, matching the
    # original split("_")[1] semantics exactly.
    return name.split("_")[1]
if __name__=="__main__":
    # Script entry point: query Pushshift for submissions and comments in
    # the configured window, flatten each submission followed by its
    # threaded comment tree, and export everything via generate_excel.
    pushshift_api = PushshiftAPI()
    # Fields requested from Pushshift for each record type.
    comment_fields = ["id","author", "body","link_id", "created_utc",
                      "parent_id", "subreddit", "score"]
    submission_fields = ["id", "created_utc", "author", "selftext",
                         "subreddit", "title", "score", "num_comments", "full_link"]
    # Query arguments shared by both searches.
    kargs = {
        "subreddit": SUBREDDIT,
        "q": QUERY,
        "after": date_to_epoch(START_DATE),
        "before": date_to_epoch(END_DATE),
        "size": 500,
    }
    kargs_comment = dict(kargs)
    kargs_comment["fields"] = comment_fields
    # Comments are fetched past END_DATE so late replies are included.
    kargs_comment["before"] = date_to_epoch(END_DATE, EXTRA_DAYS_FOR_COMMENTS)
    kargs_submission = dict(kargs)
    kargs_submission["fields"] = submission_fields
    full_comments = list(pushshift_api.search_comments(**kargs_comment))
    full_submissions = list(pushshift_api.search_submissions(**kargs_submission))
    # Oldest submission first; .d_ is psaw's dict view of a result object.
    full_submissions.sort(key=lambda k: k.d_['created_utc'])
    submissions = [submission.d_ for submission in full_submissions]
    print(f"Number of submissions: {len(submissions)}")
    result = []
    for submission in submissions:
        submission["type"] = "submission"
        submission["date_posted"] = get_date(submission["created_utc"])
        # Normalize: submissions expose their text as "selftext"; rename
        # it to "body" so the key matches the comment records.
        submission["body"] = submission["selftext"]
        submission.pop("selftext", None)
        result.append(submission)
        submission_id = submission["id"]
        # Comments belonging to this submission. NOTE(review): this is a
        # substring test against the "link_id" fullname (presumably
        # "t3_<id>"), not an exact match — assumes ids never collide as
        # substrings; confirm against Pushshift data.
        submission_comments = [obj.d_ for obj in full_comments
                               if submission_id in obj.d_["link_id"]]
        submission_comments.sort(key=lambda k: k[COMMENTS_SORT])
        if COMMENTS_SORT == "score":
            # sort() is ascending; reverse to put highest score first.
            submission_comments.reverse()
        # Seed the stack with top-level comments (parent is the submission).
        stack = [comment for comment in submission_comments if
                 submission_id in comment["parent_id"]]
        comment_num = 0
        # Depth-first walk: children are prepended to the stack, so each
        # comment is emitted directly after its parent (threaded order).
        while stack:
            parent_comment = stack.pop(0)
            parent_comment["type"] = "comment"
            parent_id = full_name_to_id(parent_comment["id"])
            parent_comment["link_id"] = full_name_to_id(parent_comment["link_id"])
            # Deep-link to the comment under the submission's permalink.
            parent_comment["full_link"] = submission["full_link"] + parent_id
            epoch = parent_comment["created_utc"]
            parent_comment["date_posted"] = get_date(epoch)
            parent_comment["title"] = submission["title"]
            result.append(parent_comment)
            # Direct replies to this comment (substring match against the
            # "parent_id" fullname — same caveat as the link_id match).
            sons = [comment for comment in submission_comments
                    if parent_id in comment["parent_id"]]
            stack = sons + stack
            if MAX_COMMENTS:
                # Cap is applied per submission (counter resets each loop).
                comment_num += 1
                if comment_num >= MAX_COMMENTS:
                    break
    generate_excel(result, create_csv=CREATE_CSV)