-
Notifications
You must be signed in to change notification settings - Fork 0
/
seed.py
79 lines (60 loc) · 2.99 KB
/
seed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from model import Posting, User, Favorite, connect_to_db, db
import json
import requests
# Craigslist JSON search URLs to seed from, one per city. Any other city can
# be appended as long as its Craigslist city prefix is known.
city_list = [
    'http://sfbay.craigslist.org/jsonsearch/apa/',
    'http://portland.craigslist.org/jsonsearch/apa/',
    'http://seattle.craigslist.org/jsonsearch/apa/',
]
def load_posts(city_list):
    """Load Craigslist posts from JSON into the database.

    First deletes every row of ``postings`` whose ``post_id`` does not appear
    in ``favorites``, so a re-seed drops old listings while keeping the ones a
    user has favorited. Then, for each URL in ``city_list``, fetches the JSON
    search results and adds a ``Posting`` for every result that is not already
    stored.

    This function does not create tables; they must exist before it is called.

    Recommended: re-seed the database once per day or every other day to keep
    the data up to date.

    Args:
        city_list: iterable of Craigslist JSON search URLs, one per city.
    """

    # Delete any post that hasn't been favorited.
    query = """DELETE FROM postings WHERE post_id NOT IN
               (SELECT post_id FROM favorites);"""
    db.session.execute(query)
    db.session.commit()

    for link in city_list:
        # Retrieve JSON from Craigslist.
        cl_json = requests.get(link)
        parsed_json = cl_json.json()
        # The first element of the parsed response is the list of result dicts.
        list_of_posts = parsed_json[0]

        for posting in list_of_posts:
            # If this object has a GeoCluster key, skip it, since it's not an
            # actual post.
            if posting.get("GeoCluster"):
                continue

            post_id = posting.get('PostingID')

            # Already in the database (survived the delete above): leave it.
            if Posting.query.get(post_id):
                continue

            # Example key value pairs:
            #   Ask: 2400
            #   ImageThumb: 'http:\/\/images.craigslist.org...jpg'
            #   Latitude: 38.927686
            #   PostingTitle: 'title'
            #   PostedDate: '1438725510'
            #   Longitude: -122.3888
            #   PostingURL: '\/\/sfbay.craigslist.org...html'
            #   Bedrooms: '2'
            #   CategoryID: '1'
            #   PostingID: '5156767694'

            # Reformat img url to display as HTML in infoWindow.
            # Reset for every posting: without this, a posting that has no
            # thumbnail would either raise NameError (first posting) or reuse
            # the previous posting's image.
            # NOTE(review): assumes Posting.img_url accepts NULL - confirm
            # against the model.
            img_url = None
            raw_img_url = posting.get('ImageThumb')
            if raw_img_url:
                img_url = raw_img_url.split(',')[0] + '_300x300.jpg'

            new_post = Posting(
                post_id=post_id,
                title=posting.get('PostingTitle'),
                date_posted=posting.get('PostedDate'),
                url=posting.get('PostingURL'),
                img_url=img_url,
                price=posting.get('Ask'),
                bedrooms=posting.get('Bedrooms'),
                latitude=posting.get('Latitude'),
                longitude=posting.get('Longitude'),
            )
            db.session.add(new_post)

        # Commit once per city so postings already loaded are kept even if a
        # later city's request fails.
        db.session.commit()
if __name__ == '__main__':
    # Script entry point: connect to the database, make sure the tables
    # exist, then seed from every URL in city_list.
    # The Flask app is imported only when this file is run as a script.
    from server import app

    connect_to_db(app)
    db.create_all()
    load_posts(city_list)
    # Parenthesised form prints the same text on Python 2 and is also valid
    # syntax on Python 3.
    print("Database updated.")