/
tigers.py
46 lines (34 loc) · 1.5 KB
/
tigers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import requests
import csv
from BeautifulSoup import BeautifulSoup
# Roster page that get_players() downloads and parses.
url = 'http://www.mutigers.com/sports/m-footbl/mtt/miss-m-footbl-mtt.html'
def get_players(timeout=30):
    """Scrape the roster table and return one dict per player row.

    Downloads the page at the module-level ``url``, locates the table whose
    ``id`` is ``sortable_roster`` and maps the cells of every row onto the
    column titles found in the table's ``<thead>``.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of dicts keyed by column title, in page order. Rows whose
        number of ``<td>`` cells differs from the number of column titles
        (the header row, for instance) are skipped. The list is empty when
        the page has no roster table or the table has no header row.

    Raises:
        requests.exceptions.RequestException: if the connection fails, the
            request times out, or the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content)

    only_table = soup.find('table', id='sortable_roster')
    if only_table is None:
        return []

    # Compare against None explicitly: the guard must depend on presence,
    # not on whatever truth value a parsed tag happens to have.
    thead = only_table.find('thead')
    header_row = thead.find('tr') if thead is not None else None
    if header_row is None:
        return []
    column_titles = [th.text for th in header_row.findAll('th')]

    output_trs = []  # One dict per data row.
    for tr in only_table.findAll('tr'):
        # NOTE(review): the original comment says this drops a tab character,
        # but the literal here is a single space, so every space in the cell
        # text is removed -- confirm which character was intended.
        cells = [td.text.replace(' ', '') for td in tr.findAll('td')]
        if len(cells) == len(column_titles):
            output_trs.append(dict(zip(column_titles, cells)))
    return output_trs
if __name__ == '__main__':
    # The parenthesized form prints the same text on Python 2 and is also
    # valid syntax on Python 3, unlike the bare print statement.
    print(get_players())