-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Montana Legislative Scraper - Sept. 10th Hackathon #7
Conversation
|
||
# TODO: Get more metadata from here | ||
event_info_text = re.search('EventInfo:(.*),', sliq_html).groups()[0] | ||
event_info_json = json.loads(event_info_text) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The timestamps in here are not the video timestamps but rather the start/end times on the actual day of the hearing. We have to find this and match it against the agenda tree timestamp 😩:
var dataModel = {
Manifest:{"id":"40001","children":[{"id":"eventInfo","children":null,"textTags":{"NAME":{"id":"eventInfo","text":"eventInfo"}},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},{"id":"media","children":null,"textTags":{"NAME":{"id":"media","text":"media"}},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},{"id":"agenda","children":null,"textTags":{"NAME":{"id":"agenda","text":"agenda"}},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},{"id":"people","children":null,"textTags":{"NAME":{"id":"people","text":"people"}},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},{"id":"handoutFile","children":null,"textTags":{"NAME":{"id":"handoutFile","text":"handoutFile"}},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}}],"textTags":{},"timeTags":{}},
EventInfo:{"id":"40001","children":null,"textTags":{"TITLE":{"id":"40001","text":"House Appropriations"},"DESCRIPTION":{"id":"40001","text":"House Appropriations - Video"},"LOCATION":{"id":"40001","text":"Room 102 "},"MEETINGSTATUS":{"id":"40001","text":"-1"},"SCHEDULEDDURATION":{"id":"40001","text":"14400"},"scheduledDate":{"id":"40001","text":"Tuesday, Jan 5, 2021"},"scheduledTime":{"id":"40001","text":"09:00 - 13:00"},"scheduledDuration":{"id":"40001","text":"4 Hours "},"actualDate":{"id":"40001","text":"Tuesday, Jan 5, 2021"},"actualTime":{"id":"40001","text":"09:02 - 12:04"},"actualDuration":{"id":"40001","text":"3 Hours 2 Minutes"},"eventStatus":{"id":"40001","text":"Adjourned"},"eventStatusClass":{"id":"40001","text":"eventTableEnded"}},"timeTags":{"SCHEDULEDSTART":{"id":null,"timestamp":"2021-01-05T09:00:00.0000000"},"STARTTIME":{"id":null,"timestamp":"2021-01-05T09:02:00.0000000"},"ENDTIME":{"id":null,"timestamp":"2021-01-05T12:04:38.0000000"},"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},
Media:{"id":"40001","children":[{"id":"150394","children":null,"textTags":{"URL":{"id":"150394","text":"https://sg002-live.sliq.net/00309-vod/_definst_/2021/01/05/House%20Appropriations_2021-01-05-09.02.00_40001_1115.mp4/playlist.m3u8"},"DESCRIPTION":{"id":"150394","text":"Video"},"IsLive":{"id":"150394","text":"False"},"Enabled":{"id":"150394","text":"True"},"GlobalStreamId":{"id":"150394","text":"4"},"NAME":{"id":"150394","text":"Video"},"AudioOnly":{"id":"150394","text":"False"}},"timeTags":{"STARTTIME":{"id":null,"timestamp":"2021-01-05T09:02:00.0000000"}}}],"textTags":{},"timeTags":{"MODIFIED":{"id":null,"timestamp":"2021-06-07T12:23:32.0000000"}}},
AgendaTree:[{"key":"A224723","text":"Call to Order/Roll Call","startTime":"2021-01-05T09:02:00","displayOrder":100000,"lastModified":"2021-06-07T12:23:32.443+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224724","text":"HB 3 - Supplemental Appropriations Bill - David Bedey","startTime":"2021-01-05T09:19:00","displayOrder":200000,"lastModified":"2021-06-07T12:23:32.483+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224725","text":"HB 4 - Appropriations by Budget Amendment - Dan Bartel","startTime":"2021-01-05T10:37:43","displayOrder":300000,"lastModified":"2021-06-07T12:23:32.523+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224726","text":"Recess","startTime":"2021-01-05T11:10:09","displayOrder":400000,"lastModified":"2021-06-07T12:23:32.783+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224727","text":"Reconvene","startTime":"2021-01-05T11:46:46","displayOrder":500000,"lastModified":"2021-06-07T12:23:32.82+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224728","text":"HB 1 - Feed Bill to fund 67th Legislative session and prepare for 2023 - Llew Jones","startTime":"2021-01-05T11:46:48","displayOrder":600000,"lastModified":"2021-06-07T12:23:32.857+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null},{"key":"A224729","text":"Adjournment","startTime":"2021-01-05T12:04:02","displayOrder":700000,"lastModified":"2021-06-07T12:23:32.893+00:00","parentKey":"-1","externalUrl":"","children":null,"deleted":false,"foreignKey":null}],
Speakers:[],
ccItems:{},
Handouts: [{"Name":"Agenda","HandoutFileUrl":"http://laws.leg.mt.gov/legprd/LAW0240W$CMTE.ActionQuery?P_SESS=20211&P_COM_NM=%28H%29+Appropriations&P_ACTN_DTM=01%2F05%2F2021&U_ACTN_DTM=01%2F05%2F2021&Z_ACTION2=Find","HandoutFileName":"http://laws.leg.mt.gov/legprd/LAW0240W$CMTE.ActionQuery?P_SESS=20211&P_COM_NM=%28H%29+Appropriations&P_ACTN_DTM=01%2F05%2F2021&U_ACTN_DTM=01%2F05%2F2021&Z_ACTION2=Find","LastModifiedTIme":"2022-09-11T05:08:42.599304+00:00","Id":64672,"Tag":null},{"Name":"Minutes","HandoutFileUrl":"http://sg001-harmony.sliq.net:80/00309/Harmony/en/PowerBrowser/ViewHandoutFile?contentEntityId=40001&handoutId=64674","HandoutFileName":"210105APH_Hm1.pdf","LastModifiedTIme":"2022-09-11T05:08:42.599304+00:00","Id":64674,"Tag":null}] ,
AspectRatio:0.5625,
AspectRatioTxt: '16:9',
mediaStartTime : '2021-01-05T09:02:00',
mediaEndTime : '2021-01-05T12:04:38',
playRangeStart : '2021-01-05T09:02:00',
playRangeEnd: '2021-01-05T12:04:38',
venueId: '41',
silendPeriods:[]
};
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also, if the agenda is added after the fact, it's very possible that at the time of the scrape the subset timestamp wouldn't be available yet. This seems like a reason to skip the video processing and try again later, but I think that we'd have to contribute this ability to CDP?
edit: Nope, I don't think we need to contribute to CDP. We can just not use the datetime range arguments at all and scrape everything every run, and only choose to ingest a video if it has the agenda timestamps available. If not, bail early. Then the next time the CRON job runs it will try again for the agenda timestamps. It does mean a bunch of extra scraping will happen every run, but 🤷‍♀️
I created an issue in CDP for this one: CouncilDataProject/cdp-backend#212
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, at least the agenda tree key minus an "A" is in the Sliq URL:
http://sg001-harmony.sliq.net/00309/Harmony/en/PowerBrowser/PowerBrowserV2/20170221/-1/40001?agendaId=224728
# TODO: Read key bills from config or Capitol Tracker JSON | ||
key_bill_names = ["HB 1"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I created #10 for this TODO
# Start at the big table of all bills from the 2021 session. | ||
bills_url_2021 = "http://laws.leg.mt.gov/legprd/LAW0217W$BAIV.return_all_bills?P_SESS=20211" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@smai-f Have you given any thought to how we might want to handle past / future legislative sessions?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd say push a commit to update this line once the 2023 site is live. We should only have to update this URL once at the beginning of the 2023 session 😁 Maybe it could live in the config with the key bills
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update: Mark raised big money, we're scraping everything!
bills_table_rows = bills_table.find_all('tr') | ||
print(f"Found table with {len(bills_table_rows) - 1} rows.") | ||
|
||
# Of all the bills, narrow down to only the key bills and store off the LAWS bill URL for the next step. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for adding comments, it really adds to the overall clarity and context for the scraper!
|
||
# Of all the bills, narrow down to only the key bills and store off the LAWS bill URL for the next step. | ||
key_bills_data = [] | ||
for bill_row in bills_table_rows[1:]: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding a comment here that we skip the first row because it actually contains the headings, but in a `tr` and not in a `th`.
# Go to each LAWS bill URL and find bill actions that have associated recordings. | ||
for bill_data in key_bills_data: | ||
laws_bill_html = requests.get(bill_data['laws_bill_url']).text | ||
bill_rows_with_recordings = re.findall('.*sliq.*', laws_bill_html) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider adding a comment here explaining why we use a regex search on the full HTML instead of going through BeautifulSoup: the server returns "invalid" HTML that BeautifulSoup can't parse.
all_links = bill_cells[-1].find_all('a') | ||
sliq_links = [ link for link in all_links if "sliq" in link['href'] ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BeautifulSoup also supports passing a regex as a filter argument to `find_all` ... so instead of a list comprehension here to filter the anchor tags, this becomes...
bill_cells[-1].find_all('a', href=re.compile('sliq'))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oo la la 👍
parsed_url = urlparse(sliq_link) | ||
# TODO: Handle if this isn't in the query? Does that always mean that the timestamp hasn't been included yet, thus the video | ||
# shouldn't be scraped on this pass? | ||
agenda_id = 'A' + parse_qs(parsed_url.query)['agendaId'][0] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
TODO: Is this always true?
…that have an agendaId aka timestamps
WIP