1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- from collections import defaultdict
- career_lengths = defaultdict(int)
- def pull_congress(year):
- from requests import post
- from bs4 import BeautifulSoup
- search_url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
- payload = {'congress': year}
- res = post(search_url, payload)
- if res.status_code != 200:
- raise RuntimeError('failed to pull data from bioguide!')
- soup = BeautifulSoup(res.content, 'html.parser')
- rep_table = soup.find_all('table')[1]
- def parse_dob(dob_txt):
- dob_txt = dob_txt.split('-')[0].strip()
- if dob_txt in ['', 'unknown']:
- dob = None
- else:
- # occasionally, a 'c' is thrown in, presumably for circa or an 'a.' for around
- # Just remove all non-numeric or slash chars
- dob_txt = ''.join(c for c in dob_txt if c in '0123456789/')
- # Some dates are denoted by YYYY/YYYY for an unknown birth between two years, take the average
- dob = sum(map(int, dob_txt.split('/'))) / len(dob_txt.split('/'))
- return dob
- rows = rep_table.find_all('tr')[1:]
- name = ""
- url = ""
- yob = ""
- while rows:
- row = rows.pop(0).find_all('td')
- try:
- name = row[0].a.get_text()
- url = row[0].a['href']
- career_lengths[url] += 1
- yob = parse_dob(row[1].get_text())
- except AttributeError:
- pass
- position = row[2].get_text().strip()
- if position == 'Speaker of the House':
- continue
- party = row[3].get_text()
- state = row[4].get_text()
- career_length = career_lengths[url]
- yield (name, yob, position, party, state, year, career_length)
- def download_all():
- from sqlite3 import connect
- conn = connect("us_congress_members.sqlite3")
- conn.executescript('''
- DROP TABLE IF EXISTS Member;
- CREATE TABLE Member (name TEXT,
- yob FLOAT,
- position TEXT,
- party TEXT,
- state TEXT,
- congress INTEGER,
- career_length INTEGER);
- ''')
- for year in range(1789, 2018):
- print(f'Downloading for year: {year}')
- for rep in pull_congress(year):
- if rep is not None:
- conn.execute('INSERT INTO Member VALUES (?,?,?,?,?,?,?);', rep)
- conn.commit()
- conn.close()
- download_all()
|