"""Scrape US Congress member data from bioguide.congress.gov into SQLite.

For each congress year, downloads the bioguide member-search results page,
parses the member table, and stores one row per member position:
(name, yob, position, party, state, congress, career_length).
"""
from collections import defaultdict

# Running count of table rows seen per member bioguide URL, accumulated
# across all years pulled so far; used as a proxy for career length.
career_lengths = defaultdict(int)


def parse_dob(dob_txt):
    """Parse a birth year from a bioguide 'YYYY - YYYY' life-span string.

    Returns the year as a float, the average of the two years for a
    'YYYY/YYYY' value (birth known only to lie between two years), or
    None when the birth year is blank, 'unknown', or contains no digits.
    """
    dob_txt = dob_txt.split('-')[0].strip()
    if dob_txt in ['', 'unknown']:
        return None
    # Occasionally a 'c' is thrown in, presumably for circa, or an 'a.'
    # for around. Just remove all non-numeric or slash chars.
    dob_txt = ''.join(c for c in dob_txt if c in '0123456789/')
    years = [p for p in dob_txt.split('/') if p]
    if not years:
        # Fix: a field with no digits at all (e.g. just 'ca.') used to
        # reach int('') and crash with ValueError; treat it as unknown.
        return None
    # Some dates are denoted YYYY/YYYY for an unknown birth between two
    # years; take the average.
    return sum(map(int, years)) / len(years)


def pull_congress(year):
    """Yield one member tuple per data row of the bioguide table for *year*.

    Each tuple is (name, yob, position, party, state, year, career_length).
    Raises RuntimeError when the search request does not return HTTP 200.
    """
    from requests import post
    from bs4 import BeautifulSoup

    search_url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
    payload = {'congress': year}
    res = post(search_url, payload)
    if res.status_code != 200:
        raise RuntimeError('failed to pull data from bioguide!')
    soup = BeautifulSoup(res.content, 'html.parser')
    rep_table = soup.find_all('table')[1]

    # Rows without an anchor in the first cell raise AttributeError below
    # and keep the previous member's name/url/yob — presumably these are
    # continuation rows for a member with multiple positions (TODO confirm
    # against the live page layout).
    name = ""
    url = ""
    yob = None
    for tr in rep_table.find_all('tr')[1:]:  # [1:] skips the header row
        row = tr.find_all('td')
        try:
            name = row[0].a.get_text()
            url = row[0].a['href']
            career_lengths[url] += 1
            yob = parse_dob(row[1].get_text())
        except AttributeError:
            # No link cell: carry forward the previous member's identity.
            pass
        position = row[2].get_text().strip()
        if position == 'Speaker of the House':
            # NOTE(review): Speaker rows are skipped — looks like they
            # duplicate the member's Representative row; confirm.
            continue
        party = row[3].get_text()
        state = row[4].get_text()
        career_length = career_lengths[url]
        yield (name, yob, position, party, state, year, career_length)


def download_all():
    """(Re)create the Member table and populate it for every congress year."""
    from sqlite3 import connect

    conn = connect("us_congress_members.sqlite3")
    try:
        conn.executescript('''
        DROP TABLE IF EXISTS Member;
        CREATE TABLE Member (name TEXT, yob FLOAT, position TEXT, party TEXT, state TEXT, congress INTEGER, career_length INTEGER);
        ''')
        for year in range(1789, 2018):
            print(f'Downloading for year: {year}')
            for rep in pull_congress(year):
                # pull_congress only yields tuples, so insert unconditionally.
                conn.execute('INSERT INTO Member VALUES (?,?,?,?,?,?,?);', rep)
        conn.commit()
    finally:
        # Fix: close the connection even when a download fails partway.
        conn.close()


if __name__ == '__main__':
    # Fix: guard the entry point so importing this module does not kick
    # off a full network download as a side effect.
    download_all()