get_congress_data.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. from collections import defaultdict
# Module-level counter keyed by a member's bioguide link (the href of the
# anchor in the first table cell). pull_congress() adds 1 for every linked
# row it parses, so the value accumulates across successive calls.
career_lengths = defaultdict(int)
  3. def pull_congress(year):
  4. from requests import post
  5. from bs4 import BeautifulSoup
  6. search_url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
  7. payload = {'congress': year}
  8. res = post(search_url, payload)
  9. if res.status_code != 200:
  10. raise RuntimeError('failed to pull data from bioguide!')
  11. soup = BeautifulSoup(res.content, 'html.parser')
  12. rep_table = soup.find_all('table')[1]
  13. def parse_dob(dob_txt):
  14. dob_txt = dob_txt.split('-')[0].strip()
  15. if dob_txt in ['', 'unknown']:
  16. dob = None
  17. else:
  18. # occasionally, a 'c' is thrown in, presumably for circa or an 'a.' for around
  19. # Just remove all non-numeric or slash chars
  20. dob_txt = ''.join(c for c in dob_txt if c in '0123456789/')
  21. # Some dates are denoted by YYYY/YYYY for an unknown birth between two years, take the average
  22. dob = sum(map(int, dob_txt.split('/'))) / len(dob_txt.split('/'))
  23. return dob
  24. rows = rep_table.find_all('tr')[1:]
  25. name = ""
  26. url = ""
  27. yob = ""
  28. while rows:
  29. row = rows.pop(0).find_all('td')
  30. try:
  31. name = row[0].a.get_text()
  32. url = row[0].a['href']
  33. career_lengths[url] += 1
  34. yob = parse_dob(row[1].get_text())
  35. except AttributeError:
  36. pass
  37. position = row[2].get_text().strip()
  38. if position == 'Speaker of the House':
  39. continue
  40. party = row[3].get_text()
  41. state = row[4].get_text()
  42. career_length = career_lengths[url]
  43. yield (name, yob, position, party, state, year, career_length)
  44. def download_all():
  45. from sqlite3 import connect
  46. conn = connect("us_congress_members.sqlite3")
  47. conn.executescript('''
  48. DROP TABLE IF EXISTS Member;
  49. CREATE TABLE Member (name TEXT,
  50. yob FLOAT,
  51. position TEXT,
  52. party TEXT,
  53. state TEXT,
  54. congress INTEGER,
  55. career_length INTEGER);
  56. ''')
  57. for year in range(1789, 2018):
  58. print(f'Downloading for year: {year}')
  59. for rep in pull_congress(year):
  60. if rep is not None:
  61. conn.execute('INSERT INTO Member VALUES (?,?,?,?,?,?,?);', rep)
  62. conn.commit()
  63. conn.close()
# Run the full download when this file is executed.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so merely
# importing this module also triggers the download -- confirm that is intended.
download_all()