get_congress_data.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  def pull_congress(year):
      """Yield one record per member row returned by the bioguide search.

      POSTs ``{'congress': year}`` to the bioguide search form, parses the
      second ``<table>`` of the response and walks its rows, skipping the
      first one (presumably the header row -- confirm against the page).

      Args:
          year: Value submitted as the form's ``congress`` field. It is also
              echoed back as the last element of every yielded tuple.

      Yields:
          Tuples ``(name, yob, position, party, state, year)``. ``yob`` is a
          float birth year, or ``None`` when no year could be parsed.

      Raises:
          RuntimeError: If the server answers with a status other than 200.
          requests.exceptions.RequestException: On connection errors or when
              the request times out.
      """
      # Third-party imports stay function-local, as in the original script.
      from requests import post
      from bs4 import BeautifulSoup

      def parse_dob(dob_txt):
          """Return the birth year in ``dob_txt`` as a float, or ``None``."""
          # Only the text before the first '-' is used (the cell presumably
          # reads 'birth-death' -- confirm against the page).
          birth_txt = dob_txt.split('-')[0].strip()
          if birth_txt in ('', 'unknown'):
              return None
          # Occasionally a 'c' is thrown in, presumably for circa, or an 'a.'
          # for around: keep only digits and slashes.
          cleaned = ''.join(ch for ch in birth_txt if ch in '0123456789/')
          # 'YYYY/YYYY' denotes an unknown birth between two years; take the
          # average. Empty fragments are dropped so that text without digits
          # (or with a dangling slash) yields None instead of a ValueError
          # from int('').
          years = [int(part) for part in cleaned.split('/') if part]
          if not years:
              return None
          return sum(years) / len(years)

      search_url = 'http://bioguide.congress.gov/biosearch/biosearch1.asp'
      # Without a timeout a stalled server would block the caller forever;
      # 60 seconds is an arbitrary but generous bound.
      res = post(search_url, {'congress': year}, timeout=60)
      if res.status_code != 200:
          raise RuntimeError('failed to pull data from bioguide!')

      soup = BeautifulSoup(res.content, 'html.parser')
      rep_table = soup.find_all('table')[1]

      name = ""
      yob = ""
      for table_row in rep_table.find_all('tr')[1:]:
          cells = table_row.find_all('td')
          link = cells[0].a
          # A row whose first cell has no link keeps the name and birth year
          # of the preceding row (presumably a further entry for the same
          # member -- confirm against the page).
          if link is not None:
              name = link.get_text()
              yob = parse_dob(cells[1].get_text())
          position = cells[2].get_text().strip()
          if position == 'Speaker of the House':
              continue
          party = cells[3].get_text()
          state = cells[4].get_text()
          yield (name, yob, position, party, state, year)
  def download_all(start_year=1789, stop_year=2017,
                   db_path="us_congress_members.sqlite3"):
      """Rebuild the ``Member`` table from ``pull_congress`` results.

      The ``Member`` table in the SQLite database at ``db_path`` is dropped
      and recreated, then filled with the tuples yielded by
      ``pull_congress(year)`` for every year in
      ``range(start_year, stop_year)``. The ``congress`` column receives the
      last element of each tuple, i.e. the ``year`` that was queried.

      Args:
          start_year: First year to download (inclusive).
          stop_year: Year at which to stop (exclusive, as with ``range``).
          db_path: Path of the SQLite database file to write.

      The inserts are committed once, after every year has been downloaded.
      If a download fails, the connection is closed without committing them.
      """
      from contextlib import closing
      from sqlite3 import connect

      # closing() guarantees the connection is released even when a download
      # or an insert raises part-way through the loop.
      with closing(connect(db_path)) as conn:
          conn.executescript('''
              DROP TABLE IF EXISTS Member;
              CREATE TABLE Member (name TEXT,
                                   yob FLOAT,
                                   position TEXT,
                                   party TEXT,
                                   state TEXT,
                                   congress INTEGER);
          ''')
          for year in range(start_year, stop_year):
              print(f'Downloading for year: {year}')
              # pull_congress only ever yields 6-tuples, so they can be
              # streamed straight into a single parameterized statement.
              conn.executemany('INSERT INTO Member VALUES (?,?,?,?,?,?);',
                               pull_congress(year))
          conn.commit()
  # Run the download only when this file is executed as a script, so that
  # importing the module does not drop the table and start the network crawl.
  if __name__ == "__main__":
      download_all()