123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215 |
- import pandas as pd
- import numpy as np
- from bokeh.plotting import figure
- from bokeh.models import HoverTool
- binning_age_max = 110
- binning_career_max = 70
- def save_fig(fig, filename):
- from bokeh.embed import components
- script, div = components(fig)
- with open(f'output/{filename}.js', 'w') as f:
- f.write(script)
- with open(f'output/{filename}.html', 'w') as f:
- f.write(div)
- def percentile_from_pdf(pdf, bin_centers, percentile=0.5):
- cdf = 0
- for pdf_val, bin_low, bin_high in zip(pdf, bin_centers[:-1], bin_centers[1:]):
- if cdf+pdf_val > percentile:
- return bin_low + (percentile - cdf)*(bin_high-bin_low) / pdf_val
- cdf += pdf_val
- raise ValueError(f"couldn't find percentile: {percentile}, cdf: {cdf}" )
- def pdf_stats(pdf, bins):
- from collections import namedtuple
- Stats = namedtuple('Stats', ['hist', 'mean', 'median', 'quart_high', 'quart_low'])
- bin_centers = (bins[:-1] + bins[1:])/2
- mean = np.average(bin_centers, weights=pdf)
- median = percentile_from_pdf(pdf, bin_centers)
- quart_low = percentile_from_pdf(pdf, bin_centers, 0.25)
- quart_high = percentile_from_pdf(pdf, bin_centers, 0.75)
- return Stats((pdf, bins), mean, median, quart_high, quart_low)
- def get_congress(year, parties=None, states=None, positions=None):
- query = f'''\
- SELECT yob, career_length FROM Member
- WHERE congress={year}'''
- if positions:
- query += ' AND position IN (' + ', '.join(f'"{position}"' for position in positions) + ')'
- if parties:
- query += ' AND party IN (' + ', '.join(f'"{party}"' for party in parties) + ')'
- if states:
- query += ' AND state IN (' + ', '.join(f'"{state}"' for state in states) + ')'
- data = pd.read_sql_query(query, 'sqlite:///us_congress_members.sqlite3')
- data['age'] = year - data.yob
- return data
- def get_stats_congress(year, age_or_term, parties=None, states=None, positions=None):
- data = get_congress(year, parties, states, positions)
- if len(data) == 0:
- return None
- if age_or_term == "Age":
- pdf, bins = np.histogram(data.age, bins=binning_age_max, range=(0, binning_age_max), density=True)
- else:
- pdf, bins = np.histogram(data.career_length, bins=binning_career_max, range=(0, binning_career_max),
- density=True)
- return pdf_stats(pdf, bins)
- def get_stats_genpop(year, sex=None, states=None):
- query = f'''\
- SELECT perwt, age FROM person
- WHERE year={year}
- '''
- if sex:
- query += f' AND sex={sex}'
- if states:
- query += f' AND statefip IN (' + ', '.join(f'{state}' for state in states) + ')'
- data = pd.read_sql_query(query, 'sqlite:///usa_00001.sqlite3')
- pdf, bins = np.histogram(data.age, bins=binning_age_max, range=(0, binning_age_max), weights=data.perwt,
- density=True)
- return pdf_stats(pdf, bins)
- def plot_yearly_stats(figname):
- hover = HoverTool(tooltips=[(f'Age', "@y{00.0}")],
- mode='vline')
- fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'],
- sizing_mode='scale_width', plot_width=700, plot_height=450,
- toolbar_location="right")
- genpop_stats = {}
- congress_stats = {}
- for year in [1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2016]:
- genpop_stats[year] = get_stats_genpop(year)
- for year in range(1850, 2017):
- congress_stats[year] = get_stats_congress(year, 'Age', positions=['Senator', 'Representative'])
- congress_years = []
- congress_medians = []
- congress_quart_highs = []
- congress_quart_lows = []
- for year, year_stats in congress_stats.items():
- congress_years.append(year)
- congress_medians.append(year_stats.median)
- congress_quart_highs.append(year_stats.quart_high)
- congress_quart_lows.append(year_stats.quart_low)
- genpop_years = []
- genpop_medians = []
- genpop_quart_highs = []
- genpop_quart_lows = []
- for year, year_stats in genpop_stats.items():
- genpop_years.append(year)
- genpop_medians.append(year_stats.median)
- genpop_quart_highs.append(year_stats.quart_high)
- genpop_quart_lows.append(year_stats.quart_low)
- def do_plot(years, medians, quart_highs, quart_lows, color, label):
- fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3)
- fig.line(years, medians, line_color=color, line_width=2, legend=label, level='overlay')
- do_plot(genpop_years, genpop_medians, genpop_quart_highs, genpop_quart_lows, 'blue', 'U.S. Population')
- do_plot(congress_years, congress_medians, congress_quart_highs, congress_quart_lows, 'red', 'Congress')
- fig.legend.location = 'bottom_right'
- fig.xaxis.axis_label = 'Year'
- fig.yaxis.axis_label = 'Age'
- fig.y_range.start = 0
- save_fig(fig, figname)
- def plot_partisan_stats(age_or_term, figname):
- hover = HoverTool(tooltips=[(f'{age_or_term}', "@y{00.0}")],
- mode='vline')
- fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'],
- sizing_mode='scale_width', plot_width=700, plot_height=450,
- toolbar_location="right")
- parties = [('Republican', 'red'), ('Democrat', 'blue')]
- for (party, color) in parties:
- stats = {}
- for year in range(1850, 2017):
- stat = get_stats_congress(year, age_or_term, parties=[party], positions=['Senator', 'Representative'])
- if stat:
- stats[year] = stat
- years = []
- medians = []
- quart_highs = []
- quart_lows = []
- for year, year_stats in stats.items():
- years.append(year)
- medians.append(year_stats.median)
- quart_highs.append(year_stats.quart_high)
- quart_lows.append(year_stats.quart_low)
- fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3)
- fig.line(years, medians, line_color=color, line_width=2, legend=party, level='overlay')
- fig.legend.location = 'bottom_right'
- fig.xaxis.axis_label = 'Year'
- fig.yaxis.axis_label = age_or_term
- fig.y_range.start = 0
- save_fig(fig, figname)
- def plot_chamber_stats(age_or_term, figname):
- from bokeh.models import HoverTool
- hover = HoverTool(tooltips=[(f'{age_or_term}', "@y{00.0}")],
- mode='vline')
- fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'],
- sizing_mode='scale_width', plot_width=700, plot_height=450,
- toolbar_location="right")
- chambers = [('Senator', 'Senate', 'red'), ('Representative', 'House', 'blue')]
- for (position, chamber, color) in chambers:
- stats = {}
- for year in range(1850, 2017):
- stat = get_stats_congress(year, age_or_term, positions=[position])
- if stat is not None:
- stats[year] = stat
- years = []
- medians = []
- quart_highs = []
- quart_lows = []
- for year, year_stats in stats.items():
- years.append(year)
- medians.append(year_stats.median)
- quart_highs.append(year_stats.quart_high)
- quart_lows.append(year_stats.quart_low)
- fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3)
- fig.line(years, medians, line_color=color, line_width=2, legend=chamber, level='overlay')
- fig.legend.location = 'bottom_right'
- fig.xaxis.axis_label = 'Year'
- fig.yaxis.axis_label = age_or_term
- fig.y_range.start = 0
- save_fig(fig, figname)
- if __name__ == '__main__':
- plot_yearly_stats('congress_ages')
- plot_chamber_stats('Age', 'chamber_age')
- plot_chamber_stats('Career Length', 'chamber_career_length')
- plot_partisan_stats('Age', 'partisan_age')
- plot_partisan_stats('Career Length', 'partisan_career_length')
|