import pandas as pd import numpy as np from bokeh.plotting import figure from bokeh.models import HoverTool binning_age_max = 110 binning_career_max = 70 def save_fig(fig, filename): from bokeh.embed import components script, div = components(fig) with open(f'output/{filename}.js', 'w') as f: f.write(script) with open(f'output/{filename}.html', 'w') as f: f.write(div) def percentile_from_pdf(pdf, bin_centers, percentile=0.5): cdf = 0 for pdf_val, bin_low, bin_high in zip(pdf, bin_centers[:-1], bin_centers[1:]): if cdf+pdf_val > percentile: return bin_low + (percentile - cdf)*(bin_high-bin_low) / pdf_val cdf += pdf_val raise ValueError(f"couldn't find percentile: {percentile}, cdf: {cdf}" ) def pdf_stats(pdf, bins): from collections import namedtuple Stats = namedtuple('Stats', ['hist', 'mean', 'median', 'quart_high', 'quart_low']) bin_centers = (bins[:-1] + bins[1:])/2 mean = np.average(bin_centers, weights=pdf) median = percentile_from_pdf(pdf, bin_centers) quart_low = percentile_from_pdf(pdf, bin_centers, 0.25) quart_high = percentile_from_pdf(pdf, bin_centers, 0.75) return Stats((pdf, bins), mean, median, quart_high, quart_low) def get_congress(year, parties=None, states=None, positions=None): query = f'''\ SELECT yob, career_length FROM Member WHERE congress={year}''' if positions: query += ' AND position IN (' + ', '.join(f'"{position}"' for position in positions) + ')' if parties: query += ' AND party IN (' + ', '.join(f'"{party}"' for party in parties) + ')' if states: query += ' AND state IN (' + ', '.join(f'"{state}"' for state in states) + ')' data = pd.read_sql_query(query, 'sqlite:///us_congress_members.sqlite3') data['age'] = year - data.yob return data def get_stats_congress(year, age_or_term, parties=None, states=None, positions=None): data = get_congress(year, parties, states, positions) if len(data) == 0: return None if age_or_term == "Age": pdf, bins = np.histogram(data.age, bins=binning_age_max, range=(0, binning_age_max), density=True) else: pdf, bins = np.histogram(data.career_length, bins=binning_career_max, range=(0, binning_career_max), density=True) return pdf_stats(pdf, bins) def get_stats_genpop(year, sex=None, states=None): query = f'''\ SELECT perwt, age FROM person WHERE year={year} ''' if sex: query += f' AND sex={sex}' if states: query += f' AND statefip IN (' + ', '.join(f'{state}' for state in states) + ')' data = pd.read_sql_query(query, 'sqlite:///usa_00001.sqlite3') pdf, bins = np.histogram(data.age, bins=binning_age_max, range=(0, binning_age_max), weights=data.perwt, density=True) return pdf_stats(pdf, bins) def plot_yearly_stats(figname): hover = HoverTool(tooltips=[(f'Age', "@y{00.0}")], mode='vline') fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'], sizing_mode='scale_width', plot_width=700, plot_height=450, toolbar_location="right") genpop_stats = {} congress_stats = {} for year in [1850, 1860, 1870, 1880, 1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2016]: genpop_stats[year] = get_stats_genpop(year) for year in range(1850, 2017): congress_stats[year] = get_stats_congress(year, 'Age', positions=['Senator', 'Representative']) congress_years = [] congress_medians = [] congress_quart_highs = [] congress_quart_lows = [] for year, year_stats in congress_stats.items(): congress_years.append(year) congress_medians.append(year_stats.median) congress_quart_highs.append(year_stats.quart_high) congress_quart_lows.append(year_stats.quart_low) genpop_years = [] genpop_medians = [] genpop_quart_highs = [] genpop_quart_lows = [] for year, year_stats in genpop_stats.items(): genpop_years.append(year) genpop_medians.append(year_stats.median) genpop_quart_highs.append(year_stats.quart_high) genpop_quart_lows.append(year_stats.quart_low) def do_plot(years, medians, quart_highs, quart_lows, color, label): fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3) fig.line(years, medians, line_color=color, line_width=2, legend=label, level='overlay') do_plot(genpop_years, genpop_medians, genpop_quart_highs, genpop_quart_lows, 'blue', 'U.S. Population') do_plot(congress_years, congress_medians, congress_quart_highs, congress_quart_lows, 'red', 'Congress') fig.legend.location = 'bottom_right' fig.xaxis.axis_label = 'Year' fig.yaxis.axis_label = 'Age' fig.y_range.start = 0 save_fig(fig, figname) def plot_partisan_stats(age_or_term, figname): hover = HoverTool(tooltips=[(f'{age_or_term}', "@y{00.0}")], mode='vline') fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'], sizing_mode='scale_width', plot_width=700, plot_height=450, toolbar_location="right") parties = [('Republican', 'red'), ('Democrat', 'blue')] for (party, color) in parties: stats = {} for year in range(1850, 2017): stat = get_stats_congress(year, age_or_term, parties=[party], positions=['Senator', 'Representative']) if stat: stats[year] = stat years = [] medians = [] quart_highs = [] quart_lows = [] for year, year_stats in stats.items(): years.append(year) medians.append(year_stats.median) quart_highs.append(year_stats.quart_high) quart_lows.append(year_stats.quart_low) fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3) fig.line(years, medians, line_color=color, line_width=2, legend=party, level='overlay') fig.legend.location = 'bottom_right' fig.xaxis.axis_label = 'Year' fig.yaxis.axis_label = age_or_term fig.y_range.start = 0 save_fig(fig, figname) def plot_chamber_stats(age_or_term, figname): from bokeh.models import HoverTool hover = HoverTool(tooltips=[(f'{age_or_term}', "@y{00.0}")], mode='vline') fig = figure(tools=[hover, 'pan', 'wheel_zoom', 'save', 'reset'], sizing_mode='scale_width', plot_width=700, plot_height=450, toolbar_location="right") chambers = [('Senator', 'Senate', 'red'), ('Representative', 'House', 'blue')] for (position, chamber, color) in chambers: stats = {} for year in range(1850, 2017): stat = get_stats_congress(year, age_or_term, positions=[position]) if stat is not None: stats[year] = stat years = [] medians = [] quart_highs = [] quart_lows = [] for year, year_stats in stats.items(): years.append(year) medians.append(year_stats.median) quart_highs.append(year_stats.quart_high) quart_lows.append(year_stats.quart_low) fig.patch(years + years[::-1], quart_highs + quart_lows[::-1], fill_color=color, alpha=0.3) fig.line(years, medians, line_color=color, line_width=2, legend=chamber, level='overlay') fig.legend.location = 'bottom_right' fig.xaxis.axis_label = 'Year' fig.yaxis.axis_label = age_or_term fig.y_range.start = 0 save_fig(fig, figname) if __name__ == '__main__': plot_yearly_stats('congress_ages') plot_chamber_stats('Age', 'chamber_age') plot_chamber_stats('Career Length', 'chamber_career_length') plot_partisan_stats('Age', 'partisan_age') plot_partisan_stats('Career Length', 'partisan_career_length')