Skip to content

Commit db7b503

Browse files
authored
Merge pull request #140 from washingtonpost/hotfix-extrap-house
hotfix-extrap-house
2 parents 2bd9a0f + d2f9408 commit db7b503

File tree

3 files changed

+84
-4
lines changed

3 files changed

+84
-4
lines changed

src/elexmodel/client.py

+29-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from collections import defaultdict
2+
from io import StringIO
23

34
import numpy as np
45
import pandas as pd
@@ -339,6 +340,31 @@ def get_estimates(
339340
versioned_data_handler = None
340341
else:
341342
versioned_data_handler = None
343+
344+
if model_parameters.get("correct_from_presidential", False):
345+
s3_client = s3.S3CsvUtil(TARGET_BUCKET)
346+
baseline_path = f"{S3_FILE_PATH}/{self.election_id}/data/P/data_county.csv"
347+
results_path = f"{S3_FILE_PATH}/{self.election_id}/results/P/county/current.csv"
348+
predictions_path = f"{S3_FILE_PATH}/{self.election_id}/predictions/P/county/unit_data/current.csv"
349+
pres_baseline = pd.read_csv(StringIO(s3_client.get(baseline_path)), dtype={"geographic_unit_fips": str})
350+
pres_baseline["baseline_normalized_margin"] = (pres_baseline.baseline_dem - pres_baseline.baseline_gop) / (
351+
pres_baseline.baseline_dem + pres_baseline.baseline_gop
352+
)
353+
pres_results = pd.read_csv(StringIO(s3_client.get(results_path)), dtype={"geographic_unit_fips": str})
354+
pres_predictions = pd.read_csv(
355+
StringIO(s3_client.get(predictions_path)), dtype={"geographic_unit_fips": str}
356+
)
357+
pres_predictions = pres_predictions.merge(
358+
pres_results[["geographic_unit_fips", "results_weights"]], on="geographic_unit_fips", how="left"
359+
)
360+
pres_predictions = pres_predictions.merge(
361+
pres_baseline[["geographic_unit_fips", "baseline_normalized_margin"]],
362+
on="geographic_unit_fips",
363+
how="left",
364+
)
365+
else:
366+
pres_predictions = None
367+
342368
LOG.info("Running model for %s", self.election_id)
343369
LOG.info(
344370
"Model parameters: \n prediction intervals: %s, percent reporting threshold: %s, \
@@ -359,7 +385,9 @@ def get_estimates(
359385
self.model = GaussianElectionModel(model_settings=model_settings)
360386
elif pi_method == "bootstrap":
361387
self.model = BootstrapElectionModel(
362-
model_settings=model_settings, versioned_data_handler=versioned_data_handler
388+
model_settings=model_settings,
389+
versioned_data_handler=versioned_data_handler,
390+
pres_predictions=pres_predictions,
363391
)
364392

365393
minimum_reporting_units_max = 0

src/elexmodel/handlers/data/VersionedData.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,11 @@ def compute_estimated_margin(df):
117117
# because the AP adjusted its model after the fact. We correct for this here.
118118
# we recompute the percent_expected_vote using the last reported value as the max
119119
perc_expected_vote_corr = np.divide(
120-
results_turnout, results_turnout[-1], out=np.zeros_like(results_turnout), where=results_turnout[-1] != 0
120+
results_turnout,
121+
results_turnout[-1],
122+
out=np.zeros_like(results_turnout),
123+
where=results_turnout[-1] != 0,
124+
casting="unsafe",
121125
)
122126

123127
# check if perc_expected_vote_corr is monotone increasing (if not, give up and don't try to estimate a margin)
@@ -190,7 +194,7 @@ def compute_estimated_margin(df):
190194

191195
est_margins = observed_norm_margin * observed_vote + observed_batch_margin * (percs - observed_vote)
192196
est_margins = np.divide(
193-
est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins)
197+
est_margins, percs, where=percs != 0, out=np.zeros_like(est_margins), casting="unsafe"
194198
) # Handle div-by-zero
195199

196200
# Return a DataFrame with the multi-index (geographic_unit_fips, perc)

src/elexmodel/models/BootstrapElectionModel.py

+49-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ class BootstrapElectionModel(BaseElectionModel):
5252
and the epsilons are contest (state/district) level random effects.
5353
"""
5454

55-
def __init__(self, model_settings={}, versioned_data_handler=None):
55+
def __init__(self, model_settings={}, versioned_data_handler=None, pres_predictions=None):
5656
super().__init__(model_settings)
5757
self.B = model_settings.get("B", 500) # number of bootstrap samples
5858
self.strata = model_settings.get("strata", ["county_classification"]) # columns to stratify the data by
@@ -61,6 +61,7 @@ def __init__(self, model_settings={}, versioned_data_handler=None):
6161
"agg_model_hard_threshold", True
6262
) # use sigmoid or hard threshold when calculating agg model
6363
self.district_election = model_settings.get("district_election", False)
64+
6465
self.lambda_ = model_settings.get("lambda_", None) # regularization parameter for OLS
6566

6667
# save versioned data for later use
@@ -70,6 +71,10 @@ def __init__(self, model_settings={}, versioned_data_handler=None):
7071
self.extrapolate_std_method = model_settings.get("extrapolate_std_method", "std")
7172
self.max_dist_to_observed = model_settings.get("max_dist_to_observed", 5)
7273

74+
# save presidential predictions for later use
75+
self.pres_predictions = pres_predictions
76+
self.correct_from_presidential = model_settings.get("correct_from_presidential", False)
77+
7378
# upper and lower bounds for the quantile regression which define the strata distributions
7479
# these make sure that we can control the worst cases for the distributions in case we
7580
# haven't seen enough data yet
@@ -1283,6 +1288,49 @@ def compute_bootstrap_errors(
12831288
extrap_filter
12841289
]
12851290

1291+
if self.correct_from_presidential:
1292+
nonreporting_units["geographic_unit_fips_p"] = nonreporting_units.geographic_unit_fips.apply(
1293+
lambda x: x.split("_")[1]
1294+
)
1295+
nonreporting_units = nonreporting_units.merge(
1296+
self.pres_predictions,
1297+
left_on="geographic_unit_fips_p",
1298+
right_on="geographic_unit_fips",
1299+
how="left",
1300+
suffixes=("", "_pres"),
1301+
)
1302+
1303+
# adjust results_normalized_margin_pres to account for split counties
1304+
1305+
nonreporting_units["margin_adj"] = (
1306+
nonreporting_units.baseline_normalized_margin - nonreporting_units.baseline_normalized_margin_pres
1307+
)
1308+
1309+
nonreporting_units["results_normalized_margin_pres"] = (
1310+
nonreporting_units.results_margin_pres / nonreporting_units.results_weights_pres
1311+
+ nonreporting_units.margin_adj
1312+
)
1313+
nonreporting_units["pred_normalized_margin_pres"] = (
1314+
nonreporting_units.pred_margin / nonreporting_units.pred_turnout + nonreporting_units.margin_adj
1315+
)
1316+
1317+
nonreporting_units["pred_normalized_margin"] = np.mean(
1318+
y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper), axis=1
1319+
)
1320+
1321+
nonreporting_units["margin_gap"] = (
1322+
nonreporting_units.results_normalized_margin - nonreporting_units.results_normalized_margin_pres
1323+
)
1324+
1325+
nonreporting_units["pred_normalized_margin_new"] = (
1326+
nonreporting_units.pred_normalized_margin_pres + nonreporting_units.margin_gap
1327+
)
1328+
adjustment = (
1329+
nonreporting_units["pred_normalized_margin_new"].values
1330+
- nonreporting_units["pred_normalized_margin"].values
1331+
)
1332+
y_test_pred_B[~np.isnan(adjustment)] += adjustment[~np.isnan(adjustment)].reshape(-1, 1)
1333+
12861334
y_test_pred_B = y_test_pred_B.clip(min=y_partial_reporting_lower, max=y_partial_reporting_upper)
12871335

12881336
# \tilde{y_i}^{b} * \tilde{z_i}^{b}

0 commit comments

Comments
 (0)