Merge pull request #118 from washingtonpost/release/2.1.2

dmnapolitano · web-flow · commit 1388a63eba4c · 2024-10-24T18:48:00.000-04:00
Release/2.1.2 🎉
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # Changelog
 
+## 2.1.2 (10/24/2024)
+- feat: `agg_model_hard_threshold` now defaults to `True`
+- feat: cross-validation to find the optimal OLS `lambda` in the `BootstrapElectionModel` is now optional; pass the `lambda_` model parameter to use a fixed regularization value instead [#115](https://github.com/washingtonpost/elex-live-model/pull/115)
+
 ## 2.1.1 (10/10/2024)
 - fix: allow multiple `alpha` values passed in to `ModelClient.get_national_summary_votes_estimates()` and change that method to return a `pandas.DataFrame` [#111](https://github.com/washingtonpost/elex-live-model/pull/111)
 
diff --git a/README.md b/README.md
@@ -106,7 +106,7 @@ Some model types have specific model parameters that can be included.
 
 | Name                              | Type    | Acceptable values                | model           |
 |-----------------------------------|---------|----------------------------------|-----------------|
-| lambda                            | numeric | regularization constant          | all             |
+| lambda_                           | numeric | regularization constant          | all             |
 | turnout_factor_lower              | numeric | drop units with < turnout factor | all             |
 | turnout_factor_upper              | numeric | drop units with > turnout factor | all             |
 | robust                            | boolean | larger prediction intervals      | `nonparametric` |
diff --git a/setup.py b/setup.py
@@ -20,7 +20,7 @@
     LONG_DESCRIPTION = f.read()
 
 # The full version, including alpha/beta/rc tags
-RELEASE = "2.1.1"
+RELEASE = "2.1.2"
 # The short X.Y version
 VERSION = ".".join(RELEASE.split(".")[:2])
 
diff --git a/src/elexmodel/models/BootstrapElectionModel.py b/src/elexmodel/models/BootstrapElectionModel.py
@@ -55,9 +55,10 @@ def __init__(self, model_settings={}):
         self.strata = model_settings.get("strata", ["county_classification"])  # columns to stratify the data by
         self.T = model_settings.get("T", 5000)  # temperature for aggregate model
         self.hard_threshold = model_settings.get(
-            "agg_model_hard_threshold", False
+            "agg_model_hard_threshold", True
         )  # use sigmoid or hard threshold when calculating agg model
         self.district_election = model_settings.get("district_election", False)
+        self.lambda_ = model_settings.get("lambda_", None)  # regularization parameter for OLS
 
         # upper and lower bounds for the quantile regression which define the strata distributions
         # these make sure that we can control the worst cases for the distributions in case we
@@ -807,8 +808,14 @@ def compute_bootstrap_errors(
         )
 
         # unless the user provided lambda_, we use k-fold cross validation to find the optimal lambda for our OLS regression
-        optimal_lambda_y = self.cv_lambda(x_train, y_train, np.logspace(-3, 2, 20), weights=weights_train)
-        optimal_lambda_z = self.cv_lambda(x_train, z_train, np.logspace(-3, 2, 20), weights=weights_train)
+        if self.lambda_ is None:
+            optimal_lambda_y = self.cv_lambda(x_train, y_train, np.logspace(-3, 2, 20), weights=weights_train)
+            optimal_lambda_z = self.cv_lambda(x_train, z_train, np.logspace(-3, 2, 20), weights=weights_train)
+            LOG.info(f"Optimal lambda for y: {optimal_lambda_y}, Optimal lambda for z: {optimal_lambda_z}")
+        else:
+            optimal_lambda_y = self.lambda_
+            optimal_lambda_z = self.lambda_
+            LOG.info(f"Using user provided lambda: {self.lambda_}")
 
         # step 1) fit the initial model
         # we don't want to regularize the intercept or the coefficient for baseline_normalized_margin
@@ -1490,7 +1497,7 @@ def get_national_summary_estimates(self, nat_sum_data_dict: dict, base_to_add: i
 
         # we also need a national aggregate point prediction
         if self.hard_threshold:
-            aggregate_dem_probs_total = self.aggregate_pred_margin > 0.5
+            aggregate_dem_probs_total = self.aggregate_pred_margin > 0
         else:
             aggregate_dem_probs_total = expit(self.T * self.aggregate_pred_margin)
 
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -876,5 +876,9 @@ def test_get_national_summary_votes_estimates(model_client, va_governor_county_d
 
     current = model_client.get_national_summary_votes_estimates(None, 0, [0.99])
 
-    pd.testing.assert_frame_equal(current, model_client.results_handler.final_results["nat_sum_data"])
-    pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"])
+    pd.testing.assert_frame_equal(
+        current, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )
+    pd.testing.assert_frame_equal(
+        expected_df, model_client.results_handler.final_results["nat_sum_data"], check_dtype=False
+    )