Merge pull request #100 from washingtonpost/ELEX-3469-save-aggregate-predictions-to-s3

dmnapolitano · web-flow · commit d766bcd82854 · 2024-09-11T15:40:13.000-04:00
ELEX-3469 save aggregate predictions to s3
diff --git a/README.md b/README.md
@@ -72,6 +72,7 @@ Parameters for the CLI tool:
 | called_contests      | dict    | a dictionary of called contests. specific to Bootstrap model for now. e.g. `--called_contests='{"VA": -1}'` |
 | save_output          | list    | `results`, `data`, `config` |
 | unexpected_units     | int     | number of unexpected units to simulate; only used for testing and does not work with historical run |
+| national_summary     | flag    | When not running a historical election, specify this flag to output national summary (aggregate model) estimates. |
 
 Note: When running the model with multiple fixed effects, make sure they are not linearly dependent. For example, `county_fips` and `county_classification` are linearly dependent when run together. That's because every county is in one county class, so all the fixed effect columns of the counties in the county class sum up to the fixed effect column of that county class.
 
diff --git a/src/elexmodel/cli.py b/src/elexmodel/cli.py
@@ -79,6 +79,12 @@ def type_cast_value(self, ctx, value):
     help="options: results, data, config",
 )
 @click.option("--handle_unreporting", "handle_unreporting", default="drop", type=click.Choice(["drop", "zero"]))
+@click.option(
+    "--national_summary",
+    "national_summary",
+    is_flag=True,
+    help="When not running a historical election, output results aggregated to the national level.",
+)
 def cli(
     election_id, estimands, office_id, prediction_intervals, percent_reporting_threshold, geographic_unit_type, **kwargs
 ):
@@ -159,5 +165,10 @@ def cli(
             geographic_unit_type,
             **kwargs
         )
+
+        if kwargs.get("national_summary", False):
+            # TODO: get_national_summary_votes_estimates() arguments via CLI
+            model_client.get_national_summary_votes_estimates(None, 0, 0.99)
+
         for aggregate_level, estimates in result.items():
             print(aggregate_level, "\n", estimates, "\n")
diff --git a/src/elexmodel/client.py b/src/elexmodel/client.py
@@ -41,6 +41,11 @@ def __init__(self):
         self.all_conformalization_data_unit_dict = defaultdict(dict)
         self.all_conformalization_data_agg_dict = defaultdict(dict)
         self.model = None
+        self.results_handler = None
+        self.election_id = None
+        self.office = None
+        self.geographic_unit_type = None
+        self.save_results = None
 
     def _check_input_parameters(
         self,
@@ -170,7 +175,20 @@ def get_aggregate_list(self, office, aggregate):
         return sorted(list(set(raw_aggregate_list)), key=lambda x: AGGREGATE_ORDER.index(x))
 
     def get_national_summary_votes_estimates(self, nat_sum_data_dict=None, base_to_add=0, alpha=0.99):
-        return self.model.get_national_summary_estimates(nat_sum_data_dict, base_to_add, alpha)
+        if self.model is None or self.results_handler is None:  # get_estimates() must have completed first
+            raise ModelClientException(
+                "Must call the get_estimates() method before get_national_summary_votes_estimates()."
+            )
+
+        nat_sum_estimates = self.model.get_national_summary_estimates(nat_sum_data_dict, base_to_add, alpha)
+        self.results_handler.add_national_summary_estimates(nat_sum_estimates)  # -> final_results["nat_sum_data"]
+
+        if APP_ENV != "local" and self.save_results:  # write only the national summary key
+            self.results_handler.write_data(
+                self.election_id, self.office, self.geographic_unit_type, keys=["nat_sum_data"]
+            )
+
+        return nat_sum_estimates
 
     def get_estimates(
         self,
@@ -202,7 +220,7 @@ def get_estimates(
         pi_method = kwargs.get("pi_method", "nonparametric")
         called_contests = kwargs.get("called_contests", None)
         save_output = kwargs.get("save_output", ["results"])
-        save_results = "results" in save_output
+        self.save_results = "results" in save_output
         save_data = "data" in save_output
         save_config = "config" in save_output
         # saving conformalization data only makes sense if a ConformalElectionModel is used
@@ -241,15 +259,18 @@ def get_estimates(
             model_parameters,
             handle_unreporting,
         )
+        self.election_id = election_id
+        self.office = office
+        self.geographic_unit_type = geographic_unit_type
 
-        states_with_election = config_handler.get_states(office)
-        estimand_baselines = config_handler.get_estimand_baselines(office, estimands)
+        states_with_election = config_handler.get_states(self.office)
+        estimand_baselines = config_handler.get_estimand_baselines(self.office, estimands)
 
-        LOG.info("Getting preprocessed data: %s", election_id)
+        LOG.info("Getting preprocessed data: %s", self.election_id)
         preprocessed_data_handler = PreprocessedDataHandler(
-            election_id,
-            office,
-            geographic_unit_type,
+            self.election_id,
+            self.office,
+            self.geographic_unit_type,
             estimands,
             estimand_baselines,
             data=preprocessed_data,
@@ -267,7 +288,7 @@ def get_estimates(
             preprocessed_data,
             current_data,
             estimands,
-            geographic_unit_type,
+            self.geographic_unit_type,
             handle_unreporting=handle_unreporting,
         )
 
@@ -307,8 +328,8 @@ def get_estimates(
             if minimum_reporting_units > minimum_reporting_units_max:
                 minimum_reporting_units_max = minimum_reporting_units
 
-        if APP_ENV != "local" and save_results:
-            data.write_data(election_id, office)
+        if APP_ENV != "local" and self.save_results:
+            data.write_data(self.election_id, self.office)
 
         n_reporting_expected_units = reporting_units.shape[0]
         n_unexpected_units = unexpected_units.shape[0]
@@ -330,44 +351,44 @@ def get_estimates(
         if len(duplicate_units) > 0:
             raise ModelClientException(f"At least one unit appears twice: {duplicate_units}")
 
-        results_handler = ModelResultsHandler(
+        self.results_handler = ModelResultsHandler(
             aggregates, prediction_intervals, reporting_units, nonreporting_units, unexpected_units
         )
 
         for estimand in estimands:
             unit_predictions, unit_turnout_predictions = self.model.get_unit_predictions(
                 reporting_units, nonreporting_units, estimand, unexpected_units=unexpected_units
             )
-            results_handler.add_unit_predictions(estimand, unit_predictions, unit_turnout_predictions)
+            self.results_handler.add_unit_predictions(estimand, unit_predictions, unit_turnout_predictions)
             # gets prediciton intervals for each alpha
             alpha_to_unit_prediction_intervals = {}
             for alpha in prediction_intervals:
                 alpha_to_unit_prediction_intervals[alpha] = self.model.get_unit_prediction_intervals(
-                    results_handler.reporting_units, results_handler.nonreporting_units, alpha, estimand
+                    self.results_handler.reporting_units, self.results_handler.nonreporting_units, alpha, estimand
                 )
                 if isinstance(self.model, ConformalElectionModel):
                     self.all_conformalization_data_unit_dict[alpha][
                         estimand
                     ] = self.model.get_all_conformalization_data_unit()
 
-            results_handler.add_unit_intervals(estimand, alpha_to_unit_prediction_intervals)
+            self.results_handler.add_unit_intervals(estimand, alpha_to_unit_prediction_intervals)
 
-            for aggregate in results_handler.aggregates:
-                aggregate_list = self.get_aggregate_list(office, aggregate)
+            for aggregate in self.results_handler.aggregates:
+                aggregate_list = self.get_aggregate_list(self.office, aggregate)
                 estimates_df = self.model.get_aggregate_predictions(
-                    results_handler.reporting_units,
-                    results_handler.nonreporting_units,
-                    results_handler.unexpected_units,
+                    self.results_handler.reporting_units,
+                    self.results_handler.nonreporting_units,
+                    self.results_handler.unexpected_units,
                     aggregate_list,
                     estimand,
                     called_contests=called_contests,
                 )
                 alpha_to_agg_prediction_intervals = {}
                 for alpha in prediction_intervals:
                     alpha_to_agg_prediction_intervals[alpha] = self.model.get_aggregate_prediction_intervals(
-                        results_handler.reporting_units,
-                        results_handler.nonreporting_units,
-                        results_handler.unexpected_units,
+                        self.results_handler.reporting_units,
+                        self.results_handler.nonreporting_units,
+                        self.results_handler.unexpected_units,
                         aggregate_list,
                         alpha,
                         alpha_to_unit_prediction_intervals[alpha],
@@ -380,15 +401,16 @@ def get_estimates(
                         ] = self.model.get_all_conformalization_data_agg()
 
                 # get all of the prediction intervals here
-                results_handler.add_agg_predictions(
+                self.results_handler.add_agg_predictions(
                     estimand, aggregate, estimates_df, alpha_to_agg_prediction_intervals
                 )
 
-        results_handler.process_final_results()
-        if APP_ENV != "local" and save_results:
-            results_handler.write_data(election_id, office, geographic_unit_type)
+        self.results_handler.process_final_results()
+
+        if APP_ENV != "local" and self.save_results:
+            self.results_handler.write_data(self.election_id, self.office, self.geographic_unit_type)
 
-        return results_handler.final_results
+        return self.results_handler.final_results
 
 
 class HistoricalModelClient(ModelClient):
diff --git a/src/elexmodel/handlers/data/ModelResults.py b/src/elexmodel/handlers/data/ModelResults.py
@@ -25,6 +25,7 @@ def __init__(
         self.aggregates = [agg for agg in aggregates if agg != "unit"]
         self.estimates = {agg: [] for agg in self.aggregates}
         self.unit_data = {}
+        self.final_results = {}
 
         self.reporting_units = reporting_units
         self.nonreporting_units = nonreporting_units
@@ -93,7 +94,6 @@ def process_final_results(self):
         """
         Create final data frames of results
         """
-        self.final_results = {}
         for agg in self.aggregates:
             merge_on = ["postal_code", "reporting", agg]
             # joins together dfs of the same level of aggregation (different estimands)
@@ -106,7 +106,14 @@ def process_final_results(self):
                 lambda x, y: pd.merge(x, y, how="inner", on=merge_on), self.unit_data.values()
             )
 
-    def write_data(self, election_id, office, geographic_unit_type):
+    def add_national_summary_estimates(self, national_summary_dict):  # one row per dict key
+        df = pd.DataFrame.from_dict(
+            national_summary_dict, orient="index", columns=["agg_pred", "agg_lower", "agg_upper"]
+        )
+        df.index.name = "estimand"  # dict keys become the "estimand" column after reset_index()
+        self.final_results["nat_sum_data"] = df.reset_index()  # overwrites any earlier national summary
+
+    def write_data(self, election_id, office, geographic_unit_type, keys=None):
         """
         Saves dataframe of estimates for all estimands to S3
         Different file by aggregate level
@@ -115,6 +122,8 @@ def write_data(self, election_id, office, geographic_unit_type):
             self.process_final_results()
         s3_client = s3.S3CsvUtil(TARGET_BUCKET)
         for key, value in self.final_results.items():
+            if keys is not None and key not in keys:
+                continue
             path = f"{S3_FILE_PATH}/{election_id}/predictions/{office}/{geographic_unit_type}/{key}/current.csv"
             # convert df to csv
             csv_data = convert_df_to_csv(value)
diff --git a/src/elexmodel/models/BaseElectionModel.py b/src/elexmodel/models/BaseElectionModel.py
@@ -172,5 +172,5 @@ def get_coefficients(self) -> dict:
         """
         return self.features_to_coefficients
 
-    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add):
+    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add, alpha):
         raise NotImplementedError()
diff --git a/src/elexmodel/models/ConformalElectionModel.py b/src/elexmodel/models/ConformalElectionModel.py
@@ -206,5 +206,5 @@ def get_all_conformalization_data_agg(cls):
         """
         raise NotImplementedError
 
-    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add):
+    def get_national_summary_estimates(self, nat_sum_data_dict, called_states, base_to_add, alpha):
         raise NotImplementedError()
diff --git a/tests/test_client.py b/tests/test_client.py
@@ -832,3 +832,47 @@ def test_estimandizer_input(model_client, va_governor_county_data, va_config):
         )
     except KeyError:
         pytest.raises("Error with client input for estimandizer")
+
+
+def test_get_national_summary_votes_estimates(model_client, va_governor_county_data, va_config):
+    expected = {"margin": [1.0, 1.0, 1.0]}  # estimand -> [agg_pred, agg_lower, agg_upper]
+    expected_df = pd.DataFrame.from_dict(expected, orient="index", columns=["agg_pred", "agg_lower", "agg_upper"])
+    expected_df.index.name = "estimand"
+    expected_df = expected_df.reset_index()  # same shaping as add_national_summary_estimates()
+
+    election_id = "2017-11-07_VA_G"
+    office_id = "G"
+    geographic_unit_type = "county"
+    estimands = ["margin"]
+    prediction_intervals = [0.9]
+    percent_reporting_threshold = 100
+    kwargs = {"pi_method": "bootstrap", "features": ["baseline_normalized_margin"], "national_summary": True}
+
+    data_handler = MockLiveDataHandler(
+        election_id, office_id, geographic_unit_type, estimands, data=va_governor_county_data
+    )
+
+    data_handler.shuffle()
+    data = data_handler.get_percent_fully_reported(100)
+
+    preprocessed_data = va_governor_county_data.copy()
+    preprocessed_data["last_election_results_turnout"] = preprocessed_data["baseline_turnout"].copy() + 1
+
+    model_client.get_estimates(  # must run first: it sets the client's model and results_handler
+        data,
+        election_id,
+        office_id,
+        estimands,
+        prediction_intervals,
+        percent_reporting_threshold,
+        geographic_unit_type,
+        raw_config=va_config,
+        preprocessed_data=preprocessed_data,
+        save_output=[],  # "results" absent, so the client's save_results is False
+        **kwargs,
+    )
+
+    current = model_client.get_national_summary_votes_estimates(None, 0, 0.99)  # also stored on the handler
+
+    assert expected == current
+    pd.testing.assert_frame_equal(expected_df, model_client.results_handler.final_results["nat_sum_data"])