Skip to content

Commit

Permalink
Merge pull request #1026 from akrherz/dsm_report_field
Browse files Browse the repository at this point in the history
DSM Quality of Life Improvements
  • Loading branch information
akrherz authored Feb 27, 2025
2 parents c637638 + fcc9092 commit 5ed4e3c
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 178 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,18 @@ All notable changes to this library are documented in this file.
- Drop `summary` database table processing of `max_tmpf_qc`, `min_tmpf_qc`,
`pday_qc`, and `snow_qc`. These are ill-designed and unused.
- Drop poorly designed `iemdb` support within `webutil.iemapp`.
- Internal refactor of `WMOProduct` timestamp processing in parent class.

### New Features

- Add `text` `pattern` support within `get_autoplot_context`.
- Introduce `database.sql_helper` as a convenience helper to simplify
  combined psycopg + sqlalchemy + pandas usage.
- Introduce `database.with_sqlalchemy_conn` decorator helper.
- Introduce `util.ddhhmm2datetime` helper to convert a WMO header timestamp
to a UTC timestamp.
- Support decorated `webutil.iemapp` functions that return generators.
- Write DSM product_id to IEMAccess summary table.

### Bug Fixes

Expand Down
166 changes: 1 addition & 165 deletions src/pyiem/nws/product.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@

import re
from collections import OrderedDict
from datetime import datetime, timedelta, timezone
from datetime import timedelta, timezone
from typing import Optional, Union
from zoneinfo import ZoneInfo

from shapely.geometry import MultiPolygon, Polygon
from shapely.wkt import dumps
Expand All @@ -19,22 +18,6 @@
# but in practice it is sometimes something between 4 and 6 chars
# We need to be careful this does not match the LDM sequence identifier
AFOSRE = re.compile(r"^([A-Z0-9]{4,6})\s*\t*$", re.M)
TIME_FMT = (
"([0-9:]+) (AM|PM) ([A-Z][A-Z][A-Z]?T) ([A-Z][A-Z][A-Z]) "
"([A-Z][A-Z][A-Z]) ([0-9]+) ([1-2][0-9][0-9][0-9])"
)
TIME_RE = re.compile(f"^{TIME_FMT}$", re.M | re.IGNORECASE)
TIME_UTC_RE = re.compile(
TIME_FMT.replace("(AM|PM) ([A-Z][A-Z][A-Z]?T)", r"(AM|PM)?\s?(UTC)"),
re.M | re.I,
)
# Sometimes products have a duplicated timestamp in another tz
TIME_EXT_RE = re.compile(
rf"^{TIME_FMT}\s?/\s?{TIME_FMT}\s?/$", re.M | re.IGNORECASE
)
# Without the line start and end requirement
TIME_RE_ANYWHERE = re.compile(f"{TIME_FMT}", re.IGNORECASE)
TIME_STARTS_LINE = re.compile(r"^([0-9:]+) (AM|PM)")

TIME_MOT_LOC = re.compile(
r"TIME\.\.\.MOT\.\.\.LOC\s+(?P<ztime>[0-9]{4})Z\s+"
Expand Down Expand Up @@ -171,54 +154,6 @@ def str2polygon(strdata):
return Polygon(pts)


def date_tokens2datetime(tokens):
    """Convert tokens from MND regex to a valid time, if possible.

    Args:
        tokens: regex groups in TIME_FMT order, i.e.
            (hhmi, AM|PM, tz abbreviation, day-of-week, month, day, year).

    Returns:
        z (str): 3-4 char timezone string
        tz (datetime.timezone): of this product
        utcvalid (datetimetz): of this product

    Raises:
        ValueError: when the assembled date string fails strptime parsing.
    """
    tokens = list(tokens)  # ensure mutable
    # Timezone abbreviation, mapped to an IANA zone name with a UTC fallback
    z = tokens[2].upper()
    tz = ZoneInfo(reference.name2pytz.get(z, "UTC"))
    hhmi = tokens[0]
    # False positive from regex
    if hhmi[0] == ":":
        hhmi = hhmi.replace(":", "")
    if hhmi.find(":") > -1:
        (hh, mi) = hhmi.split(":")
    elif len(hhmi) < 3:
        # One or two digits is an hour with zero minutes, e.g. "7" -> 7:00
        hh = hhmi
        mi = 0
    else:
        # Trailing two digits are the minutes, e.g. "730" -> 7:30
        hh = hhmi[:-2]
        mi = hhmi[-2:]
    # Workaround another 24 hour clock issue
    if (
        tokens[2] in ["UTC", "GMT"]
        and tokens[1].upper() == "AM"
        and int(hh) == 12
    ):
        hh = 0
    # Workaround 24 hour clock abuse
    if int(hh) >= 12 and (
        tokens[1].upper() == "PM" or tokens[2] in ["UTC", "GMT"]
    ):
        # this is a hack to ensure this is PM when we are in UTC
        tokens[1] = "PM"
        hh = int(hh) - 12
    # Rebuild a 12-hour-clock string for strptime; empty AM/PM means AM
    dstr = (
        f"{hh if int(hh) > 0 else 12}:{mi} "
        f"{tokens[1] if tokens[1] != '' else 'AM'} "
        f"{tokens[4]} {tokens[5]} {tokens[6]}"
    )
    # Careful here, need to go to UTC time first then come back!
    now = datetime.strptime(dstr, "%I:%M %p %b %d %Y")
    now += timedelta(hours=reference.offsets.get(z, 0))
    return z, tz, now.replace(tzinfo=timezone.utc)


def qc_is_emergency(seg):
"""Belt + Suspenders check that this segment is an emergency."""
ffdt = seg.flood_tags.get("FLASH FLOOD DAMAGE THREAT")
Expand Down Expand Up @@ -675,15 +610,10 @@ def __init__(
self.nwsli_provider = nwsli_provider
self.unixtext = self.text.replace("\r", "")
self.sections = self.unixtext.split("\n\n")
# The "truth" timestamp
self.valid = None
self.segments = []
self.z = None
self.tz = None
self.geometry = None

self.parse_afos()
self._parse_valid(utcnow)
if parse_segments:
self.parse_segments()

Expand Down Expand Up @@ -856,100 +786,6 @@ def get_product_id(self):
pid += f"-{self.bbb}"
return pid.strip()

def _parse_valid(self, provided_utcnow):
"""Figure out the timestamp of this product.
Args:
provided_utcnow (datetime): What our library was provided for the UTC
timestamp, it could be None
"""
# The MND header hopefully has a full timestamp that is the best
# truth that we can have for this product.
tokens = TIME_RE.findall(self.unixtext)
if not tokens:
tokens = TIME_EXT_RE.findall(self.unixtext)
if not tokens:
tokens = TIME_RE_ANYWHERE.findall(self.unixtext)
if not tokens:
tokens = TIME_UTC_RE.findall(self.unixtext)
if not tokens:
# We are very desperate at this point, evasive action
for line in self.unixtext.split("\n")[:15]:
if TIME_STARTS_LINE.match(line):
# Remove anything inside of () or //
line = re.sub(r" \(.*?\)", "", line)
line = re.sub(r" /.*?/", "", line)
tokens = TIME_RE.findall(line)
break
if provided_utcnow is None and tokens:
try:
z, _tz, valid = date_tokens2datetime(tokens[0])
if z not in reference.offsets:
self.warnings.append(f"product timezone '{z}' unknown")
except ValueError as exp:
msg = (
f"Invalid timestamp [{' '.join(tokens[0])}] found in "
f"product [{self.wmo} {self.source} {self.afos}] header"
)
raise TextProductException(self.source[1:], msg) from exp

# Set the utcnow based on what we found by looking at the header
self.utcnow = valid

# Search out the WMO header, this had better always be there
# We only care about the first hit in the file, searching from top
# Take the first hit, ignore others
wmo_day = int(self.ddhhmm[:2])
wmo_hour = int(self.ddhhmm[2:4])
wmo_minute = int(self.ddhhmm[4:])

self.wmo_valid = self.utcnow.replace(
hour=wmo_hour, minute=wmo_minute, second=0, microsecond=0
)
if wmo_day != self.utcnow.day:
if wmo_day - self.utcnow.day == 1: # Tomorrow
self.wmo_valid = self.wmo_valid.replace(day=wmo_day)
elif wmo_day > 25 and self.utcnow.day < 15: # Previous month!
self.wmo_valid = self.wmo_valid + timedelta(days=-10)
self.wmo_valid = self.wmo_valid.replace(day=wmo_day)
elif wmo_day < 5 and self.utcnow.day >= 15: # next month
self.wmo_valid = self.wmo_valid + timedelta(days=10)
self.wmo_valid = self.wmo_valid.replace(day=wmo_day)
else:
self.wmo_valid = self.wmo_valid.replace(day=wmo_day)

# we can do no better
self.valid = self.wmo_valid

# If we don't find anything, lets default to now, its the best
if not tokens:
return
self.z, self.tz, self.valid = date_tokens2datetime(tokens[0])
# We want to forgive two easy situations
offset = (self.valid - self.wmo_valid).total_seconds()
# 1. self.valid is off from WMO by approximately 12 hours (am/pm flip)
if 42900 <= offset <= 43800:
LOG.info(
"Auto correcting AM/PM typo, %s -> %s",
self.valid,
self.wmo_valid,
)
self.warnings.append(
"Detected AM/PM flip, adjusting product timestamp - 12 hours"
)
self.valid = self.valid - timedelta(hours=12)
# 2. self.valid is off by approximate 1 year (year typo)
if -367 * 86400 < offset < -364 * 86400:
LOG.info(
"Auto correcting year typo, %s -> %s",
self.valid,
self.wmo_valid,
)
self.warnings.append(
"Detected year typo, adjusting product timestamp + 1 year"
)
self.valid = self.valid.replace(year=self.valid.year + 1)

def get_affected_wfos(self):
"""Based on the ugc_provider, figure out which WFOs are impacted by
this product"""
Expand Down
32 changes: 22 additions & 10 deletions src/pyiem/nws/products/dsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def compute_times(self, utcnow):
self.date, self.groupdict.get("time_sped_gust_max")
)

def sql(self, txn):
def sql(self, txn, product_id: str = None):
"""Persist to database given the transaction object."""
cols = []
args = []
Expand Down Expand Up @@ -167,13 +167,15 @@ def sql(self, txn):
return False
cs = ", ".join([f"{c} = %s" for c in cols])
slicer = slice(0, 4) if self.station[0] != "K" else slice(1, 4)
args.extend([self.station[slicer], self.date])
args.extend([product_id, product_id, self.station[slicer], self.date])
txn.execute(
(
f"UPDATE summary_{self.date.year} s SET {cs} FROM stations t "
"WHERE s.iemid = t.iemid and t.network ~* 'ASOS' "
"and t.id = %s and s.day = %s"
),
f"""
UPDATE summary_{self.date:%Y} s SET {cs}
, report = trim(case when %s::text is null then report else
coalesce(report, '') || %s::text || ' ' end)
FROM stations t
WHERE s.iemid = t.iemid and t.network ~* 'ASOS'
and t.id = %s and s.day = %s""",
args,
)
return txn.rowcount == 1
Expand All @@ -194,7 +196,7 @@ def __init__(
self.ugc_provider = ugc_provider
self.nwsli_provider = nwsli_provider
# hold our parsing results
self.data = []
self.data: list[DSMProduct] = []
lines = self.text.replace("\r", "").split("\n")
if len(lines[3]) < 10:
meat = ("".join(lines[4:])).split("=")
Expand All @@ -219,9 +221,19 @@ def tzlocalize(self, tzprovider):
continue
dsm.tzlocalize(tzinfo)

def sql(self, txn):
def sql(self, txn) -> list[bool]:
"""Do databasing."""
return [dsm.sql(txn) for dsm in self.data]
res = []
for dsm in self.data:
# Some magic is happening here to construct a product_id
# that is unique for this DSM and based on the pyWWA splitting of
# DSMs that is done
product_id = (
f"{self.wmo_valid:%Y%m%d%H%M}-{self.source}-{self.wmo}-"
f"DSM{dsm.station[1:]}"
)
res.append(dsm.sql(txn, product_id))
return res


def parser(text, utcnow=None, ugc_provider=None, nwsli_provider=None):
Expand Down
24 changes: 24 additions & 0 deletions src/pyiem/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,30 @@ def _strptime(ins: str, fmt: str, rectify: bool = False) -> datetime:
) from exp


def ddhhmm2datetime(ddhhmm: str, utcnow: datetime) -> datetime:
    """Do the nasty WMO header timestamp conversion.

    Combine a WMO header ``DDHHMM`` group with a reference UTC timestamp,
    resolving day mismatches caused by day/month rollover.

    Args:
        ddhhmm (str): six digit day-of-month, hour, minute string.
        utcnow (datetime): reference timestamp supplying year and month.

    Returns:
        datetime: the computed valid timestamp.
    """
    day = int(ddhhmm[:2])
    valid = utcnow.replace(
        hour=int(ddhhmm[2:4]),
        minute=int(ddhhmm[4:]),
        second=0,
        microsecond=0,
    )
    if day == utcnow.day:
        return valid
    if day - utcnow.day == 1:
        # Product is dated tomorrow relative to utcnow
        return valid.replace(day=day)
    if day > 25 and utcnow.day < 15:
        # Rolled back into the previous month
        return (valid - timedelta(days=10)).replace(day=day)
    if day < 5 and utcnow.day >= 15:
        # Rolled forward into the next month
        return (valid + timedelta(days=10)).replace(day=day)
    return valid.replace(day=day)


def web2ldm(url, ldm_product_name, md5_from_name=False, pqinsert="pqinsert"):
"""Download a URL and insert into LDM.
Expand Down
Loading

0 comments on commit 5ed4e3c

Please sign in to comment.