aryn-ai · alexaryn · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 5, 2024
diff --git a/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py b/lib/sycamore/sycamore/tests/unit/transforms/test_standardizer.py
@@ -5,6 +5,7 @@
     DateTimeStandardizer,
     ignore_errors,
 )
+import pytest
 import unittest
 from datetime import date, datetime
 
@@ -266,3 +267,41 @@ def test_ignore_errors_key_missing(self):
         key_path = ["nonExistentKey"]
         expected_output = {"event": {"coolKey": ""}}
         self.assertEqual(ignore_errors(doc, standardizer, key_path), expected_output)
+
+
+@pytest.mark.parametrize(
+    "raw, want",
+    [
+        ("March 17, 2023, 14.25 Local", "2023-03-17 14:25:00"),
+        ("March 17, 2023, 14.25", "2023-03-17 14:25:00"),
+        ("March 17, 2023 14:25:00", "2023-03-17 14:25:00"),
+        ("March 17, 2023 2:25PM", "2023-03-17 14:25:00"),
+        ("March 17, 2023 2:25AM", "2023-03-17 02:25:00"),
+        ("17 March 2023 14:25", "2023-03-17 14:25:00"),
+        ("2023-07-15 10.30.00", "2023-07-15 10:30:00"),
+        ("15/07/2023 10.30.00", "2023-07-15 10:30:00"),
+        ("2023-07-15 10.30.00 Local", "2023-07-15 10:30:00"),
+        ("2023-07-15 10.30.00PDT", "2023-07-15 10:30:00-07:00"),
+        ("2024/6/01 23:59:59 PDT", "2024-06-01 23:59:59-07:00"),
+        ("2024/12/04 15:25:39 PST", "2024-12-04 15:25:39-08:00"),
+        ("03/02/1995 0815 CST", "1995-03-02 08:15:00-06:00"),
+        ("03/02/1995 081500 CST", "1995-03-02 08:15:00-06:00"),
+        ("3/2/95 0815", "1995-03-02 08:15:00"),
+        ("1995-03-02 0815 CST", "1995-03-02 08:15:00-06:00"),
+        ("1995-03-02 081500 CST", "1995-03-02 08:15:00-06:00"),
+        ("4/30/1970 10:15:00 JST", "1970-04-30 10:15:00+09:00"),
+        ("1/2/2034 12:13:14 GMT", "2034-01-02 12:13:14+00:00"),
+        ("2034-01-02T12:13:14+00:00", "2034-01-02 12:13:14+00:00"),
+        ("", "fail"),
+        ("wrongdate", "fail"),
+        ("2023123-07-15 10.30.00 Local", "fail"),
+        ("April 1, 1999 1259", "fail"),
+    ],
+)
+def test_date_fixer(raw, want):
+    """Check fixer's ISO rendering of each input; "fail" stands for a ValueError."""
+    try:
+        got = DateTimeStandardizer.fixer(raw).isoformat(sep=" ", timespec="seconds")
+    except ValueError:
+        got = "fail"
+    assert got == want
diff --git a/lib/sycamore/sycamore/transforms/standardizer.py b/lib/sycamore/sycamore/transforms/standardizer.py
@@ -185,6 +185,9 @@ class DateTimeStandardizer(Standardizer):
     """
 
     DEFAULT_FORMAT = "%B %d, %Y %H:%M:%S%Z"
+    clock_re = re.compile(r"\d:[0-5]\d")  # digit, colon, then a 00-59 pair
+    year_re = re.compile(r"([12]\d\d\d-)|(/[12]\d\d\d)|(\d/[0-3]?\d/\d)")  # YYYY-, /YYYY, or n/n/n
+    digitpair_re = re.compile(r"([0-2]\d)([0-5]\d)(\d\d)?")  # bare-digit HHMM or HHMMSS
 
     @staticmethod
     def fixer(raw_dateTime: str) -> datetime:
@@ -205,10 +208,11 @@ def fixer(raw_dateTime: str) -> datetime:
         """
         assert raw_dateTime is not None, "raw_dateTime is None"
         try:
-            raw_dateTime = raw_dateTime.strip()
+            raw_dateTime = DateTimeStandardizer.preprocess(raw_dateTime)
             raw_dateTime = raw_dateTime.replace("Local", "")
             raw_dateTime = raw_dateTime.replace("local", "")
             raw_dateTime = raw_dateTime.replace(".", ":")
+            logger.debug("Preprocessed date string: %s", raw_dateTime)
             parsed = dateparser.parse(raw_dateTime)
             if not parsed:
                 raise ValueError(f"Invalid date format: {raw_dateTime}")
@@ -222,6 +226,35 @@ def fixer(raw_dateTime: str) -> datetime:
             # Handle any other exceptions
             raise RuntimeError(f"Unexpected error occurred while processing: {raw_dateTime}") from e
 
+    @staticmethod
+    def preprocess(raw: str) -> str:
+        """Rewrite a lone digits-only military time (0815, 081500) as HH:MM[:SS].
+
+        The stripped input is returned as-is when a token already looks like a
+        clock time, when no token looks like a date with a year, or when the
+        number of all-digit clock tokens is not exactly one (a bare number such
+        as 2024 could be a time or a year).  Otherwise that single token gets
+        colons inserted and the tokens are re-joined with single spaces.
+        """
+        cls = DateTimeStandardizer
+        raw = raw.strip()
+        tokens = raw.split()
+        saw_year = False
+        hits: list[tuple[int, re.Match[str]]] = []
+        for index, token in enumerate(tokens):
+            if cls.clock_re.search(token):
+                return raw  # an explicit clock time exists; nothing to fix
+            if cls.year_re.search(token):
+                saw_year = True
+            elif match := cls.digitpair_re.fullmatch(token):
+                hits.append((index, match))
+        if not saw_year or len(hits) != 1:
+            return raw
+        # fullmatch covers the whole token, so the colon-joined groups replace it.
+        index, match = hits[0]
+        tokens[index] = ":".join(group for group in match.groups() if group)
+        return " ".join(tokens)
+
     @staticmethod
     def standardize(
         doc: Document,
@@ -305,7 +338,7 @@ def ignore_errors(doc: Document, standardizer: Standardizer, key_path: list[str]
     try:
         doc = standardizer.standardize(doc, key_path=key_path)
     except KeyError:
-        logger.warn(f"Key {key_path} not found in document: {doc}")
+        logger.warning(f"Key {key_path} not found in document: {doc}")
     except Exception as e:
         logger.error(e)
     return doc