Merge pull request #48 from artshumrc/natural-language

Update natural language parser
ixc · May 22, 2024 · 077eac5 · 077eac5
2 parents ee1d07d + 26b0afb
commit 077eac5
Show file tree

Hide file tree

Showing 5 changed files with 188 additions and 168 deletions.
diff --git a/README.md b/README.md
@@ -124,8 +124,8 @@ Test coverage includes every example given in the spec table of features.
 
 * Years exceeding four digits:
 
-        >>> parse_edtf('y-12000') # 12000 years BCE
-        LongYear: 'y-12000'
+        >>> parse_edtf('Y-12000') # 12000 years BCE
+        LongYear: 'Y-12000'
 
 * Season:
 
@@ -167,8 +167,8 @@ Test coverage includes every example given in the spec table of features.
 
 * Year requiring more than 4 digits - exponential form:
 
-        >>> parse_edtf('y-17e7')
-        ExponentialYear: 'y-17e7'
+        >>> parse_edtf('Y-17e7')
+        ExponentialYear: 'Y-17e7'
 
 ### Natural language representation
 
@@ -196,43 +196,33 @@ The parser can parse strings such as:
     'c.1860' => '1860~' # the period after 'c' is optional
     'ca1860' => '1860~'
     'approx 1860' => '1860~'
-
-    # masked precision
-    '1860s' => '186x' #186x has decade precision, 186u has year precision.
-    '1800s' => '18xx' # without uncertainty indicators, assume century
-
-    # masked precision + uncertainty
-    'ca. 1860s' => '186x~'
-    'circa 1840s' => '184x~'
-    'ca. 1860s?' => '186x?~'
-    'c1800s?' => '180x?~' # with uncertainty indicators, use the decade
+    'ca. 1860s' => '186X~'
+    'circa 1840s' => '184X~'
+    'ca. 1860s?' => '186X%' # uncertain and approximate combine into '%'
+    'c1800s?' => '180X%' # with uncertainty indicators, use the decade
 
     # unspecified parts
     'January 12' => 'XXXX-01-12'
     'January' => 'XXXX-01'
     '7/2008' => '2008-07'
+    'month in 1872' => '1872-XX'
+    'day in January 1872' => '1872-01-XX'
+    'day in 1872' => '1872-XX-XX'
 
     #seasons
     'Autumn 1872' => '1872-23'
     'Fall 1872' => '1872-23'
 
     # before/after
-    'earlier than 1928' => 'unknown/1928'
-    'later than 1928' => '1928/unknown'
-    'before January 1928' => 'unknown/1928-01'
-    'after about the 1920s' => '192x~/unknown'
-
-    # unspecified
-    'year in the 1860s' => '186u' #186x has decade precision, 186u has year precision.
-    ('year in the 1800s', '18xu')
-    'month in 1872' => '1872-XX'
-    'day in January 1872' => '1872-01-XX'
-    'day in 1872' => '1872-XX-XX'
+    'earlier than 1928' => '/1928'
+    'later than 1928' => '1928/'
+    'before January 1928' => '/1928-01'
+    'after about the 1920s' => '192X~/'
 
     #centuries
-    '1st century' => '00xx'
-    '10c' => '09xx'
-    '19th century?' => '18xx?'
+    '1st century' => '00XX'
+    '10c' => '09XX'
+    '19th century?' => '18XX?'
 
     # just showing off now...
     'a day in about Spring 1849?' => '1849-21-XX%'
@@ -243,8 +233,8 @@ The parser can parse strings such as:
     '1851-1852; printed 1853-1854' => '1851/1852'
     '1851-52' => '1851/1852'
     '1856-ca. 1865' => '1856/1865~'
-    '1860s-1870s' => '186x/187x'
-    '1920s -early 1930s' => '192x/193x'
+    '1860s-1870s' => '186X/187X'
+    '1920s - early 1930s' => '192X/193X'
     '1938, printed 1940s-1950s' => '1938'
 
 

diff --git a/edtf/natlang/en.py b/edtf/natlang/en.py
@@ -14,8 +14,8 @@
 DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
 DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)
 
-SHORT_YEAR_RE = r"(-?)([\du])([\dxu])([\dxu])([\dxu])"
-LONG_YEAR_RE = r"y(-?)([1-9]\d\d\d\d+)"
+SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
+LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
 CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
 CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"
 
@@ -31,7 +31,7 @@ def text_to_edtf(text):
     Generate EDTF string equivalent of a given natural language date string.
     """
     if not text:
-        return
+        return None
 
     t = text.lower()
 
@@ -101,10 +101,9 @@ def text_to_edtf(text):
     is_after = is_after or re.findall(r"\blater\b", t)
 
     if is_before:
-        result = f"unknown/{result}"
+        result = f"/{result}"  # unknown is replaced with null for intervals
     elif is_after:
-        result = f"{result}/unknown"
-
+        result = f"{result}/"  # unknown is replaced with null for intervals
     return result
 
 
@@ -155,7 +154,7 @@ def text_to_edtf_date(text):
     # detect CE/BCE year form
     is_ce = re.findall(CE_RE, t)
     if is_century:
-        result = "%02dxx" % (int(is_century[0][0]) - 1,)
+        result = "%02dXX" % (int(is_century[0][0]) - 1,)
         is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
         is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)
 
@@ -222,25 +221,25 @@ def text_to_edtf_date(text):
             # approximate/uncertain markers to decide whether we treat it as
             # a century or a decade.
             if i == 2 and could_be_century and not (is_approximate or is_uncertain):
-                result += "x"
+                result += "X"
             elif i == 3 and is_decade > 0:
                 if mentions_year:
-                    result += "u"  # year precision
+                    result += "X"  # previously year precision - now just X
                 else:
-                    result += "x"  # decade precision
+                    result += "X"  # previously decade precision - now just X
             elif date1[i] == date2[i]:
                 # since both attempts at parsing produced the same result
                 # it must be parsed value, not a default
                 result += date1[i]
             else:
                 # different values were produced, meaning that it's likely
-                # a default. Use 'unspecified'
-                result += "u"
+                # a default. Use 'X'
+                result += "X"
 
         # strip off unknown chars from end of string - except the first 4
 
         for i in reversed(xrange(len(result))):
-            if result[i] not in ("u", "x", "-"):
+            if result[i] not in ("X", "-"):
                 smallest_length = 4
 
                 if mentions_month:
@@ -264,14 +263,16 @@ def text_to_edtf_date(text):
 
             # end dateutil post-parsing
 
-    if is_uncertain:
-        result += "?"
-
-    if is_approximate:
-        result += "~"
+    if is_uncertain and is_approximate:
+        result += "%"
+    else:
+        if is_uncertain:
+            result += "?"
+        if is_approximate:
+            result += "~"
 
     # weed out bad parses
-    if result.startswith("uu-uu"):
+    if result.startswith("XX-XX"):
         return None
 
     return result