Skip to content

Commit

Permalink
Merge pull request #48 from artshumrc/natural-language
Browse files Browse the repository at this point in the history
Update natural language parser
  • Loading branch information
aweakley committed May 22, 2024
2 parents ee1d07d + 26b0afb commit 077eac5
Show file tree
Hide file tree
Showing 5 changed files with 188 additions and 168 deletions.
50 changes: 20 additions & 30 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,8 @@ Test coverage includes every example given in the spec table of features.

* Years exceeding four digits:

>>> parse_edtf('y-12000') # 12000 years BCE
LongYear: 'y-12000'
>>> parse_edtf('Y-12000') # 12000 years BCE
LongYear: 'Y-12000'

* Season:

Expand Down Expand Up @@ -167,8 +167,8 @@ Test coverage includes every example given in the spec table of features.

* Year requiring more than 4 digits - exponential form:

>>> parse_edtf('y-17e7')
ExponentialYear: 'y-17e7'
>>> parse_edtf('Y-17e7')
ExponentialYear: 'Y-17e7'

### Natural language representation

Expand Down Expand Up @@ -196,43 +196,33 @@ The parser can parse strings such as:
'c.1860' => '1860~' # with or without the trailing '.'
'ca1860' => '1860~'
'approx 1860' => '1860~'

# masked precision
'1860s' => '186x' #186x has decade precision, 186u has year precision.
'1800s' => '18xx' # without uncertainty indicators, assume century

# masked precision + uncertainty
'ca. 1860s' => '186x~'
'circa 1840s' => '184x~'
'ca. 1860s?' => '186x?~'
'c1800s?' => '180x?~' # with uncertainty indicators, use the decade
'ca. 1860s' => '186X~'
'circa 1840s' => '184X~'
'ca. 1860s?' => '186X%' # uncertain and approximate combine into '%'
'c1800s?' => '180X%' # with uncertainty indicators, use the decade

# unspecified parts
'January 12' => 'XXXX-01-12'
'January' => 'XXXX-01'
'7/2008' => '2008-07'
'month in 1872' => '1872-XX'
'day in January 1872' => '1872-01-XX'
'day in 1872' => '1872-XX-XX'

#seasons
'Autumn 1872' => '1872-23'
'Fall 1872' => '1872-23'

# before/after
'earlier than 1928' => 'unknown/1928'
'later than 1928' => '1928/unknown'
'before January 1928' => 'unknown/1928-01'
'after about the 1920s' => '192x~/unknown'

# unspecified
'year in the 1860s' => '186u' #186x has decade precision, 186u has year precision.
('year in the 1800s', '18xu')
'month in 1872' => '1872-XX'
'day in January 1872' => '1872-01-XX'
'day in 1872' => '1872-XX-XX'
'earlier than 1928' => '/1928'
'later than 1928' => '1928/'
'before January 1928' => '/1928-01'
'after about the 1920s' => '192X~/'

#centuries
'1st century' => '00xx'
'10c' => '09xx'
'19th century?' => '18xx?'
'1st century' => '00XX'
'10c' => '09XX'
'19th century?' => '18XX?'

# just showing off now...
'a day in about Spring 1849?' => '1849-21-XX%'
Expand All @@ -243,8 +233,8 @@ The parser can parse strings such as:
'1851-1852; printed 1853-1854' => '1851/1852'
'1851-52' => '1851/1852'
'1856-ca. 1865' => '1856/1865~'
'1860s-1870s' => '186x/187x'
'1920s -early 1930s' => '192x/193x'
'1860s-1870s' => '186X/187X'
'1920s - early 1930s' => '192X/193X'
'1938, printed 1940s-1950s' => '1938'


Expand Down
39 changes: 20 additions & 19 deletions edtf/natlang/en.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
DEFAULT_DATE_1 = datetime(1234, 1, 1, 0, 0)
DEFAULT_DATE_2 = datetime(5678, 10, 10, 0, 0)

SHORT_YEAR_RE = r"(-?)([\du])([\dxu])([\dxu])([\dxu])"
LONG_YEAR_RE = r"y(-?)([1-9]\d\d\d\d+)"
SHORT_YEAR_RE = r"(-?)([\dX])([\dX])([\dX])([\dX])"
LONG_YEAR_RE = r"Y(-?)([1-9]\d\d\d\d+)"
CENTURY_RE = r"(\d{1,2})(c\.?|(st|nd|rd|th) century)\s?(ad|ce|bc|bce)?"
CE_RE = r"(\d{1,4}) (ad|ce|bc|bce)"

Expand All @@ -31,7 +31,7 @@ def text_to_edtf(text):
Generate EDTF string equivalent of a given natural language date string.
"""
if not text:
return
return None

t = text.lower()

Expand Down Expand Up @@ -101,10 +101,9 @@ def text_to_edtf(text):
is_after = is_after or re.findall(r"\blater\b", t)

if is_before:
result = f"unknown/{result}"
result = f"/{result}" # unknown is replaced with null for intervals
elif is_after:
result = f"{result}/unknown"

result = f"{result}/" # unknown is replaced with null for intervals
return result


Expand Down Expand Up @@ -155,7 +154,7 @@ def text_to_edtf_date(text):
# detect CE/BCE year form
is_ce = re.findall(CE_RE, t)
if is_century:
result = "%02dxx" % (int(is_century[0][0]) - 1,)
result = "%02dXX" % (int(is_century[0][0]) - 1,)
is_approximate = is_approximate or re.findall(r"\b(ca?\.?) ?" + CENTURY_RE, t)
is_uncertain = is_uncertain or re.findall(CENTURY_RE + r"\?", t)

Expand Down Expand Up @@ -222,25 +221,25 @@ def text_to_edtf_date(text):
# approximate/uncertain markers to decide whether we treat it as
# a century or a decade.
if i == 2 and could_be_century and not (is_approximate or is_uncertain):
result += "x"
result += "X"
elif i == 3 and is_decade > 0:
if mentions_year:
result += "u" # year precision
result += "X" # previously year precision - now just X
else:
result += "x" # decade precision
result += "X" # previously decade precision - now just X
elif date1[i] == date2[i]:
# since both attempts at parsing produced the same result,
# it must be a parsed value, not a default
result += date1[i]
else:
# different values were produced, meaning that it's likely
# a default. Use 'unspecified'
result += "u"
# a default. Use 'X'
result += "X"

# strip off unknown chars from end of string - except the first 4

for i in reversed(xrange(len(result))):
if result[i] not in ("u", "x", "-"):
if result[i] not in ("X", "-"):
smallest_length = 4

if mentions_month:
Expand All @@ -264,14 +263,16 @@ def text_to_edtf_date(text):

# end dateutil post-parsing

if is_uncertain:
result += "?"

if is_approximate:
result += "~"
if is_uncertain and is_approximate:
result += "%"
else:
if is_uncertain:
result += "?"
if is_approximate:
result += "~"

# weed out bad parses
if result.startswith("uu-uu"):
if result.startswith("XX-XX"):
return None

return result
Loading

0 comments on commit 077eac5

Please sign in to comment.