Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update date parsing #94

Merged
merged 5 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 42 additions & 61 deletions asyncwhois/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,8 @@
from datetime import datetime
from typing import Dict, List, Any, Union


# Date formats from richardpenman/pywhois
KNOWN_DATE_FORMATS = [
"%d-%b-%Y", # 02-jan-2000
"%d-%B-%Y", # 11-February-2000
"%d-%m-%Y", # 20-10-2000
"%Y-%m-%d", # 2000-01-02
"%d.%m.%Y", # 2.1.2000
"%Y.%m.%d", # 2000.01.02
"%Y/%m/%d", # 2000/01/02
"%Y/%m/%d %H:%M:%S", # 2011/06/01 01:05:01
"%Y/%m/%d %H:%M:%S (%z)", # 2011/06/01 01:05:01 (+0900)
"%Y%m%d", # 20170209
"%Y%m%d %H:%M:%S", # 20110908 14:44:51
"%d/%m/%Y", # 02/01/2013
"%Y. %m. %d.", # 2000. 01. 02.
"%Y.%m.%d %H:%M:%S", # 2014.03.08 10:28:24
"%d-%b-%Y %H:%M:%S %Z", # 24-Jul-2009 13:20:03 UTC
"%a %b %d %H:%M:%S %Z %Y", # Tue Jun 21 23:59:59 GMT 2011
"%Y-%m-%dT%H:%M:%S", # 2007-01-26T19:10:31
"%Y-%m-%dT%H:%M:%SZ", # 2007-01-26T19:10:31Z
"%Y-%m-%dT%H:%M:%SZ[%Z]", # 2007-01-26T19:10:31Z[UTC]
"%Y-%m-%dT%H:%M:%S.%fZ", # 2018-12-01T16:17:30.568Z
"%Y-%m-%dT%H:%M:%S.%f%z", # 2011-09-08T14:44:51.622265+03:00
"%Y-%m-%dT%H:%M:%S%z", # 2013-12-06T08:17:22-0800
"%Y-%m-%dT%H:%M:%S%zZ", # 1970-01-01T02:00:00+02:00Z
"%Y-%m-%dt%H:%M:%S.%f", # 2011-09-08t14:44:51.622265
"%Y-%m-%dt%H:%M:%S", # 2007-01-26T19:10:31
"%Y-%m-%dt%H:%M:%SZ", # 2007-01-26T19:10:31Z
"%Y-%m-%dt%H:%M:%S.%fz", # 2007-01-26t19:10:31.00z
"%Y-%m-%dt%H:%M:%S%z", # 2011-03-30T19:36:27+0200
"%Y-%m-%dt%H:%M:%S.%f%z", # 2011-09-08T14:44:51.622265+03:00
"%Y-%m-%d %H:%M:%SZ", # 2000-08-22 18:55:20Z
"%Y-%m-%d %H:%M:%S", # 2000-08-22 18:55:20
"%d %b %Y %H:%M:%S", # 08 Apr 2013 05:44:00
"%d/%m/%Y %H:%M:%S", # 23/04/2015 12:00:07 EEST
"%d/%m/%Y %H:%M:%S %Z", # 23/04/2015 12:00:07 EEST
"%d/%m/%Y %H:%M:%S.%f %Z", # 23/04/2015 12:00:07.619546 EEST
"%Y-%m-%d %H:%M:%S.%f", # 23/04/2015 12:00:07.619546
"%B %d %Y", # August 14 2017
"%d.%m.%Y %H:%M:%S", # 08.03.2014 10:28:24
"%a %b %d %Y", # Tue Dec 12 2000
"before %b-%Y", # before aug-1996
"%Y-%m-%d %H:%M:%S (%Z%z)", # 2017-09-26 11:38:29 (GMT+00:00)
]
from dateutil.parser import parse, ParserError
from dateutil import tz


class TLDBaseKeys(str, Enum):
Expand Down Expand Up @@ -239,22 +196,31 @@ class BaseParser:
date_keys = ()
multiple_match_keys = ()

# For handling special cases in TLD parser classes
known_date_formats = []
# Extra formats that dateutil might not figure out
extra_date_formats = [
"%Y-%m-%dT%H:%M:%SZ[%Z]", # 2007-01-26T19:10:31Z[UTC]
"%Y-%m-%dT%H:%M:%S.%fZ", # 2018-12-01T16:17:30.568Z
"%Y-%m-%dT%H:%M:%S%zZ", # 1970-01-01T02:00:00+02:00Z
"%Y-%m-%dt%H:%M:%S.%fz", # 2007-01-26t19:10:31.00z
"%Y-%m-%d %H:%M:%SZ", # 2000-08-22 18:55:20Z
"before %b-%Y", # before aug-1996
]
# Additional timezone info for dateutil
timezone_info = {
"KST": tz.gettz("Asia/Seoul"), # Korea Standard Time UTC+9
"JST": tz.gettz("Asia/Tokyo"), # Japan Standard Time UTC+9
"EEST": tz.gettz("Europe/Athens"), # Eastern European Summertime UTC+3
}

def update_reg_expressions(self, expressions_update: Dict[str, Any]) -> None:
"""
Updates the `reg_expressions` dictionary
:param expressions_update: dictionary of keys/regexes to update
:param expressions_update: dict of keys/regexes to update
"""
self.reg_expressions.update(expressions_update)

@staticmethod
def _parse_date_mdY(date_string: str) -> datetime:
date_string = date_string.rstrip()
try:
# This date format conflicts with "%d/%m/%Y" date format in `KNOWN_DATE_FORMATS`
return datetime.strptime(date_string, "%m/%d/%Y")
except ValueError:
return date_string or None

def parse(self, blob: str) -> Dict[Union[IPBaseKeys, TLDBaseKeys], Any]:
"""
Iterates over the `reg_expressions` dictionary attempting to use each regex to extract values
Expand Down Expand Up @@ -333,20 +299,35 @@ def find_multiline_match(self, start: str, blob: str) -> List[str]:
matches = self._process_many(multiline_match.group(1))
return matches

@staticmethod
def _parse_date(date_string: str) -> Union[datetime, str]:
def _parse_date(self, date_string: str) -> Union[datetime, str]:
"""
Attempts to convert the given date string to a datetime.datetime object
otherwise returns the input `date_string`
:param date_string: a date string
:return: a datetime.datetime object
"""
for date_format in KNOWN_DATE_FORMATS:

def _datetime_or_none(dt_string: str, dt_format: str) -> Union[datetime, None]:
try:
date = datetime.strptime(date_string, date_format)
return date
return datetime.strptime(dt_string, dt_format)
except ValueError:
continue
return None

# first, try the known formats
for date_format in self.known_date_formats:
if date := _datetime_or_none(date_string, date_format):
return date
# next, try dateutil.parse
try:
clean_date_string = re.sub(r"\(([^)]+)\)", r"\1", date_string).strip()
return parse(clean_date_string, tzinfos=self.timezone_info)
except ParserError:
pass
# finally, try extra formats
for date_format in self.extra_date_formats:
if date := _datetime_or_none(date_string, date_format):
return date
# no luck parsing
return date_string

def _process_many(self, match: str) -> List[str]:
Expand Down
77 changes: 9 additions & 68 deletions asyncwhois/tldparsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,7 @@ class RegexFI(TLDParser):
TLDBaseKeys.DNSSEC: r"dnssec\.*: *([\S]+)",
TLDBaseKeys.REGISTRAR: r"registrar\.*:\s(.+)",
}
known_date_formats = ["%d.%m.%Y"]


class RegexNU(TLDParser):
Expand Down Expand Up @@ -692,6 +693,7 @@ class RegexCZ(TLDParser):
TLDBaseKeys.EXPIRES: r"expire: *(.+)",
TLDBaseKeys.NAME_SERVERS: r"nserver: *(.+)",
}
known_date_formats = ["%d.%m.%Y %H:%M:%S", "%d.%m.%Y"]

def parse(self, blob: str) -> dict[str, Any]:
parsed_output = super().parse(blob)
Expand Down Expand Up @@ -766,52 +768,6 @@ class RegexUA(TLDParser):
TLDBaseKeys.NAME_SERVERS: "nserver: *(.+)",
}

KNOWN_DATE_FORMATS = [
"%Y-%m-%d %H:%M:%S%z",
]

@staticmethod
def _fix_timezone(date_string: str) -> str:
"""
Fix timezone format for datetime.strptime

:param date_string: date string
:return: fixed date string

>>> RegexUA._fix_timezone("2023-02-17 14:22:06+02")
'2023-02-17 14:22:06+0200'
>>>
>>> RegexUA._fix_timezone("2023-02-17 14:22:06+2")
'2023-02-17 14:22:06+0200'
>>>
>>> RegexUA._fix_timezone("2023-02-17 14:22:06+0200")
'2023-02-17 14:22:06+0200'
>>>

"""
if "+" in date_string:
date_time, timezone = date_string.split("+")
if timezone.isdigit() and len(timezone) <= 2:
date_string = f"{date_time}+{int(timezone):02d}00"

return date_string

@staticmethod
def _parse_date(date_string: str) -> Union[datetime, str]:
date = TLDParser._parse_date(date_string)
if isinstance(date, datetime):
return date

date_string = RegexUA._fix_timezone(date_string)
for date_format in RegexUA.KNOWN_DATE_FORMATS:
try:
date = datetime.strptime(date_string, date_format)
return date
except ValueError:
pass

return date_string


class RegexCN(TLDParser):
tld_specific_expressions: ExpressionDict = {
Expand Down Expand Up @@ -876,6 +832,7 @@ class RegexVE(TLDParser): # double check
TLDBaseKeys.REGISTRANT_COUNTRY: r"(?:address:.+\n){4}address: *(.+)",
TLDBaseKeys.REGISTRANT_ORGANIZATION: r"org: *(.+)",
}
known_date_formats = ["%d.%m.%Y %H:%M:%S", "%d.%m.%Y"]


class RegexAE(TLDParser):
Expand Down Expand Up @@ -955,6 +912,8 @@ class RegexIR(TLDParser):

class RegexTK(TLDParser):
tld_specific_expressions: ExpressionDict = {
TLDBaseKeys.CREATED: r"Domain registered: *(.+)",
TLDBaseKeys.EXPIRES: r"Record will expire on: *(.+)",
TLDBaseKeys.REGISTRANT_ORGANIZATION: r"(?<=Owner contact)[\s\S]*?Organization:(.*)",
TLDBaseKeys.REGISTRANT_NAME: r"(?<=Owner contact)[\s\S]*?Name:(.*)",
TLDBaseKeys.REGISTRANT_ADDRESS: r"(?<=Owner contact)[\s\S]*?Address:(.*)",
Expand Down Expand Up @@ -992,25 +951,14 @@ class RegexTK(TLDParser):
TLDBaseKeys.TECH_FAX: r"(?<=Tech contact)[\s\S]*?Fax:(.*)",
TLDBaseKeys.TECH_PHONE: r"(?<=Tech contact)[\s\S]*?Phone:(.*)",
}
known_date_formats = ["%m/%d/%Y"]

def parse(self, blob: str) -> dict[str, Any]:
parsed_output = super().parse(blob)
# handle multiline nameservers
parsed_output[TLDBaseKeys.NAME_SERVERS] = self.find_multiline_match(
"Domain nameservers:", blob
)
# a date parser exists for '%d/%m/%Y', but this interferes with the parser needed
# for this one, which is '%m/%d/%Y', so this date format needs to be parsed separately here
created_match = re.search(r"Domain registered: *(.+)", blob, re.IGNORECASE)
if created_match:
parsed_output[TLDBaseKeys.CREATED] = self._parse_date_mdY(
created_match.group(1)
)
expires_match = re.search(r"Record will expire on: *(.+)", blob, re.IGNORECASE)
if expires_match:
parsed_output[TLDBaseKeys.EXPIRES] = self._parse_date_mdY(
expires_match.group(1)
)
# Check if "Status" is inline with Domain Name. For example:
# Domain Name:
# GOOGLE.TK is Active
Expand Down Expand Up @@ -1134,7 +1082,7 @@ def parse(self, blob: str) -> dict[str, Any]:
parsed_output = super().parse(blob)
# parse created date
created_match = parsed_output.get(
"created"
TLDBaseKeys.CREATED
) # looks like 30th April 2003; need to remove day suffix
if created_match and isinstance(created_match, str):
date_string = re.sub(r"(\d)(st|nd|rd|th)", r"\1", created_match)
Expand Down Expand Up @@ -1178,6 +1126,7 @@ class RegexAX(TLDParser):
TLDBaseKeys.STATUS: r"status\.+: *(.+)",
TLDBaseKeys.NAME_SERVERS: r"nserver\.+: *(.+)",
}
known_date_formats = ["%d.%m.%Y"]

def parse(self, blob: str) -> dict[str, Any]:
parsed_output = super().parse(blob)
Expand Down Expand Up @@ -1320,19 +1269,11 @@ class RegexGA(TLDParser):
TLDBaseKeys.TECH_FAX: r"(?<=Tech contact)[\s\S]*?Fax:(.*)",
TLDBaseKeys.TECH_PHONE: r"(?<=Tech contact)[\s\S]*?Phone:(.*)",
}
known_date_formats = ["%m/%d/%Y"]

def parse(self, blob: str) -> dict[str, Any]:
output = super().parse(blob)
output[TLDBaseKeys.NAME_SERVERS] = self.find_multiline_match(
"Domain Nameservers:", blob
)
# date format is m/d/Y
created = output.get(TLDBaseKeys.CREATED)
if created:
output[TLDBaseKeys.CREATED] = self._parse_date_mdY(created)

expires = output.get(TLDBaseKeys.EXPIRES)
if expires:
output[TLDBaseKeys.EXPIRES] = self._parse_date_mdY(expires)

return output
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ classifiers = [
dynamic = ["version"]
license = {text = "MIT License"}
dependencies = [
"python-dateutil>=2.9.0",
"python-socks[asyncio]>=2.0.2",
"tldextract>=3.2.0",
"whodap>=0.1.10"
"whodap>=0.1.12"
]

[project.urls]
Expand Down
62 changes: 62 additions & 0 deletions tests/test_dateparsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from datetime import datetime

from asyncwhois.parse import BaseParser


def test_dateparsers(): # noqa
date_and_time_examples = [
"2010-07-04 04:18:23 +03:00",
"2024-02-19 01:30:15.927683+11",
"2008-08-31 04:14:06 KST",
"2024/01/01 01:05:04 (JST)",
"07 Aug 2024",
]
date_and_time_examples += [
"02-jan-2000",
"11-February-2000",
"20-10-2000",
"2000-01-02",
"2.1.2000",
"2000.01.02",
"2000/01/02",
"2011/06/01 01:05:01",
"2011/06/01 01:05:01 (+0900)",
"20170209",
"20110908 14:44:51",
"02/01/2013",
"2000. 01. 02.",
"2014.03.08 10:28:24",
"24-Jul-2009 13:20:03 UTC",
"Tue Jun 21 23:59:59 GMT 2011",
"2007-01-26T19:10:31",
"2007-01-26T19:10:31Z",
"2007-01-26T19:10:31Z[UTC]", # extra
"2018-12-01T16:17:30.568Z", # extra
"2011-09-08T14:44:51.622265+03:00",
"2013-12-06T08:17:22-0800",
"1970-01-01T02:00:00+02:00Z", # extra
"2011-09-08t14:44:51.622265",
"2007-01-26T19:10:31",
"2007-01-26T19:10:31Z",
"2007-01-26t19:10:31.00z", # extra
"2011-03-30T19:36:27+0200",
"2011-09-08T14:44:51.622265+03:00",
"2000-08-22 18:55:20Z", # extra
"2000-08-22 18:55:20",
"08 Apr 2013 05:44:00",
"23/04/2015 12:00:07",
"23/04/2015 12:00:07 EEST",
"23/04/2015 12:00:07.619546 EEST",
"2015-04-23 12:00:07.619546",
"August 14 2017",
"08.03.2014 10:28:24",
"Tue Dec 12 2000",
"before aug-1996", # extra
"2017-09-26 11:38:29 (GMT+00:00)",
]

bp = BaseParser()

for dt in date_and_time_examples:
result = bp._parse_date(dt)
assert isinstance(result, datetime), f"Failed to parse date string: {dt}"
9 changes: 6 additions & 3 deletions tests/test_not_found.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
import sys

import pytest
import asyncwhois
from asyncwhois.errors import NotFoundError

if sys.version_info >= (3, 8):
from unittest import IsolatedAsyncioTestCase

class TestLookupNotFound(IsolatedAsyncioTestCase):
@pytest.mark.skip(reason="this is failing on github actions")
async def test_not_found_aio(self):
domain = "some-non-existent-domain123.com"
with self.assertRaises(NotFoundError):
await asyncwhois.aio_whois_domain(domain)
await asyncwhois.aio_whois(domain)

@pytest.mark.skip(reason="this is failing on github actions")
def test_not_found(self):
domain = "some-non-existent-domain123.com"
with self.assertRaises(NotFoundError):
asyncwhois.whois_domain(domain)
asyncwhois.whois(domain)

asyncwhois.whois_domain(domain, ignore_not_found=True)
asyncwhois.whois(domain, ignore_not_found=True)
2 changes: 1 addition & 1 deletion tests/test_parser_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def test_parse_dates(self):
]

for date_string in date_strings:
formatted_date = BaseParser._parse_date(date_string)
formatted_date = BaseParser()._parse_date(date_string)
self.assertIsInstance(formatted_date, datetime.datetime)

def test_find_match(self):
Expand Down
Loading
Loading