michplunkett · michplunkett · Feb 3, 2024 · Feb 3, 2024 · Feb 3, 2024 · Feb 3, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -43,6 +43,14 @@ repos:
           - --profile=black
           - --line-length=80
 
+  - repo: https://github.com/psf/black
+    rev: 23.7.0
+    hooks:
+      - id: black
+        args:
+          - --target-version=py311
+          - --line-length=80
+
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.0.286
     hooks:
@@ -61,11 +69,3 @@ repos:
         args:
           - --ignore=E203,E402,E501,E800,W503,W391,E261
           - --select=B,C,E,F,W,T4,B9
-
-  - repo: https://github.com/psf/black
-    rev: 23.7.0
-    hooks:
-      - id: black
-        args:
-          - --target-version=py311
-          - --line-length=80
diff --git a/Makefile b/Makefile
@@ -1,13 +1,13 @@
 BASEDIR=incident_scraper
 
-default: create-requirements lint
+default: create_requirements lint
 
 .PHONY: lint
 lint:
 	pre-commit run --all-files
 
-.PHONY: create-requirements
-create-requirements:
+.PHONY: create_requirements
+create_requirements:
 	poetry export --without-hashes --format=requirements.txt > requirements.txt
 
 .PHONY: download
@@ -28,12 +28,16 @@ build_model: download
 categorize:
 	python -m incident_scraper categorize
 
-.PHONY: correct-geopt
-correct-geopt:
+.PHONY: correct_geopt
+correct_geopt:
 	python -m incident_scraper correct-geopt
 
-.PHONY: lemmatize-categories
-lemmatize-categories:
+.PHONY: correct_location
+correct_location:
+	python -m incident_scraper correct-location
+
+.PHONY: lemmatize_categories
+lemmatize_categories:
 	python -m incident_scraper lemmatize-categories
 
 .PHONY: seed
@@ -67,7 +71,3 @@ thirty_days:
 .PHONY: test
 test:
 	pytest -vs test/
-
-.PHONY: test-and-fail
-test-and-fail:
-	pytest -vsx test/
diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py
@@ -31,7 +31,10 @@
     UCPD_MDY_KEY_DATE_FORMAT,
     SystemFlags,
 )
-from incident_scraper.utils.functions import parse_scraped_incident_timestamp
+from incident_scraper.utils.functions import (
+    address_correction,
+    parse_scraped_incident_timestamp,
+)
 
 
 # TODO: Chop this up into a service or some other organized structure
@@ -51,6 +54,7 @@ def main():
     subparser.add_parser(SystemFlags.BUILD_MODEL)
     subparser.add_parser(SystemFlags.CATEGORIZE)
     subparser.add_parser(SystemFlags.CORRECT_GEOPT)
+    subparser.add_parser(SystemFlags.CORRECT_LOCATION)
     subparser.add_parser(SystemFlags.DOWNLOAD)
     subparser.add_parser(SystemFlags.LEMMATIZE_CATEGORIES)
     subparser.add_parser(SystemFlags.SEED)
@@ -70,7 +74,9 @@ def main():
         case SystemFlags.CATEGORIZE:
             categorize_information(nbd_client)
         case SystemFlags.CORRECT_GEOPT:
-            categorize_information(nbd_client)
+            correct_coordinates(nbd_client)
+        case SystemFlags.CORRECT_LOCATION:
+            correct_location(nbd_client)
         case SystemFlags.DAYS_BACK:
             incidents = scraper.scrape_last_days(args.days)
         case SystemFlags.DOWNLOAD:
@@ -112,7 +118,7 @@ def categorize_information(nbd_client: GoogleNBD) -> None:
     nbd_client.update_list_of_incidents(incidents)
 
 
-def correct_location_information(nbd_client: GoogleNBD) -> None:
+def correct_coordinates(nbd_client: GoogleNBD) -> None:
     incidents = nbd_client.get_all_incidents()
     logging.info(f"{len(incidents)} incidents fetched.")
     incidents = [i for i in incidents if i.validated_location.latitude < 0.0]
@@ -132,6 +138,74 @@ def correct_location_information(nbd_client: GoogleNBD) -> None:
     logging.info(f"{len(incidents)} incorrect incident GeoPts were updated.")
 
     nbd_client.update_list_of_incidents(incidents)
+    incidents = nbd_client.get_all_incidents()
+
+    # NOTE(review): `address` is read straight from `i.location`, so the
+    # comparison below checks the value against itself. As written, no
+    # incident is ever logged and the summary always reports 0 corrections,
+    # at the cost of a second full fetch. This looks like leftover scaffolding
+    # for the separate correct-location command — confirm, then either drop
+    # this tail or run the location through the address-correction helper.
+    corrected_locations = 0
+    for i in incidents:
+        address = i.location
+        if address != i.location:
+            logging.info(f"{i.location} changed to {address}")
+            corrected_locations += 1
+
+    logging.info(
+        f"{corrected_locations} of {len(incidents)} "
+        "had their addressed updated."
+    )
+
+
+def correct_location(nbd_client: GoogleNBD) -> None:
+    """
+    Re-format stored incident locations and re-geocode the ones that changed.
+
+    All incidents are fetched from ``nbd_client``. Placeholder locations
+    ("Unknown", "Campus", "Metra Train") are skipped. Every other location is
+    passed through ``address_correction``; when that changes the text, the
+    part before any " (" is sent to the ``Geocoder``. Incidents whose lookup
+    succeeds with in-range coordinates receive a new validated address and
+    GeoPt and are written back in a single batch. Lookups that fail are
+    logged and the incident is left out of the batch.
+    """
+    geocoder = Geocoder()
+    incidents = nbd_client.get_all_incidents()
+
+    non_addresses = ("Unknown", "Campus", "Metra Train")
+    updated_incidents: list[Incident] = []
+    for i in incidents:
+        if i.location in non_addresses:
+            continue
+
+        fmt_address = address_correction(i.location)
+        if fmt_address == i.location:
+            continue
+
+        logging.info(f"{i.location} changed to {fmt_address}")
+        i.location = fmt_address
+
+        # Only the text before the parenthetical is handed to the geocoder.
+        # split() returns the whole string when " (" is absent.
+        street_address = fmt_address.split(" (")[0]
+
+        # NOTE(review): the placeholder entry is kept from the original;
+        # presumably the geocoder expects a non-empty dict — confirm.
+        i_dict: dict[str, Any] = {"dummy_key": True}
+        if (
+            geocoder.get_address_information(street_address, i_dict)
+            and INCIDENT_KEY_ADDRESS in i_dict
+            and -90.0 <= i_dict[INCIDENT_KEY_LATITUDE] <= 90.0
+            # Longitude spans +/-180 degrees; only latitude is capped at 90.
+            and -180.0 <= i_dict[INCIDENT_KEY_LONGITUDE] <= 180.0
+        ):
+            i.validated_address = i_dict[INCIDENT_KEY_ADDRESS]
+            i.validated_location = GeoPt(
+                i_dict[INCIDENT_KEY_LATITUDE],
+                i_dict[INCIDENT_KEY_LONGITUDE],
+            )
+            updated_incidents.append(i)
+        else:
+            logging.error(
+                "This incident failed to get a valid location with the "
+                f"Geocoder: {i}"
+            )
+
+    logging.info(
+        f"{len(updated_incidents)} of {len(incidents)} "
+        "had their address updated."
+    )
+
+    nbd_client.update_list_of_incidents(updated_incidents)
+
+    logging.info(f"{len(updated_incidents)} addresses were updated.")
 
 
 def lemmatize_categories(nbd_client: GoogleNBD) -> None:
@@ -198,11 +272,16 @@ def parse_and_save_records(
                 continue
 
             i[INCIDENT_KEY_ID] = key
+
+            i[INCIDENT_KEY_LOCATION] = address_correction(
+                i[INCIDENT_KEY_LOCATION]
+            )
+
             address = (
                 i[INCIDENT_KEY_LOCATION].split(" (")[0]
                 if "(" in i[INCIDENT_KEY_LOCATION]
                 else i[INCIDENT_KEY_LOCATION]
-            ).replace("&", "and")
+            )
 
             i[INCIDENT_KEY_REPORTED] = i[INCIDENT_KEY_REPORTED].replace(
                 ";", ":"

diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py
@@ -12,6 +12,7 @@
     INCIDENT_KEY_LATITUDE,
     INCIDENT_KEY_LONGITUDE,
     LOCATION_CHICAGO,
+    LOCATION_HYDE_PARK,
     LOCATION_ILLINOIS,
     LOCATION_US,
 )
@@ -38,6 +39,8 @@ def get_address_information(self, address: str, i_dict: dict) -> bool:
             INCIDENT_KEY_ADDRESS not in i_dict
             and "between" not in address
             and " and " not in address
+            and " to " not in address
+            and " at " not in address
         ):
             self._get_address_from_cache(
                 i_dict, self._get_address_from_census(address)
@@ -99,7 +102,7 @@ def _get_address_from_google(self, address: str) -> dict:
             [address],
             # Enable Coding Accuracy Support System
             enableUspsCass=True,
-            locality=LOCATION_CHICAGO,
+            locality=LOCATION_HYDE_PARK,
             regionCode=LOCATION_US,
         )
 

diff --git a/incident_scraper/utils/constants.py b/incident_scraper/utils/constants.py
@@ -46,6 +46,7 @@
 
 # Location Constants
 LOCATION_CHICAGO = "Chicago"
+LOCATION_HYDE_PARK = "Hyde Park, Chicago"
 LOCATION_ILLINOIS = "IL"
 LOCATION_US = "US"
 TIMEZONE_KEY_CHICAGO = f"America/{LOCATION_CHICAGO}"
@@ -57,6 +58,7 @@ class SystemFlags:
     BUILD_MODEL = "build-model"
     CATEGORIZE = "categorize"
     CORRECT_GEOPT = "correct-geopt"
+    CORRECT_LOCATION = "correct-location"
     DAYS_BACK = "days-back"
     DOWNLOAD = "download"
     LEMMATIZE_CATEGORIES = "lemmatize-categories"

diff --git a/incident_scraper/utils/functions.py b/incident_scraper/utils/functions.py
@@ -1,13 +1,84 @@
 import re
 from datetime import datetime
-from typing import Optional
+from typing import Optional, Tuple
 
 from incident_scraper.utils.constants import (
     INCIDENT_KEY_REPORTED,
     UCPD_DATE_FORMATS,
 )
 
 
+def create_street_tuple(
+    street: str, blvd: bool = False
+) -> Tuple[str, str, str]:
+    """
+    Build the three spellings of a street used when correcting addresses.
+
+    Returns ``(street, "S. <street>", "S. <street> <type>")`` where the type
+    is "Blvd." when ``blvd`` is true and "Ave." otherwise.
+    """
+    if blvd:
+        suffix = "Blvd."
+    else:
+        suffix = "Ave."
+
+    prefixed = "S. " + street
+    return (street, prefixed, prefixed + " " + suffix)
+
+
+# Streets whose full form ends in "Blvd."; every other street gets "Ave.".
+_BOULEVARDS = ("Hyde Park", "Oakwood")
+
+# One (bare name, "S."-prefixed name, prefixed name with street type) tuple
+# per street, in the order the corrections are applied.
+STREET_CORRECTIONS = [
+    create_street_tuple(street_name, blvd=street_name in _BOULEVARDS)
+    for street_name in (
+        "Blackstone",
+        "Cottage Grove",
+        "Cornell",
+        "Dorchester",
+        "Drexel",
+        "East End",
+        "Ellis",
+        "Everett",
+        "Greenwood",
+        "Harper",
+        "Hyde Park",
+        "Ingleside",
+        "Kenwood",
+        "Kimbark",
+        "Lake Park",
+        "Maryland",
+        "Oakenwald",
+        "Oakwood",
+        "Stony Island",
+        "University",
+        "Woodlawn",
+    )
+]
+
+
+# Negative lookahead: fails when a street-type word (any letter case, with or
+# without a trailing period) already follows, so no second type is appended.
+_NO_STREET_TYPE_AHEAD = (
+    r"(?!\s+(?i:St|Street|Pl|Place|Ave|Avenue|Blvd|Boulevard"
+    r"|Ct|Court|Dr|Drive)\b)"
+)
+
+
+def _add_direction(text: str, name: str, dir_name: str) -> str:
+    """Replace whole-word ``name`` with ``dir_name`` unless already prefixed."""
+    # The lookbehind skips names that already carry "N. ", "S. ", "E. ", "W. ".
+    pattern = rf"(?<![NSEW]\. )\b{re.escape(name)}\b"
+    return re.sub(pattern, lambda _: dir_name, text)
+
+
+def _add_street_type(text: str, dir_name: str, full_name: str) -> str:
+    """Replace ``dir_name`` with ``full_name`` unless a street type follows."""
+    pattern = rf"{re.escape(dir_name)}\b{_NO_STREET_TYPE_AHEAD}"
+    return re.sub(pattern, lambda _: full_name, text)
+
+
+def address_correction(address: str) -> str:
+    """
+    Normalize a scraped location string into a consistent street format.
+
+    Steps, in order:
+
+    1. Replace "&" with "and", capitalize the lowercase abbreviations
+       " s. ", " e. ", " st. " and " pl. ", expand " Midway Pl. " to
+       " Midway Plaisance ", and collapse runs of whitespace.
+    2. In the text before the first " (" only, give the numbered streets
+       37th through 65th an "E." prefix and a "St." type, and give every
+       street in ``STREET_CORRECTIONS`` its "S." prefix and street type.
+
+    A prefix is not added where a direction ("N.", "S.", "E.", "W.") already
+    precedes the name, and a type is not added where one already follows it,
+    so calling the function on its own output returns the same string.
+    The parenthetical after " (" is left alone: callers strip it before
+    geocoding, and rewriting street names inside it would corrupt place
+    names that merely contain a street name.
+    """
+    address = (
+        address.replace("&", "and")
+        .replace(" s. ", " S. ")
+        .replace(" e. ", " E. ")
+        .replace(" st. ", " St. ")
+        .replace(" pl. ", " Pl. ")
+        .replace(" Midway Pl. ", " Midway Plaisance ")
+    )
+
+    address = re.sub(r"\s{2,}", " ", address)
+
+    street_part, paren_sep, place_name = address.partition(" (")
+
+    for number in range(37, 66):
+        ordinal = make_ordinal(number)
+        dir_ordinal = f"E. {ordinal}"
+        street_part = _add_direction(street_part, ordinal, dir_ordinal)
+        street_part = _add_street_type(
+            street_part, dir_ordinal, f"{dir_ordinal} St."
+        )
+
+    for name, dir_name, full_name in STREET_CORRECTIONS:
+        street_part = _add_direction(street_part, name, dir_name)
+        street_part = _add_street_type(street_part, dir_name, full_name)
+
+    return street_part + paren_sep + place_name
+
+
 # Source: https://www.geeksforgeeks.org/convert-string-to-title-case-in-python/
 def custom_title_case(input_string: str) -> str:
     # List of articles.
@@ -66,6 +137,23 @@ def custom_title_case(input_string: str) -> str:
     return " ".join(output_list)
 
 
+# Source: https://stackoverflow.com/a/50992575
+def make_ordinal(n: int) -> str:
+    """
+    Convert an integer into its ordinal representation::
+
+        make_ordinal(0)   => '0th'
+        make_ordinal(3)   => '3rd'
+        make_ordinal(122) => '122nd'
+        make_ordinal(213) => '213th'
+        make_ordinal(-1)  => '-1st'
+
+    The suffix is chosen from the magnitude of ``n``. Python's ``%`` never
+    returns a negative result, so taking it on a negative ``n`` directly
+    would pick the suffix for a different last digit.
+    """
+    magnitude = abs(n)
+    # 11, 12 and 13 (also 111, 212, ...) take "th" despite their last digit.
+    if 11 <= (magnitude % 100) <= 13:
+        suffix = "th"
+    else:
+        # Last digits 4 through 9 all map onto the final "th" entry.
+        suffix = ["th", "st", "nd", "rd", "th"][min(magnitude % 10, 4)]
+    return str(n) + suffix
+
+
 def parse_scraped_incident_timestamp(i: dict) -> Optional[str]:
     result = None