diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 91c64ea..d726d11 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,14 @@ repos: - --profile=black - --line-length=80 + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + args: + - --target-version=py311 + - --line-length=80 + - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.0.286 hooks: @@ -61,11 +69,3 @@ repos: args: - --ignore=E203,E402,E501,E800,W503,W391,E261 - --select=B,C,E,F,W,T4,B9 - - - repo: https://github.com/psf/black - rev: 23.7.0 - hooks: - - id: black - args: - - --target-version=py311 - - --line-length=80 diff --git a/Makefile b/Makefile index 1596850..8c853ef 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,13 @@ BASEDIR=incident_scraper -default: create-requirements lint +default: create_requirements lint .PHONY: lint lint: pre-commit run --all-files -.PHONY: create-requirements -create-requirements: +.PHONY: create_requirements +create_requirements: poetry export --without-hashes --format=requirements.txt > requirements.txt .PHONY: download @@ -28,12 +28,16 @@ build_model: download categorize: python -m incident_scraper categorize -.PHONY: correct-geopt -correct-geopt: +.PHONY: correct_geopt +correct_geopt: python -m incident_scraper correct-geopt -.PHONY: lemmatize-categories -lemmatize-categories: +.PHONY: correct_location +correct_location: + python -m incident_scraper correct-location + +.PHONY: lemmatize_categories +lemmatize_categories: python -m incident_scraper lemmatize-categories .PHONY: seed @@ -67,7 +71,3 @@ thirty_days: .PHONY: test test: pytest -vs test/ - -.PHONY: test-and-fail -test-and-fail: - pytest -vsx test/ diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py index 7480f61..fc957d8 100644 --- a/incident_scraper/__main__.py +++ b/incident_scraper/__main__.py @@ -31,7 +31,10 @@ UCPD_MDY_KEY_DATE_FORMAT, SystemFlags, ) -from incident_scraper.utils.functions import 
parse_scraped_incident_timestamp +from incident_scraper.utils.functions import ( + address_correction, + parse_scraped_incident_timestamp, +) # TODO: Chop this up into a service or some other organized structure @@ -51,6 +54,7 @@ def main(): subparser.add_parser(SystemFlags.BUILD_MODEL) subparser.add_parser(SystemFlags.CATEGORIZE) subparser.add_parser(SystemFlags.CORRECT_GEOPT) + subparser.add_parser(SystemFlags.CORRECT_LOCATION) subparser.add_parser(SystemFlags.DOWNLOAD) subparser.add_parser(SystemFlags.LEMMATIZE_CATEGORIES) subparser.add_parser(SystemFlags.SEED) @@ -70,7 +74,9 @@ def main(): case SystemFlags.CATEGORIZE: categorize_information(nbd_client) case SystemFlags.CORRECT_GEOPT: - categorize_information(nbd_client) + correct_coordinates(nbd_client) + case SystemFlags.CORRECT_LOCATION: + correct_location(nbd_client) case SystemFlags.DAYS_BACK: incidents = scraper.scrape_last_days(args.days) case SystemFlags.DOWNLOAD: @@ -112,7 +118,7 @@ def categorize_information(nbd_client: GoogleNBD) -> None: nbd_client.update_list_of_incidents(incidents) -def correct_location_information(nbd_client: GoogleNBD) -> None: +def correct_coordinates(nbd_client: GoogleNBD) -> None: incidents = nbd_client.get_all_incidents() logging.info(f"{len(incidents)} incidents fetched.") incidents = [i for i in incidents if i.validated_location.latitude < 0.0] @@ -132,6 +138,74 @@ def correct_location_information(nbd_client: GoogleNBD) -> None: logging.info(f"{len(incidents)} incorrect incident GeoPts were updated.") nbd_client.update_list_of_incidents(incidents) + incidents = nbd_client.get_all_incidents() + + corrected_locations = 0 + for i in incidents: + address = i.location + if address != i.location: + logging.info(f"{i.location} changed to {address}") + corrected_locations += 1 + + logging.info( + f"{corrected_locations} of {len(incidents)} " + "had their addressed updated." 
def correct_location(nbd_client: GoogleNBD) -> None:
    """Normalize stored incident locations and re-geocode them.

    Every incident returned by ``nbd_client.get_all_incidents()`` is
    processed, except those whose location is one of the placeholder
    values ``"Unknown"``, ``"Campus"`` or ``"Metra Train"``.  The location
    text is passed through ``address_correction``; the result (minus any
    trailing parenthetical) is handed to the geocoder.  Incidents for which
    the geocoder reports success, supplies an address, and supplies
    in-range coordinates get a new ``validated_address`` and
    ``validated_location`` and are written back in a single
    ``update_list_of_incidents`` call.

    Args:
        nbd_client: Datastore client used to fetch and persist incidents.
    """
    # Placeholder locations that are skipped without geocoding.
    skipped_locations = ("Unknown", "Campus", "Metra Train")

    geocoder = Geocoder()
    incidents = nbd_client.get_all_incidents()

    updated_incidents: list[Incident] = []
    for i in incidents:
        if i.location in skipped_locations:
            continue

        fmt_address = address_correction(i.location)
        if fmt_address != i.location:
            logging.info(f"{i.location} changed to {fmt_address}")
            # NOTE(review): this text correction is only persisted when the
            # geocoding below succeeds, because failed incidents are never
            # added to ``updated_incidents`` -- confirm that is intended.
            i.location = fmt_address

        # Geocode without the trailing " (...)" qualifier, if there is one.
        # ``split`` returns the whole string when the separator is absent.
        fmt_address = fmt_address.split(" (")[0]

        # NOTE(review): presumably seeded with a dummy entry so the dict is
        # non-empty when handed to the geocoder -- confirm against Geocoder.
        i_dict: dict[str, Any] = {"dummy_key": True}
        if (
            geocoder.get_address_information(fmt_address, i_dict)
            and INCIDENT_KEY_ADDRESS in i_dict
            and -90.0 <= i_dict[INCIDENT_KEY_LATITUDE] <= 90.0
            # Longitudes span +/-180 degrees; the latitude bound of 90 was
            # previously reused here.
            and -180.0 <= i_dict[INCIDENT_KEY_LONGITUDE] <= 180.0
        ):
            i.validated_address = i_dict[INCIDENT_KEY_ADDRESS]
            i.validated_location = GeoPt(
                i_dict[INCIDENT_KEY_LATITUDE],
                i_dict[INCIDENT_KEY_LONGITUDE],
            )
            updated_incidents.append(i)
        else:
            logging.error(
                "This incident failed to get a valid location with the "
                f"Geocoder: {i}"
            )

    logging.info(
        f"{len(updated_incidents)} of {len(incidents)} "
        "had their address updated."
    )

    nbd_client.update_list_of_incidents(updated_incidents)

    logging.info(f"{len(updated_incidents)} addresses were updated.")
"correct-geopt" + CORRECT_LOCATION = "correct-location" DAYS_BACK = "days-back" DOWNLOAD = "download" LEMMATIZE_CATEGORIES = "lemmatize-categories" diff --git a/incident_scraper/utils/functions.py b/incident_scraper/utils/functions.py index adc67b6..4aa7cad 100644 --- a/incident_scraper/utils/functions.py +++ b/incident_scraper/utils/functions.py @@ -1,6 +1,6 @@ import re from datetime import datetime -from typing import Optional +from typing import Optional, Tuple from incident_scraper.utils.constants import ( INCIDENT_KEY_REPORTED, @@ -8,6 +8,77 @@ ) +def create_street_tuple( + street: str, blvd: bool = False +) -> Tuple[str, str, str]: + street_type = "Ave." if not blvd else "Blvd." + + return street, f"S. {street}", f"S. {street} {street_type}" + + +STREET_CORRECTIONS = [ + create_street_tuple("Blackstone"), + create_street_tuple("Cottage Grove"), + create_street_tuple("Cornell"), + create_street_tuple("Dorchester"), + create_street_tuple("Drexel"), + create_street_tuple("East End"), + create_street_tuple("Ellis"), + create_street_tuple("Everett"), + create_street_tuple("Greenwood"), + create_street_tuple("Harper"), + create_street_tuple("Hyde Park", blvd=True), + create_street_tuple("Ingleside"), + create_street_tuple("Kenwood"), + create_street_tuple("Kimbark"), + create_street_tuple("Lake Park"), + create_street_tuple("Maryland"), + create_street_tuple("Oakenwald"), + create_street_tuple("Oakwood", blvd=True), + create_street_tuple("Stony Island"), + create_street_tuple("University"), + create_street_tuple("Woodlawn"), +] + + +def address_correction(address: str) -> str: + address = ( + address.replace("&", "and") + .replace(" .s ", " .S ") + .replace(" .e ", " .E ") + .replace(" st. ", " St. ") + .replace(" pl. ", " Pl. ") + .replace(" Midway Pl. ", " Midway Plaisance ") + ) + + address = re.sub(r"\s{2,}", " ", address) + + numerical_streets = [make_ordinal(s) for s in range(37, 66)] + for s in numerical_streets: + dir_s = f"E. 
{s}" + if s in address and dir_s not in address: + address = address.replace(s, dir_s) + + full_s = f"{dir_s} St." + if ( + dir_s in address + and full_s not in address + and f"{s} Pl" not in address + ): + address = address.replace(dir_s, full_s) + + for sc in STREET_CORRECTIONS: + name, dir_name, full_name = sc + + if name in address and dir_name not in address: + address = address.replace(name, dir_name) + + if dir_name in address and full_name not in address: + address = address.replace(dir_name, full_name) + + return address + + # Source: https://www.geeksforgeeks.org/convert-string-to-title-case-in-python/ def custom_title_case(input_string: str) -> str: # List of articles. @@ -66,6 +137,23 @@ def custom_title_case(input_string: str) -> str: return " ".join(output_list) +# Source: https://stackoverflow.com/a/50992575 +def make_ordinal(n: int): + """ + Convert an integer into its ordinal representation:: + + make_ordinal(0) => '0th' + make_ordinal(3) => '3rd' + make_ordinal(122) => '122nd' + make_ordinal(213) => '213th' + """ + if 11 <= (n % 100) <= 13: + suffix = "th" + else: + suffix = ["th", "st", "nd", "rd", "th"][min(n % 10, 4)] + return str(n) + suffix + + def parse_scraped_incident_timestamp(i: dict) -> Optional[str]: result = None