From 56337355cbcaba1ab4ef36bcf0303ed21339a3b0 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 21:27:42 -0600 Subject: [PATCH 01/14] Update geocoder.py --- incident_scraper/external/geocoder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py index 41e2876..e6fa3ef 100644 --- a/incident_scraper/external/geocoder.py +++ b/incident_scraper/external/geocoder.py @@ -22,6 +22,9 @@ class Geocoder: A class that houses code for both the Census and Google Maps geocoders. """ + # Approximate bounding box of UCPD patrol area: + # https://d3qi0qp55mx5f5.cloudfront.net/safety-security/uploads/files/Extended_Patrol_Map.pdf + BOUNDING_BOX = [-87.608703, 41.776408, -87.568594, 41.826281] NUM_RETRIES = 10 TIMEOUT = 5 From fac483e4e1b970555c41259bd6f4ff67b1a71398 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 22:14:42 -0600 Subject: [PATCH 02/14] update --- incident_scraper/__main__.py | 5 ++++- incident_scraper/external/geocoder.py | 3 ++- incident_scraper/utils/constants.py | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py index 7480f61..c504db2 100644 --- a/incident_scraper/__main__.py +++ b/incident_scraper/__main__.py @@ -198,11 +198,14 @@ def parse_and_save_records( continue i[INCIDENT_KEY_ID] = key + i[INCIDENT_KEY_LOCATION] = i[INCIDENT_KEY_LOCATION].replace( + "&", "and" + ) address = ( i[INCIDENT_KEY_LOCATION].split(" (")[0] if "(" in i[INCIDENT_KEY_LOCATION] else i[INCIDENT_KEY_LOCATION] - ).replace("&", "and") + ) i[INCIDENT_KEY_REPORTED] = i[INCIDENT_KEY_REPORTED].replace( ";", ":" diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py index e6fa3ef..05fe251 100644 --- a/incident_scraper/external/geocoder.py +++ b/incident_scraper/external/geocoder.py @@ -12,6 +12,7 @@ INCIDENT_KEY_LATITUDE, INCIDENT_KEY_LONGITUDE, LOCATION_CHICAGO, + LOCATION_HYDE_PARK, LOCATION_ILLINOIS, LOCATION_US, ) @@ -102,7 +103,7 @@ def _get_address_from_google(self, address: str) -> dict: [address], # Enable Coding Accuracy Support System enableUspsCass=True, - locality=LOCATION_CHICAGO, + locality=LOCATION_HYDE_PARK, regionCode=LOCATION_US, ) diff --git a/incident_scraper/utils/constants.py b/incident_scraper/utils/constants.py index 307a1e7..457e8bf 100644 --- a/incident_scraper/utils/constants.py +++ b/incident_scraper/utils/constants.py @@ -46,6 +46,7 @@ # Location Constants LOCATION_CHICAGO = "Chicago" +LOCATION_HYDE_PARK = "Hyde Park, Chicago" LOCATION_ILLINOIS = "IL" LOCATION_US = "US" TIMEZONE_KEY_CHICAGO = f"America/{LOCATION_CHICAGO}" From b722a0c2ed257ba029404b61677a36d716afad47 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 22:15:08 -0600 Subject: [PATCH 03/14] Update __main__.py --- incident_scraper/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py index c504db2..16b6415 100644 --- a/incident_scraper/__main__.py +++ b/incident_scraper/__main__.py @@ -198,6 +198,7 @@ def parse_and_save_records( continue i[INCIDENT_KEY_ID] = key + i[INCIDENT_KEY_LOCATION] = i[INCIDENT_KEY_LOCATION].replace( "&", "and" ) From 434dbb14d70672c2caa0e40d94889b292341c9f2 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 22:26:05 -0600 Subject: [PATCH 04/14] Update geocoder.py --- incident_scraper/external/geocoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py index 05fe251..ff03d2e 100644 --- a/incident_scraper/external/geocoder.py +++ b/incident_scraper/external/geocoder.py @@ -42,6 +42,7 @@ def get_address_information(self, address: str, i_dict: dict) -> bool: INCIDENT_KEY_ADDRESS not in i_dict and "between" not in address and " and " not in address + and " to " not in address ): self._get_address_from_cache( i_dict, self._get_address_from_census(address) From 6bc72505c4508af55335082b6ed45f900c0ad439 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 22:57:57 -0600 Subject: [PATCH 05/14] Update geocoder.py --- incident_scraper/external/geocoder.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py index ff03d2e..8320e7f 100644 --- a/incident_scraper/external/geocoder.py +++ b/incident_scraper/external/geocoder.py @@ -23,9 +23,6 @@ class Geocoder: A class that houses code for both the Census and Google Maps geocoders. """ - # Approximate bounding box of UCPD patrol area: - # https://d3qi0qp55mx5f5.cloudfront.net/safety-security/uploads/files/Extended_Patrol_Map.pdf - BOUNDING_BOX = [-87.608703, 41.776408, -87.568594, 41.826281] NUM_RETRIES = 10 TIMEOUT = 5 From 5e8096bb0856b9c523ebbadf312688e6b780d9ec Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:54:00 -0600 Subject: [PATCH 06/14] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 91c64ea..d726d11 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,6 +43,14 @@ repos: - --profile=black - --line-length=80 + - repo: https://github.com/psf/black + rev: 23.7.0 + hooks: + - id: black + args: + - --target-version=py311 + - --line-length=80 + - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.0.286 hooks: @@ -61,11 +69,3 @@ repos: args: - --ignore=E203,E402,E501,E800,W503,W391,E261 - --select=B,C,E,F,W,T4,B9 - - - repo: https://github.com/psf/black - rev: 23.7.0 - hooks: - - id: black - args: - - --target-version=py311 - - --line-length=80 From a65837f2240a2216ff48d58988d3828aa155fa25 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:57:02 -0600 Subject: [PATCH 07/14] Update Makefile --- Makefile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 1596850..8c853ef 100644 --- a/Makefile +++ b/Makefile @@ -1,13 +1,13 @@ BASEDIR=incident_scraper -default: create-requirements lint +default: create_requirements lint .PHONY: lint lint: pre-commit run --all-files -.PHONY: create-requirements -create-requirements: +.PHONY: create_requirements +create_requirements: poetry export --without-hashes --format=requirements.txt > requirements.txt .PHONY: download @@ -28,12 +28,16 @@ build_model: download categorize: python -m incident_scraper categorize -.PHONY: correct-geopt -correct-geopt: +.PHONY: correct_geopt +correct_geopt: python -m incident_scraper correct-geopt -.PHONY: lemmatize-categories -lemmatize-categories: +.PHONY: correct_location +correct_location: + python -m incident_scraper correct-location + +.PHONY: lemmatize_categories +lemmatize_categories: python -m incident_scraper lemmatize-categories .PHONY: seed @@ -67,7 +71,3 @@ thirty_days: .PHONY: test test: pytest -vs test/ - -.PHONY: test-and-fail -test-and-fail: - pytest -vsx test/ From 8e44b840ef65cb3eb64c9d977582ca1d9ae46ea2 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:57:09 -0600 Subject: [PATCH 08/14] Update constants.py --- incident_scraper/utils/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/incident_scraper/utils/constants.py b/incident_scraper/utils/constants.py index 457e8bf..4fe8403 100644 --- a/incident_scraper/utils/constants.py +++ b/incident_scraper/utils/constants.py @@ -58,6 +58,7 @@ class SystemFlags: BUILD_MODEL = "build-model" CATEGORIZE = "categorize" CORRECT_GEOPT = "correct-geopt" + CORRECT_LOCATION = "correct-location" DAYS_BACK = "days-back" DOWNLOAD = "download" LEMMATIZE_CATEGORIES = "lemmatize-categories" From 1bb45bc0cc0d1a2a3cf43889aa7dd59cbc675948 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:57:18 -0600 Subject: [PATCH 09/14] Update functions.py --- incident_scraper/utils/functions.py | 83 ++++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/incident_scraper/utils/functions.py b/incident_scraper/utils/functions.py index adc67b6..e9098a3 100644 --- a/incident_scraper/utils/functions.py +++ b/incident_scraper/utils/functions.py @@ -1,6 +1,6 @@ import re from datetime import datetime -from typing import Optional +from typing import Optional, Tuple from incident_scraper.utils.constants import ( INCIDENT_KEY_REPORTED, @@ -8,6 +8,70 @@ ) +def create_street_tuple( + street: str, blvd: bool = False +) -> Tuple[str, str, str]: + street_type = "Ave." if not blvd else "Blvd." + + return street, f"S. {street}", f"S. {street} {street_type}" + + +STREET_CORRECTIONS = [ + create_street_tuple("Blackstone"), + create_street_tuple("Cottage Grove"), + create_street_tuple("Cornell"), + create_street_tuple("Dorchester"), + create_street_tuple("Drexel"), + create_street_tuple("East End"), + create_street_tuple("Ellis"), + create_street_tuple("Everett"), + create_street_tuple("Greenwood"), + create_street_tuple("Harper"), + create_street_tuple("Hyde Park", blvd=True), + create_street_tuple("Ingleside"), + create_street_tuple("Kenwood"), + create_street_tuple("Kimbark"), + create_street_tuple("Lake Park"), + create_street_tuple("Maryland"), + create_street_tuple("Oakwood", blvd=True), + create_street_tuple("Stony Island"), + create_street_tuple("University"), + create_street_tuple("Woodlawn"), +] + + +def address_correction(address: str) -> str: + address = ( + address.replace("&", "and") + .replace(" .s ", " .S ") + .replace(" .e ", " .E ") + .replace(" st. ", " St. ") + ) + + address = re.sub(r"\s{2,}", " ", address) + + numerical_streets = [make_ordinal(s) for s in range(37, 66)] + for s in numerical_streets: + fmt_s = f"E. {s}" + if s in address and fmt_s not in address: + address = address.replace(s, fmt_s) + + fmt_s += " St." + if s in address and fmt_s not in address and f"{s} Pl" not in address: + address = address.replace(s, fmt_s) + + for sc in STREET_CORRECTIONS: + name, dir_name, full_name = sc + + if name in address and dir_name not in address: + address = address.replace(name, dir_name) + + if dir_name in address and full_name not in address: + address = address.replace(dir_name, full_name) + + return address + + # Source: https://www.geeksforgeeks.org/convert-string-to-title-case-in-python/ def custom_title_case(input_string: str) -> str: # List of articles. @@ -66,6 +130,23 @@ def custom_title_case(input_string: str) -> str: return " ".join(output_list) +# Source: https://stackoverflow.com/a/50992575 +def make_ordinal(n: int): + """ + Convert an integer into its ordinal representation:: + + make_ordinal(0) => '0th' + make_ordinal(3) => '3rd' + make_ordinal(122) => '122nd' + make_ordinal(213) => '213th' + """ + if 11 <= (n % 100) <= 13: + suffix = "th" + else: + suffix = ["th", "st", "nd", "rd", "th"][min(n % 10, 4)] + return str(n) + suffix + + def parse_scraped_incident_timestamp(i: dict) -> Optional[str]: result = None From 7d2904520e66be7e3b13ec75c186e2809355ab3f Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Fri, 2 Feb 2024 23:57:35 -0600 Subject: [PATCH 10/14] Update __main__.py --- incident_scraper/__main__.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py index 16b6415..0b88ea6 100644 --- a/incident_scraper/__main__.py +++ b/incident_scraper/__main__.py @@ -31,7 +31,10 @@ UCPD_MDY_KEY_DATE_FORMAT, SystemFlags, ) -from incident_scraper.utils.functions import parse_scraped_incident_timestamp +from incident_scraper.utils.functions import ( + address_correction, + parse_scraped_incident_timestamp, +) # TODO: Chop this up into a service or some other organized structure @@ -51,6 +54,7 @@ def main(): subparser.add_parser(SystemFlags.BUILD_MODEL) subparser.add_parser(SystemFlags.CATEGORIZE) subparser.add_parser(SystemFlags.CORRECT_GEOPT) + subparser.add_parser(SystemFlags.CORRECT_LOCATION) subparser.add_parser(SystemFlags.DOWNLOAD) subparser.add_parser(SystemFlags.LEMMATIZE_CATEGORIES) subparser.add_parser(SystemFlags.SEED) @@ -70,7 +74,9 @@ def main(): case SystemFlags.CATEGORIZE: categorize_information(nbd_client) case SystemFlags.CORRECT_GEOPT: - categorize_information(nbd_client) + correct_coordinates(nbd_client) + case SystemFlags.CORRECT_LOCATION: + correct_location(nbd_client) case SystemFlags.DAYS_BACK: incidents = scraper.scrape_last_days(args.days) case SystemFlags.DOWNLOAD: @@ -112,7 +118,7 @@ def categorize_information(nbd_client: GoogleNBD) -> None: nbd_client.update_list_of_incidents(incidents) -def correct_location_information(nbd_client: GoogleNBD) -> None: +def correct_coordinates(nbd_client: GoogleNBD) -> None: incidents = nbd_client.get_all_incidents() logging.info(f"{len(incidents)} incidents fetched.") incidents = [i for i in incidents if i.validated_location.latitude < 0.0] @@ -132,6 +138,23 @@ def correct_location_information(nbd_client: GoogleNBD) -> None: logging.info(f"{len(incidents)} incorrect incident GeoPts were updated.") nbd_client.update_list_of_incidents(incidents) + incidents = nbd_client.get_all_incidents() + + corrected_locations = 0 + for i in incidents: + address = i.location + if address != i.location: + logging.info(f"{i.location} changed to {address}") + corrected_locations += 1 + + logging.info( + f"{corrected_locations} of {len(incidents)} " + "had their addressed updated." + ) + + +def correct_location(nbd_client: GoogleNBD) -> None: + nbd_client.get() def lemmatize_categories(nbd_client: GoogleNBD) -> None: @@ -199,9 +222,10 @@ def parse_and_save_records( i[INCIDENT_KEY_ID] = key - i[INCIDENT_KEY_LOCATION] = i[INCIDENT_KEY_LOCATION].replace( - "&", "and" + i[INCIDENT_KEY_LOCATION] = address_correction( + i[INCIDENT_KEY_LOCATION] ) + address = ( i[INCIDENT_KEY_LOCATION].split(" (")[0] if "(" in i[INCIDENT_KEY_LOCATION] From 889dcd1215679bf2d85934335705a77c17aa046d Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Sat, 3 Feb 2024 00:52:37 -0600 Subject: [PATCH 11/14] Update functions.py --- incident_scraper/utils/functions.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/incident_scraper/utils/functions.py b/incident_scraper/utils/functions.py index e9098a3..1f6e2c6 100644 --- a/incident_scraper/utils/functions.py +++ b/incident_scraper/utils/functions.py @@ -33,6 +33,7 @@ def create_street_tuple( create_street_tuple("Kimbark"), create_street_tuple("Lake Park"), create_street_tuple("Maryland"), + create_street_tuple("Oakenwald"), create_street_tuple("Oakwood", blvd=True), create_street_tuple("Stony Island"), create_street_tuple("University"), @@ -52,13 +53,17 @@ def address_correction(address: str) -> str: numerical_streets = [make_ordinal(s) for s in range(37, 66)] for s in numerical_streets: - fmt_s = f"E. {s}" - if s in address and fmt_s not in address: - address = address.replace(s, fmt_s) - - fmt_s += " St." - if s in address and fmt_s not in address and f"{s} Pl" not in address: - address = address.replace(s, fmt_s) + dir_s = f"E. {s}" + if s in address and dir_s not in address: + address = address.replace(s, dir_s) + + full_s = f"{dir_s} St." + if ( + dir_s in address + and full_s not in address + and f"{s} Pl" not in address + ): + address = address.replace(dir_s, full_s) for sc in STREET_CORRECTIONS: name, dir_name, full_name = sc From ea93b7baf46a55cc112f58046d63b8f8fa8baff1 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Sat, 3 Feb 2024 00:52:46 -0600 Subject: [PATCH 12/14] Update geocoder.py --- incident_scraper/external/geocoder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/incident_scraper/external/geocoder.py b/incident_scraper/external/geocoder.py index 8320e7f..187a32c 100644 --- a/incident_scraper/external/geocoder.py +++ b/incident_scraper/external/geocoder.py @@ -40,6 +40,7 @@ def get_address_information(self, address: str, i_dict: dict) -> bool: and "between" not in address and " and " not in address and " to " not in address + and " at " not in address ): self._get_address_from_cache( i_dict, self._get_address_from_census(address) From 857125dec45afe4b162ed333c7e3938b86a6fdff Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Sat, 3 Feb 2024 01:00:35 -0600 Subject: [PATCH 13/14] Update __main__.py --- incident_scraper/__main__.py | 53 +++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/incident_scraper/__main__.py b/incident_scraper/__main__.py index 0b88ea6..fc957d8 100644 --- a/incident_scraper/__main__.py +++ b/incident_scraper/__main__.py @@ -154,7 +154,58 @@ def correct_coordinates(nbd_client: GoogleNBD) -> None: def correct_location(nbd_client: GoogleNBD) -> None: - nbd_client.get() + geocoder = Geocoder() + incidents = nbd_client.get_all_incidents() + + corrected_locations: int = 0 + updated_incidents: [Incident] = [] + for i in incidents: + if ( + i.location == "Unknown" + or i.location == "Campus" + or i.location == "Metra Train" + ): + continue + + fmt_address = address_correction(i.location) + if fmt_address != i.location: + logging.info(f"{i.location} changed to {fmt_address}") + i.location = fmt_address + + fmt_address = ( + fmt_address.split(" (")[0] + if "(" in fmt_address + else fmt_address + ) + + i_dict: {str: Any} = {"dummy_key": True} + if ( + geocoder.get_address_information(fmt_address, i_dict) + and INCIDENT_KEY_ADDRESS in i_dict + and -90.0 <= i_dict[INCIDENT_KEY_LATITUDE] <= 90.0 + and -90.0 <= i_dict[INCIDENT_KEY_LONGITUDE] <= 90.0 + ): + i.validated_address = i_dict[INCIDENT_KEY_ADDRESS] + i.validated_location = GeoPt( + i_dict[INCIDENT_KEY_LATITUDE], + i_dict[INCIDENT_KEY_LONGITUDE], + ) + corrected_locations += 1 + updated_incidents.append(i) + else: + logging.error( + "This incident failed to get a valid location with the " + f"Geocoder: {i}" + ) + + logging.info( + f"{corrected_locations} of {len(incidents)} " + "had their address updated." + ) + + nbd_client.update_list_of_incidents(updated_incidents) + + logging.info(f"{len(updated_incidents)} addresses were updated.") def lemmatize_categories(nbd_client: GoogleNBD) -> None: From f4df2c419e92dcaeed78dfe9653ab0828eb23c35 Mon Sep 17 00:00:00 2001 From: michplunkett <5885605+michplunkett@users.noreply.github.com> Date: Sat, 3 Feb 2024 14:45:05 -0600 Subject: [PATCH 14/14] Update functions.py --- incident_scraper/utils/functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/incident_scraper/utils/functions.py b/incident_scraper/utils/functions.py index 1f6e2c6..4aa7cad 100644 --- a/incident_scraper/utils/functions.py +++ b/incident_scraper/utils/functions.py @@ -47,6 +47,8 @@ def address_correction(address: str) -> str: .replace(" .s ", " .S ") .replace(" .e ", " .E ") .replace(" st. ", " St. ") + .replace(" pl. ", " Pl. ") + .replace(" Midway Pl. ", " Midway Plaisance ") ) address = re.sub(r"\s{2,}", " ", address)