Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add address string cleaning functionality #46

Merged
merged 14 commits into from
Feb 3, 2024
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@ repos:
- --profile=black
- --line-length=80

- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args:
- --target-version=py311
- --line-length=80

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.286
hooks:
Expand All @@ -61,11 +69,3 @@ repos:
args:
- --ignore=E203,E402,E501,E800,W503,W391,E261
- --select=B,C,E,F,W,T4,B9

- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args:
- --target-version=py311
- --line-length=80
22 changes: 11 additions & 11 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
BASEDIR=incident_scraper

default: create-requirements lint
default: create_requirements lint

.PHONY: lint
lint:
pre-commit run --all-files

.PHONY: create-requirements
create-requirements:
.PHONY: create_requirements
create_requirements:
poetry export --without-hashes --format=requirements.txt > requirements.txt

.PHONY: download
Expand All @@ -28,12 +28,16 @@ build_model: download
categorize:
python -m incident_scraper categorize

.PHONY: correct-geopt
correct-geopt:
.PHONY: correct_geopt
correct_geopt:
python -m incident_scraper correct-geopt

.PHONY: lemmatize-categories
lemmatize-categories:
.PHONY: correct_location
correct_location:
python -m incident_scraper correct-location

.PHONY: lemmatize_categories
lemmatize_categories:
python -m incident_scraper lemmatize-categories

.PHONY: seed
Expand Down Expand Up @@ -67,7 +71,3 @@ thirty_days:
.PHONY: test
test:
pytest -vs test/

.PHONY: test-and-fail
test-and-fail:
pytest -vsx test/
87 changes: 83 additions & 4 deletions incident_scraper/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
UCPD_MDY_KEY_DATE_FORMAT,
SystemFlags,
)
from incident_scraper.utils.functions import parse_scraped_incident_timestamp
from incident_scraper.utils.functions import (
address_correction,
parse_scraped_incident_timestamp,
)


# TODO: Chop this up into a service or some other organized structure
Expand All @@ -51,6 +54,7 @@ def main():
subparser.add_parser(SystemFlags.BUILD_MODEL)
subparser.add_parser(SystemFlags.CATEGORIZE)
subparser.add_parser(SystemFlags.CORRECT_GEOPT)
subparser.add_parser(SystemFlags.CORRECT_LOCATION)
subparser.add_parser(SystemFlags.DOWNLOAD)
subparser.add_parser(SystemFlags.LEMMATIZE_CATEGORIES)
subparser.add_parser(SystemFlags.SEED)
Expand All @@ -70,7 +74,9 @@ def main():
case SystemFlags.CATEGORIZE:
categorize_information(nbd_client)
case SystemFlags.CORRECT_GEOPT:
categorize_information(nbd_client)
correct_coordinates(nbd_client)
case SystemFlags.CORRECT_LOCATION:
correct_location(nbd_client)
case SystemFlags.DAYS_BACK:
incidents = scraper.scrape_last_days(args.days)
case SystemFlags.DOWNLOAD:
Expand Down Expand Up @@ -112,7 +118,7 @@ def categorize_information(nbd_client: GoogleNBD) -> None:
nbd_client.update_list_of_incidents(incidents)


def correct_location_information(nbd_client: GoogleNBD) -> None:
def correct_coordinates(nbd_client: GoogleNBD) -> None:
incidents = nbd_client.get_all_incidents()
logging.info(f"{len(incidents)} incidents fetched.")
incidents = [i for i in incidents if i.validated_location.latitude < 0.0]
Expand All @@ -132,6 +138,74 @@ def correct_location_information(nbd_client: GoogleNBD) -> None:
logging.info(f"{len(incidents)} incorrect incident GeoPts were updated.")

nbd_client.update_list_of_incidents(incidents)
incidents = nbd_client.get_all_incidents()

corrected_locations = 0
for i in incidents:
address = i.location
if address != i.location:
logging.info(f"{i.location} changed to {address}")
corrected_locations += 1

logging.info(
f"{corrected_locations} of {len(incidents)} "
"had their addressed updated."
)


def correct_location(nbd_client: GoogleNBD) -> None:
    """Re-clean and re-geocode the stored location of every incident.

    Every incident returned by ``nbd_client.get_all_incidents()`` is visited.
    Its ``location`` string is normalised with ``address_correction`` and the
    result (minus any trailing parenthetical) is sent to the geocoder. When
    the geocoder reports success, supplies an address and the coordinates are
    within valid latitude/longitude bounds, the incident's
    ``validated_address`` and ``validated_location`` are replaced and the
    incident is queued for a bulk update. Failures are logged and the
    incident is not written back.

    Args:
        nbd_client: Datastore client used to fetch and update incidents.
    """
    geocoder = Geocoder()
    incidents = nbd_client.get_all_incidents()

    # Placeholder locations that are skipped without any geocoding attempt.
    skipped_locations = ("Unknown", "Campus", "Metra Train")

    corrected_locations: int = 0
    updated_incidents: list[Incident] = []
    for i in incidents:
        if i.location in skipped_locations:
            continue

        fmt_address = address_correction(i.location)
        if fmt_address != i.location:
            logging.info(f"{i.location} changed to {fmt_address}")
            i.location = fmt_address

        # Geocode only the part before any " (...)" annotation; the stored
        # location keeps the annotation.
        fmt_address = fmt_address.split(" (")[0]

        # NOTE(review): the seed key presumably keeps the dict non-empty for
        # the geocoder -- confirm against Geocoder.get_address_information.
        i_dict: dict[str, Any] = {"dummy_key": True}
        if (
            geocoder.get_address_information(fmt_address, i_dict)
            and INCIDENT_KEY_ADDRESS in i_dict
            and -90.0 <= i_dict[INCIDENT_KEY_LATITUDE] <= 90.0
            # Longitude spans [-180, 180]; the previous check reused the
            # latitude bounds.
            and -180.0 <= i_dict[INCIDENT_KEY_LONGITUDE] <= 180.0
        ):
            i.validated_address = i_dict[INCIDENT_KEY_ADDRESS]
            i.validated_location = GeoPt(
                i_dict[INCIDENT_KEY_LATITUDE],
                i_dict[INCIDENT_KEY_LONGITUDE],
            )
            corrected_locations += 1
            updated_incidents.append(i)
        else:
            logging.error(
                "This incident failed to get a valid location with the "
                f"Geocoder: {i}"
            )

    logging.info(
        f"{corrected_locations} of {len(incidents)} "
        "had their address updated."
    )

    nbd_client.update_list_of_incidents(updated_incidents)

    logging.info(f"{len(updated_incidents)} addresses were updated.")


def lemmatize_categories(nbd_client: GoogleNBD) -> None:
Expand Down Expand Up @@ -198,11 +272,16 @@ def parse_and_save_records(
continue

i[INCIDENT_KEY_ID] = key

i[INCIDENT_KEY_LOCATION] = address_correction(
i[INCIDENT_KEY_LOCATION]
)

address = (
i[INCIDENT_KEY_LOCATION].split(" (")[0]
if "(" in i[INCIDENT_KEY_LOCATION]
else i[INCIDENT_KEY_LOCATION]
).replace("&", "and")
)

i[INCIDENT_KEY_REPORTED] = i[INCIDENT_KEY_REPORTED].replace(
";", ":"
Expand Down
5 changes: 4 additions & 1 deletion incident_scraper/external/geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
INCIDENT_KEY_LATITUDE,
INCIDENT_KEY_LONGITUDE,
LOCATION_CHICAGO,
LOCATION_HYDE_PARK,
LOCATION_ILLINOIS,
LOCATION_US,
)
Expand All @@ -38,6 +39,8 @@ def get_address_information(self, address: str, i_dict: dict) -> bool:
INCIDENT_KEY_ADDRESS not in i_dict
and "between" not in address
and " and " not in address
and " to " not in address
and " at " not in address
):
self._get_address_from_cache(
i_dict, self._get_address_from_census(address)
Expand Down Expand Up @@ -99,7 +102,7 @@ def _get_address_from_google(self, address: str) -> dict:
[address],
# Enable Coding Accuracy Support System
enableUspsCass=True,
locality=LOCATION_CHICAGO,
locality=LOCATION_HYDE_PARK,
regionCode=LOCATION_US,
)

Expand Down
2 changes: 2 additions & 0 deletions incident_scraper/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

# Location Constants
LOCATION_CHICAGO = "Chicago"
LOCATION_HYDE_PARK = "Hyde Park, Chicago"
LOCATION_ILLINOIS = "IL"
LOCATION_US = "US"
TIMEZONE_KEY_CHICAGO = f"America/{LOCATION_CHICAGO}"
Expand All @@ -57,6 +58,7 @@ class SystemFlags:
BUILD_MODEL = "build-model"
CATEGORIZE = "categorize"
CORRECT_GEOPT = "correct-geopt"
CORRECT_LOCATION = "correct-location"
DAYS_BACK = "days-back"
DOWNLOAD = "download"
LEMMATIZE_CATEGORIES = "lemmatize-categories"
Expand Down
90 changes: 89 additions & 1 deletion incident_scraper/utils/functions.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,84 @@
import re
from datetime import datetime
from typing import Optional
from typing import Optional, Tuple

from incident_scraper.utils.constants import (
INCIDENT_KEY_REPORTED,
UCPD_DATE_FORMATS,
)


def create_street_tuple(
    street: str, blvd: bool = False
) -> Tuple[str, str, str]:
    """Build the three progressively more complete spellings of a street.

    Args:
        street: Bare street name, e.g. ``"Ellis"``.
        blvd: When true the street is suffixed with ``"Blvd."``; otherwise
            it is suffixed with ``"Ave."``.

    Returns:
        ``(street, "S. <street>", "S. <street> <suffix>")``.
    """
    suffix = "Blvd." if blvd else "Ave."
    with_direction = f"S. {street}"
    with_suffix = f"{with_direction} {suffix}"

    return street, with_direction, with_suffix


# One (bare name, "S."-prefixed name, prefixed name with street type) tuple
# per named street, in alphabetical order. "Hyde Park" and "Oakwood" are
# boulevards; every other entry is an avenue.
STREET_CORRECTIONS = [
    create_street_tuple(
        street_name, blvd=street_name in ("Hyde Park", "Oakwood")
    )
    for street_name in (
        "Blackstone",
        "Cottage Grove",
        "Cornell",
        "Dorchester",
        "Drexel",
        "East End",
        "Ellis",
        "Everett",
        "Greenwood",
        "Harper",
        "Hyde Park",
        "Ingleside",
        "Kenwood",
        "Kimbark",
        "Lake Park",
        "Maryland",
        "Oakenwald",
        "Oakwood",
        "Stony Island",
        "University",
        "Woodlawn",
    )
]


def address_correction(address: str) -> str:
    """Normalise a scraped incident address into a consistent format.

    The steps are applied in order:

    1. ``&`` becomes ``and``; lowercase ``s.``/``e.`` direction prefixes and
       ``st.``/``pl.`` street types are capitalised; ``Midway Pl.`` is
       expanded to ``Midway Plaisance``.
    2. Runs of whitespace are collapsed to a single space.
    3. Numbered streets from 37th through 65th gain an ``E.`` prefix and,
       unless the address mentions ``<ordinal> Pl``, a ``St.`` suffix.
    4. Streets listed in ``STREET_CORRECTIONS`` gain an ``S.`` prefix and
       their ``Ave.``/``Blvd.`` suffix.

    All matching is plain, case-sensitive substring matching, and the
    replacements in step 1 only match tokens with a space on both sides.

    Args:
        address: Raw location text.

    Returns:
        The corrected address string.
    """
    address = (
        address.replace("&", "and")
        # The period follows the direction letter ("S.", "E."), matching the
        # prefixes added further down.
        .replace(" s. ", " S. ")
        .replace(" e. ", " E. ")
        .replace(" st. ", " St. ")
        .replace(" pl. ", " Pl. ")
        .replace(" Midway Pl. ", " Midway Plaisance ")
    )

    address = re.sub(r"\s{2,}", " ", address)

    for street_number in range(37, 66):
        s = make_ordinal(street_number)

        dir_s = f"E. {s}"
        if s in address and dir_s not in address:
            address = address.replace(s, dir_s)

        full_s = f"{dir_s} St."
        if (
            dir_s in address
            and full_s not in address
            # "<ordinal> Pl..." is a Place, so it must not become a Street.
            and f"{s} Pl" not in address
        ):
            address = address.replace(dir_s, full_s)

    for name, dir_name, full_name in STREET_CORRECTIONS:
        if name in address and dir_name not in address:
            address = address.replace(name, dir_name)

        if dir_name in address and full_name not in address:
            address = address.replace(dir_name, full_name)

    return address


# Source: https://www.geeksforgeeks.org/convert-string-to-title-case-in-python/
def custom_title_case(input_string: str) -> str:
# List of articles.
Expand Down Expand Up @@ -66,6 +137,23 @@ def custom_title_case(input_string: str) -> str:
return " ".join(output_list)


# Source: https://stackoverflow.com/a/50992575
def make_ordinal(n: int):
    """
    Return ``n`` followed by its English ordinal suffix.

    Examples::

        make_ordinal(0)   => '0th'
        make_ordinal(3)   => '3rd'
        make_ordinal(122) => '122nd'
        make_ordinal(213) => '213th'
    """
    # 11, 12 and 13 (and 111, 212, ...) take "th" despite ending in 1, 2, 3.
    if n % 100 in (11, 12, 13):
        ending = "th"
    else:
        ending = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{ending}"


def parse_scraped_incident_timestamp(i: dict) -> Optional[str]:
result = None

Expand Down
Loading