Add address string cleaning functionality (#46)
## Describe your changes
Added an `address_correction` function that resolves many of the ambiguities in address searching. I still haven't been able to fix the cross-street problem, but I'm going to save that for a later time.
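
For reviewers, a quick sketch of what the helper does. The expected outputs below are traced by hand from the new `address_correction` in `incident_scraper/utils/functions.py`, so treat them as illustrative rather than authoritative:

```python
# Illustrative usage of the helper added in this commit; the expected
# results are hand-traced from the implementation, not captured output.
from incident_scraper.utils.functions import address_correction

# Abbreviated "Midway Pl." is expanded to "Midway Plaisance".
print(address_correction("1425 E. Midway Pl. (Public Way)"))
# -> 1425 E. Midway Plaisance (Public Way)

# Bare street names pick up their direction prefix and street type.
print(address_correction("5500 Ellis"))
# -> 5500 S. Ellis Ave.

# Numbered streets (37th through 65th) are normalized to "E. <n>th St."
print(address_correction("1300 E. 57th"))
# -> 1300 E. 57th St.
```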


## Checklist before requesting a review
- [x] The code runs successfully.

```commandline
(ucpd-incident-scraper-py3.11) michaelp@MacBook-Air-18 ucpd-incident-scraper % make correct_location                                                                        
python -m incident_scraper correct-location
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michaelp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
API queries_quota: 60
1425 E. Midway Pl. (Public Way) changed to 1425 E. Midway Plaisance (Public Way)
1130 E. Midway Pl. (Park District Skating Rink) changed to 1130 E. Midway Plaisance (Park District Skating Rink)
819 E. Midway Pl. (Public Way) changed to 819 E. Midway Plaisance (Public Way)
...
Area of Midway Pl. and S. Ellis Ave. changed to Area of Midway Plaisance and S. Ellis Ave.
1111 E. Midway Pl. (Winter Garden) changed to 1111 E. Midway Plaisance (Winter Garden)
1005 E. Midway Pl. (Park Dist. Property) changed to 1005 E. Midway Plaisance (Park Dist. Property)
1130 E. Midway Pl. (Skating Rink) changed to 1130 E. Midway Plaisance (Skating Rink)
78 of 16819 had their address updated.
78 addresses were updated.
Waiting up to 5 seconds.
Sent all pending logs.
```
michplunkett authored Feb 3, 2024
1 parent d6c9050 commit 1cfb264
Showing 6 changed files with 197 additions and 25 deletions.
16 changes: 8 additions & 8 deletions .pre-commit-config.yaml
@@ -43,6 +43,14 @@ repos:
- --profile=black
- --line-length=80

- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args:
- --target-version=py311
- --line-length=80

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.286
hooks:
@@ -61,11 +69,3 @@ repos:
args:
- --ignore=E203,E402,E501,E800,W503,W391,E261
- --select=B,C,E,F,W,T4,B9

- repo: https://github.com/psf/black
rev: 23.7.0
hooks:
- id: black
args:
- --target-version=py311
- --line-length=80
22 changes: 11 additions & 11 deletions Makefile
@@ -1,13 +1,13 @@
BASEDIR=incident_scraper

default: create-requirements lint
default: create_requirements lint

.PHONY: lint
lint:
pre-commit run --all-files

.PHONY: create-requirements
create-requirements:
.PHONY: create_requirements
create_requirements:
poetry export --without-hashes --format=requirements.txt > requirements.txt

.PHONY: download
@@ -28,12 +28,16 @@ build_model: download
categorize:
python -m incident_scraper categorize

.PHONY: correct-geopt
correct-geopt:
.PHONY: correct_geopt
correct_geopt:
python -m incident_scraper correct-geopt

.PHONY: lemmatize-categories
lemmatize-categories:
.PHONY: correct_location
correct_location:
python -m incident_scraper correct-location

.PHONY: lemmatize_categories
lemmatize_categories:
python -m incident_scraper lemmatize-categories

.PHONY: seed
@@ -67,7 +71,3 @@ thirty_days:
.PHONY: test
test:
pytest -vs test/

.PHONY: test-and-fail
test-and-fail:
pytest -vsx test/
87 changes: 83 additions & 4 deletions incident_scraper/__main__.py
@@ -31,7 +31,10 @@
UCPD_MDY_KEY_DATE_FORMAT,
SystemFlags,
)
from incident_scraper.utils.functions import parse_scraped_incident_timestamp
from incident_scraper.utils.functions import (
address_correction,
parse_scraped_incident_timestamp,
)


# TODO: Chop this up into a service or some other organized structure
@@ -51,6 +54,7 @@ def main():
subparser.add_parser(SystemFlags.BUILD_MODEL)
subparser.add_parser(SystemFlags.CATEGORIZE)
subparser.add_parser(SystemFlags.CORRECT_GEOPT)
subparser.add_parser(SystemFlags.CORRECT_LOCATION)
subparser.add_parser(SystemFlags.DOWNLOAD)
subparser.add_parser(SystemFlags.LEMMATIZE_CATEGORIES)
subparser.add_parser(SystemFlags.SEED)
@@ -70,7 +74,9 @@ def main():
case SystemFlags.CATEGORIZE:
categorize_information(nbd_client)
case SystemFlags.CORRECT_GEOPT:
categorize_information(nbd_client)
correct_coordinates(nbd_client)
case SystemFlags.CORRECT_LOCATION:
correct_location(nbd_client)
case SystemFlags.DAYS_BACK:
incidents = scraper.scrape_last_days(args.days)
case SystemFlags.DOWNLOAD:
@@ -112,7 +118,7 @@ def categorize_information(nbd_client: GoogleNBD) -> None:
nbd_client.update_list_of_incidents(incidents)


def correct_location_information(nbd_client: GoogleNBD) -> None:
def correct_coordinates(nbd_client: GoogleNBD) -> None:
incidents = nbd_client.get_all_incidents()
logging.info(f"{len(incidents)} incidents fetched.")
incidents = [i for i in incidents if i.validated_location.latitude < 0.0]
@@ -132,6 +138,74 @@ def correct_location_information(nbd_client: GoogleNBD) -> None:
logging.info(f"{len(incidents)} incorrect incident GeoPts were updated.")

nbd_client.update_list_of_incidents(incidents)
incidents = nbd_client.get_all_incidents()

corrected_locations = 0
for i in incidents:
address = address_correction(i.location)
if address != i.location:
logging.info(f"{i.location} changed to {address}")
corrected_locations += 1

logging.info(
f"{corrected_locations} of {len(incidents)} "
"had their addressed updated."
)


def correct_location(nbd_client: GoogleNBD) -> None:
geocoder = Geocoder()
incidents = nbd_client.get_all_incidents()

corrected_locations: int = 0
updated_incidents: list[Incident] = []
for i in incidents:
if (
i.location == "Unknown"
or i.location == "Campus"
or i.location == "Metra Train"
):
continue

fmt_address = address_correction(i.location)
if fmt_address != i.location:
logging.info(f"{i.location} changed to {fmt_address}")
i.location = fmt_address

fmt_address = (
fmt_address.split(" (")[0]
if "(" in fmt_address
else fmt_address
)

i_dict: dict[str, Any] = {"dummy_key": True}
if (
geocoder.get_address_information(fmt_address, i_dict)
and INCIDENT_KEY_ADDRESS in i_dict
and -90.0 <= i_dict[INCIDENT_KEY_LATITUDE] <= 90.0
and -90.0 <= i_dict[INCIDENT_KEY_LONGITUDE] <= 90.0
):
i.validated_address = i_dict[INCIDENT_KEY_ADDRESS]
i.validated_location = GeoPt(
i_dict[INCIDENT_KEY_LATITUDE],
i_dict[INCIDENT_KEY_LONGITUDE],
)
corrected_locations += 1
updated_incidents.append(i)
else:
logging.error(
"This incident failed to get a valid location with the "
f"Geocoder: {i}"
)

logging.info(
f"{corrected_locations} of {len(incidents)} "
"had their address updated."
)

nbd_client.update_list_of_incidents(updated_incidents)

logging.info(f"{len(updated_incidents)} addresses were updated.")


def lemmatize_categories(nbd_client: GoogleNBD) -> None:
@@ -198,11 +272,16 @@ def parse_and_save_records(
continue

i[INCIDENT_KEY_ID] = key

i[INCIDENT_KEY_LOCATION] = address_correction(
i[INCIDENT_KEY_LOCATION]
)

address = (
i[INCIDENT_KEY_LOCATION].split(" (")[0]
if "(" in i[INCIDENT_KEY_LOCATION]
else i[INCIDENT_KEY_LOCATION]
).replace("&", "and")
)

i[INCIDENT_KEY_REPORTED] = i[INCIDENT_KEY_REPORTED].replace(
";", ":"
5 changes: 4 additions & 1 deletion incident_scraper/external/geocoder.py
@@ -12,6 +12,7 @@
INCIDENT_KEY_LATITUDE,
INCIDENT_KEY_LONGITUDE,
LOCATION_CHICAGO,
LOCATION_HYDE_PARK,
LOCATION_ILLINOIS,
LOCATION_US,
)
@@ -38,6 +39,8 @@ def get_address_information(self, address: str, i_dict: dict) -> bool:
INCIDENT_KEY_ADDRESS not in i_dict
and "between" not in address
and " and " not in address
and " to " not in address
and " at " not in address
):
self._get_address_from_cache(
i_dict, self._get_address_from_census(address)
@@ -99,7 +102,7 @@ def _get_address_from_google(self, address: str) -> dict:
[address],
# Enable Coding Accuracy Support System
enableUspsCass=True,
locality=LOCATION_CHICAGO,
locality=LOCATION_HYDE_PARK,
regionCode=LOCATION_US,
)

2 changes: 2 additions & 0 deletions incident_scraper/utils/constants.py
@@ -46,6 +46,7 @@

# Location Constants
LOCATION_CHICAGO = "Chicago"
LOCATION_HYDE_PARK = "Hyde Park, Chicago"
LOCATION_ILLINOIS = "IL"
LOCATION_US = "US"
TIMEZONE_KEY_CHICAGO = f"America/{LOCATION_CHICAGO}"
@@ -57,6 +58,7 @@ class SystemFlags:
BUILD_MODEL = "build-model"
CATEGORIZE = "categorize"
CORRECT_GEOPT = "correct-geopt"
CORRECT_LOCATION = "correct-location"
DAYS_BACK = "days-back"
DOWNLOAD = "download"
LEMMATIZE_CATEGORIES = "lemmatize-categories"
90 changes: 89 additions & 1 deletion incident_scraper/utils/functions.py
@@ -1,13 +1,84 @@
import re
from datetime import datetime
from typing import Optional
from typing import Optional, Tuple

from incident_scraper.utils.constants import (
INCIDENT_KEY_REPORTED,
UCPD_DATE_FORMATS,
)


def create_street_tuple(
street: str, blvd: bool = False
) -> Tuple[str, str, str]:
street_type = "Ave." if not blvd else "Blvd."

return street, f"S. {street}", f"S. {street} {street_type}"


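# Each entry is a (bare name, directional name, fully qualified name) triple,
# e.g. ("Ellis", "S. Ellis", "S. Ellis Ave.").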
STREET_CORRECTIONS = [
create_street_tuple("Blackstone"),
create_street_tuple("Cottage Grove"),
create_street_tuple("Cornell"),
create_street_tuple("Dorchester"),
create_street_tuple("Drexel"),
create_street_tuple("East End"),
create_street_tuple("Ellis"),
create_street_tuple("Everett"),
create_street_tuple("Greenwood"),
create_street_tuple("Harper"),
create_street_tuple("Hyde Park", blvd=True),
create_street_tuple("Ingleside"),
create_street_tuple("Kenwood"),
create_street_tuple("Kimbark"),
create_street_tuple("Lake Park"),
create_street_tuple("Maryland"),
create_street_tuple("Oakenwald"),
create_street_tuple("Oakwood", blvd=True),
create_street_tuple("Stony Island"),
create_street_tuple("University"),
create_street_tuple("Woodlawn"),
]


def address_correction(address: str) -> str:
address = (
address.replace("&", "and")
.replace(" .s ", " .S ")
.replace(" .e ", " .E ")
.replace(" st. ", " St. ")
.replace(" pl. ", " Pl. ")
.replace(" Midway Pl. ", " Midway Plaisance ")
)

address = re.sub(r"\s{2,}", " ", address)

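# Numbered east-west streets (37th through 65th) get an "E." prefix and,
# when missing, a "St." suffix (skipped for "Pl." addresses).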
numerical_streets = [make_ordinal(s) for s in range(37, 66)]
for s in numerical_streets:
dir_s = f"E. {s}"
if s in address and dir_s not in address:
address = address.replace(s, dir_s)

full_s = f"{dir_s} St."
if (
dir_s in address
and full_s not in address
and f"{s} Pl" not in address
):
address = address.replace(dir_s, full_s)

for sc in STREET_CORRECTIONS:
name, dir_name, full_name = sc

if name in address and dir_name not in address:
address = address.replace(name, dir_name)

if dir_name in address and full_name not in address:
address = address.replace(dir_name, full_name)

return address


# Source: https://www.geeksforgeeks.org/convert-string-to-title-case-in-python/
def custom_title_case(input_string: str) -> str:
# List of articles.
@@ -66,6 +137,23 @@ def custom_title_case(input_string: str) -> str:
return " ".join(output_list)


# Source: https://stackoverflow.com/a/50992575
def make_ordinal(n: int) -> str:
"""
Convert an integer into its ordinal representation::
make_ordinal(0) => '0th'
make_ordinal(3) => '3rd'
make_ordinal(122) => '122nd'
make_ordinal(213) => '213th'
"""
if 11 <= (n % 100) <= 13:
suffix = "th"
else:
suffix = ["th", "st", "nd", "rd", "th"][min(n % 10, 4)]
return str(n) + suffix


def parse_scraped_incident_timestamp(i: dict) -> Optional[str]:
result = None

