Merge pull request #6 from BU-Spark/ocr-testing
Added read_all_tiffs.py
Showing 5 changed files with 381 additions and 22 deletions.
@@ -0,0 +1,261 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
"Potential racism detection model - Bigotry_dict keyword dectection. Assuming you have already downloaded the outputs folder of deeds from the original eda.ipynb. " | ||
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracted text for 600 tiffs\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "def read_from_dir(directory):\n",
    "    txt_content_array = []\n",
    "    \n",
    "    for file_name in os.listdir(directory):\n",
    "        if file_name.endswith('.txt'):\n",
    "            file_path = os.path.join(directory, file_name)\n",
    "            with open(file_path, 'r', encoding='utf-8') as file:\n",
    "                content = file.read()\n",
    "                txt_content_array.append(content)\n",
    "    \n",
    "    return txt_content_array\n",
    "\n",
    "directory_path_outputs = './outputs'\n",
    "\n",
    "outputs = read_from_dir(directory_path_outputs)\n",
    "print(f'Extracted text for {len(outputs)} tiffs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted text 0 into object\n",
      "Converted text 100 into object\n",
      "Converted text 200 into object\n",
      "Converted text 300 into object\n",
      "Converted text 400 into object\n",
      "Converted text 500 into object\n",
      "{'NUM': 38, 'NOUN': 219, 'PROPN': 87, 'DET': 97, 'ADP': 135, 'PUNCT': 112, 'CCONJ': 89, 'VERB': 104, 'AUX': 40, 'ADV': 37, 'ADJ': 48, 'PART': 13, 'SCONJ': 20, 'PRON': 31, 'SPACE': 2}\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import spacy\n",
    "from collections import Counter\n",
    "\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "def preprocess_text(text):\n",
    "    text = re.sub(r'[\\n\\r\\t]', ' ', text)\n",
    "    text = re.sub(r'[^\\x00-\\x7F]+', '', text)\n",
    "    doc = nlp(text)\n",
    "    \n",
    "    result = {\n",
    "        \"original_text\": text,\n",
    "        \"sentences\": [],\n",
    "        \"pos_groups\": {},\n",
    "        \"named_entities\": [],\n",
    "        \"dependencies\": [],\n",
    "        \"token_offsets\": [],\n",
    "        \"word_frequency\": {},\n",
    "        \"sentence_lengths\": [],\n",
    "        \"pos_counts\": {}\n",
    "    }\n",
    "    \n",
    "    pos_groups = {\n",
    "        \"NOUN\": [], \"VERB\": [], \"ADJ\": [], \"ADV\": [], \"PROPN\": [],\n",
    "        \"DET\": [], \"AUX\": [], \"PRON\": [], \"ADP\": [], \"NUM\": [],\n",
    "        \"PART\": [], \"PUNCT\": [], \"INTJ\": [], \"X\": []\n",
    "    }\n",
    "    \n",
    "    all_tokens = []\n",
    "    \n",
    "    for sent in doc.sents:\n",
    "        result[\"sentences\"].append(sent.text)\n",
    "        result[\"sentence_lengths\"].append(len(sent))\n",
    "        \n",
    "        for token in sent:\n",
    "            pos = token.pos_\n",
    "            all_tokens.append(token.text)\n",
    "            \n",
    "            if pos in pos_groups:\n",
    "                pos_groups[pos].append(token.text)\n",
    "            \n",
    "            result[\"dependencies\"].append({\n",
    "                \"token\": token.text,\n",
    "                \"dep\": token.dep_,\n",
    "                \"head\": token.head.text\n",
    "            })\n",
    "            result[\"token_offsets\"].append({\n",
    "                \"token\": token.text,\n",
    "                \"start\": token.idx,\n",
    "                \"end\": token.idx + len(token.text)\n",
    "            })\n",
    "    \n",
    "    result[\"pos_groups\"] = pos_groups\n",
    "    result[\"named_entities\"] = [{\"text\": ent.text, \"label\": ent.label_} for ent in doc.ents]\n",
    "    result[\"word_frequency\"] = dict(Counter(all_tokens))\n",
    "    result[\"pos_counts\"] = dict(Counter([token.pos_ for token in doc]))\n",
    "    \n",
    "    return result\n",
    "\n",
    "text_objects = []\n",
    "for i, text in enumerate(outputs):\n",
    "    text_objects.append(preprocess_text(text))\n",
    "    if i % 100 == 0:\n",
    "        print(f'Converted text {i} into object')\n",
    "\n",
    "print(text_objects[0][\"pos_counts\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deed saved to: racist_deeds\\deed_53.txt\n",
      "Deed saved to: racist_deeds\\deed_67.txt\n",
      "Deed saved to: racist_deeds\\deed_95.txt\n",
      "Deed saved to: racist_deeds\\deed_99.txt\n",
      "Deed saved to: racist_deeds\\deed_152.txt\n",
      "Deed saved to: racist_deeds\\deed_182.txt\n",
      "Deed saved to: racist_deeds\\deed_186.txt\n",
      "Deed saved to: racist_deeds\\deed_266.txt\n",
      "Deed saved to: racist_deeds\\deed_281.txt\n",
      "Deed saved to: racist_deeds\\deed_301.txt\n",
      "Deed saved to: racist_deeds\\deed_308.txt\n",
      "Deed saved to: racist_deeds\\deed_309.txt\n",
      "Deed saved to: racist_deeds\\deed_320.txt\n",
      "Deed saved to: racist_deeds\\deed_356.txt\n",
      "Deed saved to: racist_deeds\\deed_371.txt\n",
      "Deed saved to: racist_deeds\\deed_389.txt\n",
      "Deed saved to: racist_deeds\\deed_418.txt\n",
      "Deed saved to: racist_deeds\\deed_432.txt\n",
      "Deed saved to: racist_deeds\\deed_467.txt\n",
      "Deed saved to: racist_deeds\\deed_523.txt\n",
      "Deed saved to: racist_deeds\\deed_531.txt\n",
      "Deed saved to: racist_deeds\\deed_572.txt\n",
      " Keyword Count Texts\n",
      "1 white 19 [i 68 See Duscharge, B. 6 15 9248 Stamp: 25 Kn...\n",
      "0 race 3 [54 signment su Buh 605 Pug: 355 See Discharge...\n",
      "2 Catholic 1 [582 1 Acknowledge Satisfaction and hereby dis...\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "from bigotry_dict import bigotry_dict\n",
    "\n",
    "def count_keywords_in_text(text, bigotry_dict):\n",
    "    keyword_counts = {}\n",
    "    text_lower = text.lower() # Normalize the text to lowercase\n",
    "\n",
    "    for keyword in bigotry_dict:\n",
    "        keyword_lower = keyword.lower() # Normalize the keyword to lowercase\n",
    "        # Use regular expressions to match only whole words\n",
    "        pattern = r'\\b' + re.escape(keyword_lower) + r'\\b'\n",
    "        matches = re.findall(pattern, text_lower)\n",
    "        count = len(matches) # Count the number of whole-word matches\n",
    "\n",
    "        if count > 0:\n",
    "            # Initialize the keyword count if it's not already present\n",
    "            if keyword_lower not in keyword_counts:\n",
    "                keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}\n",
    "            keyword_counts[keyword_lower]['count'] += 1 # Count the keyword only once per text\n",
    "            keyword_counts[keyword_lower]['texts'].append(text) # Add the text where the keyword appears\n",
    "    \n",
    "    return keyword_counts\n",
    "\n",
    "def save_racist_deed(text, deed_id, output_dir=\"racist_deeds\"):\n",
    "    \"\"\"Saves the deed text to a txt file if racist keywords are found.\"\"\"\n",
    "    os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist\n",
    "    \n",
    "    file_path = os.path.join(output_dir, f\"deed_{deed_id}.txt\")\n",
    "    with open(file_path, 'w') as f:\n",
    "        f.write(text)\n",
    "    \n",
    "    print(f\"Deed saved to: {file_path}\")\n",
    "\n",
    "def process_deeds(text_objects):\n",
    "    total_keyword_counts = {}\n",
    "\n",
    "    for i, text_obj in enumerate(text_objects):\n",
    "        keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)\n",
    "        \n",
    "        racist_deed = False\n",
    "        for keyword_lower, data in keyword_counts.items():\n",
    "            if data['count'] > 0:\n",
    "                racist_deed = True # Mark deed as racist if any keyword is found\n",
    "                if keyword_lower not in total_keyword_counts:\n",
    "                    total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}\n",
    "                total_keyword_counts[keyword_lower]['count'] += 1 # Ensure the keyword is only counted once per deed\n",
    "                total_keyword_counts[keyword_lower]['texts'].extend(data['texts']) # Collect texts\n",
    "\n",
    "        # If any racist keyword is found, save the deed text\n",
    "        if racist_deed:\n",
    "            save_racist_deed(text_obj['original_text'], i)\n",
    "\n",
    "    # Convert the total counts to a pandas DataFrame for easier analysis\n",
    "    keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts'])\n",
    "                               for data in total_keyword_counts.values()],\n",
    "                              columns=['Keyword', 'Count', 'Texts'])\n",
    "\n",
    "    # Sort keywords by count for analysis\n",
    "    keyword_df_sorted = keyword_df.sort_values(by=\"Count\", ascending=False)\n",
    "\n",
    "    # Display the dataframe for analysis (texts associated with each keyword)\n",
    "    print(keyword_df_sorted)\n",
    "\n",
    "process_deeds(text_objects)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
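Both this notebook and the standalone script below import bigotry_dict, which does not appear in the loaded portion of this diff. Purely as a hedged illustration of the expected shape (not the project's actual list), a compatible bigotry_dict module could be as small as a plain list of flagged terms, since count_keywords_in_text only iterates over it and lowercases each entry. The three keywords below are taken from the notebook's printed counts; the real list presumably contains many more.

# bigotry_dict.py -- hypothetical sketch; the module committed to this repo may differ.
bigotry_dict = [
    "white",     # seen in the notebook's keyword-count output
    "race",
    "Catholic",
    # ...presumably many more terms in the real module
]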
@@ -0,0 +1,76 @@
import os
import re
import pandas as pd
from bigotry_dict import bigotry_dict

def count_keywords_in_text(text, bigotry_dict):
    keyword_counts = {}
    text_lower = text.lower() # Normalize the text to lowercase

    for keyword in bigotry_dict:
        keyword_lower = keyword.lower() # Normalize the keyword to lowercase
        # Use regular expressions to match only whole words
        pattern = r'\b' + re.escape(keyword_lower) + r'\b'
        matches = re.findall(pattern, text_lower)
        count = len(matches) # Count the number of whole-word matches

        if count > 0:
            # Initialize the keyword count if it's not already present
            if keyword_lower not in keyword_counts:
                keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}
            keyword_counts[keyword_lower]['count'] += 1 # Count the keyword only once per text
            keyword_counts[keyword_lower]['texts'].append(text) # Add the text where the keyword appears

    return keyword_counts

def save_racist_deed(text, deed_id, output_dir="racist_deeds"):
    """Saves the deed text to a txt file if racist keywords are found."""
    os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist

    file_path = os.path.join(output_dir, f"deed_{deed_id}.txt")
    with open(file_path, 'w') as f:
        f.write(text)

    print(f"Deed saved to: {file_path}")

# Aggregate keyword counts and check for racist deeds in all text objects
def process_deeds(text_objects):
    total_keyword_counts = {}

    for i, text_obj in enumerate(text_objects):
        keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)

        racist_deed = False
        for keyword_lower, data in keyword_counts.items():
            if data['count'] > 0:
                racist_deed = True # Mark deed as racist if any keyword is found
                if keyword_lower not in total_keyword_counts:
                    total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}
                total_keyword_counts[keyword_lower]['count'] += 1 # Ensure the keyword is only counted once per deed
                total_keyword_counts[keyword_lower]['texts'].extend(data['texts']) # Collect texts

        # If any racist keyword is found, save the deed text
        if racist_deed:
            save_racist_deed(text_obj['original_text'], i)

    # Convert the total counts to a pandas DataFrame for easier analysis
    keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts'])
                               for data in total_keyword_counts.values()],
                              columns=['Keyword', 'Count', 'Texts'])

    # Sort keywords by count for analysis
    keyword_df_sorted = keyword_df.sort_values(by="Count", ascending=False)

    # Display the dataframe for analysis (texts associated with each keyword)
    print(keyword_df_sorted)

# Example usage
text_objects = [
    {"original_text": "This deed restricts African Americans and Chinese people."},
    {"original_text": "This is a deed allowing Italian and Irish immigrants."},
    {"original_text": "This is a regular deed with no discriminatory language."},
    {"original_text": "Grace is welcome in my home."}
]

# Process the deeds and save any racist ones
process_deeds(text_objects)
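Because count_keywords_in_text wraps every keyword in \b word-boundary anchors, substring hits inside longer words are ignored, and the function records each keyword at most once per text regardless of how many matches it finds. A quick illustrative check against the function above (hypothetical input string):

# Whole-word matching demo for count_keywords_in_text (illustrative only).
sample = "The white fence was whitewashed white."
print(count_keywords_in_text(sample, ["white"]))
# 'whitewashed' is not matched; re.findall sees two whole-word hits,
# but the stored count is 1 because the keyword is counted once per text:
# {'white': {'count': 1, 'texts': [sample], 'display_keyword': 'white'}}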
@@ -0,0 +1,37 @@
import os
import zipfile
import importlib.util
from spellcheck import correct_spelling

spec = importlib.util.spec_from_file_location("google_cloud_ocr", "../google_cloud_ocr/google_cloud_ocr.py")
google_cloud_ocr_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(google_cloud_ocr_module)

output_dir = './outputs'

os.makedirs(output_dir, exist_ok=True)

for root, dirs, files in os.walk(r'../../../../mass-sec-state-deeds-data/Books 547-1849/'):
    for file in files:

        if file.endswith('.TIF'):
            tiff_file_path = os.path.join(root, file)

            with open(tiff_file_path, 'rb') as tiff_file:
                try:
                    print(tiff_file_path)
                    extracted_text = google_cloud_ocr_module.google_cloud_ocr(tiff_file)

                    # spell check the extracted text
                    corrected_text = correct_spelling(extracted_text)

                    output_file_name = f"{os.path.splitext(file)[0]}.txt"
                    output_file_path = os.path.join(output_dir, output_file_name)

                    with open(output_file_path, 'w', encoding='utf-8') as output_txt:
                        output_txt.write(corrected_text)  # write the spell-checked text

                except Exception as e:
                    print(f"Error processing {file}: {str(e)}")

print("OCR processing complete. Text files are saved in:", output_dir)