
Commit

Merge pull request #6 from BU-Spark/ocr-testing
Added read_all_tiffs.py
NathanielQuisel authored Oct 31, 2024
2 parents 59da25c + 929526a commit e217de7
Showing 5 changed files with 381 additions and 22 deletions.
261 changes: 261 additions & 0 deletions modules/deed_preprocessing/eda2.ipynb
@@ -0,0 +1,261 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Potential racism detection model - Bigotry_dict keyword dectection. Assuming you have already downloaded the outputs folder of deeds from the original eda.ipynb. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Extracted text for 600 tiffs\n"
]
}
],
"source": [
"import os\n",
"def read_from_dir(directory):\n",
" txt_content_array = []\n",
" \n",
" for file_name in os.listdir(directory):\n",
" if file_name.endswith('.txt'):\n",
" file_path = os.path.join(directory, file_name)\n",
" with open(file_path, 'r', encoding='utf-8') as file:\n",
" content = file.read()\n",
" txt_content_array.append(content)\n",
" \n",
" return txt_content_array\n",
"\n",
"directory_path_outputs = './outputs'\n",
"\n",
"outputs = read_from_dir(directory_path_outputs)\n",
"print(f'Extracted text for {len(outputs)} tiffs')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Converted text 0 into object\n",
"Converted text 100 into object\n",
"Converted text 200 into object\n",
"Converted text 300 into object\n",
"Converted text 400 into object\n",
"Converted text 500 into object\n",
"{'NUM': 38, 'NOUN': 219, 'PROPN': 87, 'DET': 97, 'ADP': 135, 'PUNCT': 112, 'CCONJ': 89, 'VERB': 104, 'AUX': 40, 'ADV': 37, 'ADJ': 48, 'PART': 13, 'SCONJ': 20, 'PRON': 31, 'SPACE': 2}\n"
]
}
],
"source": [
"import re\n",
"import spacy\n",
"from collections import Counter\n",
"\n",
"nlp = spacy.load('en_core_web_sm')\n",
"\n",
"def preprocess_text(text):\n",
" text = re.sub(r'[\\n\\r\\t]', ' ', text)\n",
" text = re.sub(r'[^\\x00-\\x7F]+', '', text)\n",
" doc = nlp(text)\n",
" \n",
" result = {\n",
" \"original_text\": text,\n",
" \"sentences\": [],\n",
" \"pos_groups\": {},\n",
" \"named_entities\": [],\n",
" \"dependencies\": [],\n",
" \"token_offsets\": [],\n",
" \"word_frequency\": {},\n",
" \"sentence_lengths\": [],\n",
" \"pos_counts\": {}\n",
" }\n",
" \n",
" pos_groups = {\n",
" \"NOUN\": [], \"VERB\": [], \"ADJ\": [], \"ADV\": [], \"PROPN\": [],\n",
" \"DET\": [], \"AUX\": [], \"PRON\": [], \"ADP\": [], \"NUM\": [],\n",
" \"PART\": [], \"PUNCT\": [], \"INTJ\": [], \"X\": []\n",
" }\n",
" \n",
" all_tokens = []\n",
" \n",
" for sent in doc.sents:\n",
" result[\"sentences\"].append(sent.text)\n",
" result[\"sentence_lengths\"].append(len(sent))\n",
" \n",
" for token in sent:\n",
" pos = token.pos_\n",
" all_tokens.append(token.text)\n",
" \n",
" if pos in pos_groups:\n",
" pos_groups[pos].append(token.text)\n",
" \n",
" result[\"dependencies\"].append({\n",
" \"token\": token.text,\n",
" \"dep\": token.dep_,\n",
" \"head\": token.head.text\n",
" })\n",
" result[\"token_offsets\"].append({\n",
" \"token\": token.text,\n",
" \"start\": token.idx,\n",
" \"end\": token.idx + len(token.text)\n",
" })\n",
" \n",
" result[\"pos_groups\"] = pos_groups\n",
" result[\"named_entities\"] = [{\"text\": ent.text, \"label\": ent.label_} for ent in doc.ents]\n",
" result[\"word_frequency\"] = dict(Counter(all_tokens))\n",
" result[\"pos_counts\"] = dict(Counter([token.pos_ for token in doc]))\n",
" \n",
" return result\n",
"\n",
"text_objects = []\n",
"for i, text in enumerate(outputs):\n",
" text_objects.append(preprocess_text(text))\n",
" if i % 100 == 0:\n",
" print(f'Converted text {i} into object')\n",
"\n",
"print(text_objects[0][\"pos_counts\"])"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Deed saved to: racist_deeds\\deed_53.txt\n",
"Deed saved to: racist_deeds\\deed_67.txt\n",
"Deed saved to: racist_deeds\\deed_95.txt\n",
"Deed saved to: racist_deeds\\deed_99.txt\n",
"Deed saved to: racist_deeds\\deed_152.txt\n",
"Deed saved to: racist_deeds\\deed_182.txt\n",
"Deed saved to: racist_deeds\\deed_186.txt\n",
"Deed saved to: racist_deeds\\deed_266.txt\n",
"Deed saved to: racist_deeds\\deed_281.txt\n",
"Deed saved to: racist_deeds\\deed_301.txt\n",
"Deed saved to: racist_deeds\\deed_308.txt\n",
"Deed saved to: racist_deeds\\deed_309.txt\n",
"Deed saved to: racist_deeds\\deed_320.txt\n",
"Deed saved to: racist_deeds\\deed_356.txt\n",
"Deed saved to: racist_deeds\\deed_371.txt\n",
"Deed saved to: racist_deeds\\deed_389.txt\n",
"Deed saved to: racist_deeds\\deed_418.txt\n",
"Deed saved to: racist_deeds\\deed_432.txt\n",
"Deed saved to: racist_deeds\\deed_467.txt\n",
"Deed saved to: racist_deeds\\deed_523.txt\n",
"Deed saved to: racist_deeds\\deed_531.txt\n",
"Deed saved to: racist_deeds\\deed_572.txt\n",
" Keyword Count Texts\n",
"1 white 19 [i 68 See Duscharge, B. 6 15 9248 Stamp: 25 Kn...\n",
"0 race 3 [54 signment su Buh 605 Pug: 355 See Discharge...\n",
"2 Catholic 1 [582 1 Acknowledge Satisfaction and hereby dis...\n"
]
}
],
"source": [
"import os\n",
"import re\n",
"import pandas as pd\n",
"from bigotry_dict import bigotry_dict\n",
"\n",
"def count_keywords_in_text(text, bigotry_dict):\n",
" keyword_counts = {}\n",
" text_lower = text.lower() # Normalize the text to lowercase\n",
"\n",
" for keyword in bigotry_dict:\n",
" keyword_lower = keyword.lower() # Normalize the keyword to lowercase\n",
" # Use regular expressions to match only whole words\n",
" pattern = r'\\b' + re.escape(keyword_lower) + r'\\b'\n",
" matches = re.findall(pattern, text_lower)\n",
" count = len(matches) # Count the number of whole-word matches\n",
"\n",
" if count > 0:\n",
" # Initialize the keyword count if it's not already present\n",
" if keyword_lower not in keyword_counts:\n",
" keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}\n",
" keyword_counts[keyword_lower]['count'] += 1 # Count the keyword only once per text\n",
" keyword_counts[keyword_lower]['texts'].append(text) # Add the text where the keyword appears\n",
" \n",
" return keyword_counts\n",
"\n",
"def save_racist_deed(text, deed_id, output_dir=\"racist_deeds\"):\n",
" \"\"\"Saves the deed text to a txt file if racist keywords are found.\"\"\"\n",
" os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist\n",
" \n",
" file_path = os.path.join(output_dir, f\"deed_{deed_id}.txt\")\n",
" with open(file_path, 'w') as f:\n",
" f.write(text)\n",
" \n",
" print(f\"Deed saved to: {file_path}\")\n",
"\n",
"def process_deeds(text_objects):\n",
" total_keyword_counts = {}\n",
"\n",
" for i, text_obj in enumerate(text_objects):\n",
" keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)\n",
" \n",
" racist_deed = False\n",
" for keyword_lower, data in keyword_counts.items():\n",
" if data['count'] > 0:\n",
" racist_deed = True # Mark deed as racist if any keyword is found\n",
" if keyword_lower not in total_keyword_counts:\n",
" total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}\n",
" total_keyword_counts[keyword_lower]['count'] += 1 # Ensure the keyword is only counted once per deed\n",
" total_keyword_counts[keyword_lower]['texts'].extend(data['texts']) # Collect texts\n",
"\n",
" # If any racist keyword is found, save the deed text\n",
" if racist_deed:\n",
" save_racist_deed(text_obj['original_text'], i)\n",
"\n",
" # Convert the total counts to a pandas DataFrame for easier analysis\n",
" keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts']) \n",
" for data in total_keyword_counts.values()], \n",
" columns=['Keyword', 'Count', 'Texts'])\n",
"\n",
" # Sort keywords by count for analysis\n",
" keyword_df_sorted = keyword_df.sort_values(by=\"Count\", ascending=False)\n",
"\n",
" # Display the dataframe for analysis (texts associated with each keyword)\n",
" print(keyword_df_sorted)\n",
"\n",
"process_deeds(text_objects)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
76 changes: 76 additions & 0 deletions modules/deed_preprocessing/keyword_dect2.py
@@ -0,0 +1,76 @@
import os
import re
import pandas as pd
from bigotry_dict import bigotry_dict

def count_keywords_in_text(text, bigotry_dict):
    keyword_counts = {}
    text_lower = text.lower()  # Normalize the text to lowercase

    for keyword in bigotry_dict:
        keyword_lower = keyword.lower()  # Normalize the keyword to lowercase
        # Use regular expressions to match only whole words
        pattern = r'\b' + re.escape(keyword_lower) + r'\b'
        matches = re.findall(pattern, text_lower)
        count = len(matches)  # Count the number of whole-word matches

        if count > 0:
            # Initialize the keyword count if it's not already present
            if keyword_lower not in keyword_counts:
                keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}
            keyword_counts[keyword_lower]['count'] += 1  # Count the keyword only once per text
            keyword_counts[keyword_lower]['texts'].append(text)  # Add the text where the keyword appears

    return keyword_counts

def save_racist_deed(text, deed_id, output_dir="racist_deeds"):
    """Saves the deed text to a txt file if racist keywords are found."""
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist

    file_path = os.path.join(output_dir, f"deed_{deed_id}.txt")
    with open(file_path, 'w') as f:
        f.write(text)

    print(f"Deed saved to: {file_path}")

# Aggregate keyword counts and check for racist deeds in all text objects
def process_deeds(text_objects):
    total_keyword_counts = {}

    for i, text_obj in enumerate(text_objects):
        keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)

        racist_deed = False
        for keyword_lower, data in keyword_counts.items():
            if data['count'] > 0:
                racist_deed = True  # Mark deed as racist if any keyword is found
                if keyword_lower not in total_keyword_counts:
                    total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}
                total_keyword_counts[keyword_lower]['count'] += 1  # Ensure the keyword is only counted once per deed
                total_keyword_counts[keyword_lower]['texts'].extend(data['texts'])  # Collect texts

        # If any racist keyword is found, save the deed text
        if racist_deed:
            save_racist_deed(text_obj['original_text'], i)

    # Convert the total counts to a pandas DataFrame for easier analysis
    keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts'])
                               for data in total_keyword_counts.values()],
                              columns=['Keyword', 'Count', 'Texts'])

    # Sort keywords by count for analysis
    keyword_df_sorted = keyword_df.sort_values(by="Count", ascending=False)

    # Display the dataframe for analysis (texts associated with each keyword)
    print(keyword_df_sorted)

# Example usage
text_objects = [
    {"original_text": "This deed restricts African Americans and Chinese people."},
    {"original_text": "This is a deed allowing Italian and Irish immigrants."},
    {"original_text": "This is a regular deed with no discriminatory language."},
    {"original_text": "Grace is welcome in my home."}
]

# Process the deeds and save any racist ones
process_deeds(text_objects)
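
Both keyword_dect2.py and the notebook cell above import bigotry_dict from a sibling module that is not included in this diff. The detection code only iterates over it, so any iterable of keyword strings satisfies the interface. A minimal hypothetical sketch (keywords taken from the notebook output above, not from the actual module) could look like:

# bigotry_dict.py: hypothetical sketch, not the module shipped with this commit.
# count_keywords_in_text() only needs an iterable of keyword strings to scan for,
# so a plain list is sufficient.
bigotry_dict = [
    "race",
    "white",
    "Catholic",
]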
37 changes: 37 additions & 0 deletions modules/deed_preprocessing/read_all_tiffs.py
@@ -0,0 +1,37 @@
import os
import zipfile
import importlib.util
from spellcheck import correct_spelling

spec = importlib.util.spec_from_file_location("google_cloud_ocr", "../google_cloud_ocr/google_cloud_ocr.py")
google_cloud_ocr_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(google_cloud_ocr_module)

output_dir = './outputs'

os.makedirs(output_dir, exist_ok=True)

for root, dirs, files in os.walk(r'../../../../mass-sec-state-deeds-data/Books 547-1849/'):
    for file in files:

        if file.endswith('.TIF'):
            tiff_file_path = os.path.join(root, file)

            with open(tiff_file_path, 'rb') as tiff_file:
                try:
                    print(tiff_file_path)
                    extracted_text = google_cloud_ocr_module.google_cloud_ocr(tiff_file)

                    # spell check the extracted text
                    corrected_text = correct_spelling(extracted_text)

                    output_file_name = f"{os.path.splitext(file)[0]}.txt"
                    output_file_path = os.path.join(output_dir, output_file_name)

                    # write the spell-checked text to the output directory
                    with open(output_file_path, 'w', encoding='utf-8') as output_txt:
                        output_txt.write(corrected_text)

                except Exception as e:
                    print(f"Error processing {file}: {str(e)}")

print("OCR processing complete. Text files are saved in:", output_dir)
2 changes: 1 addition & 1 deletion modules/deed_preprocessing/read_tiffs.py
@@ -26,7 +26,7 @@
                    extracted_text = google_cloud_ocr_module.google_cloud_ocr(tiff_file)

                    # spell check the extracted text
-                   # corrected_text = correct_spelling(extracted_text)
+                   corrected_text = correct_spelling(extracted_text)

                    output_file_name = f"{os.path.splitext(file)[0]}.txt"
                    output_file_path = os.path.join(output_dir, output_file_name)