Merge pull request #6 from BU-Spark/ocr-testing
Added read_all_tiffs.py
Showing 5 changed files with 381 additions and 22 deletions.
@@ -0,0 +1,261 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
"Potential racism detection model - Bigotry_dict keyword dectection. Assuming you have already downloaded the outputs folder of deeds from the original eda.ipynb. " | ||
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracted text for 600 tiffs\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "def read_from_dir(directory):\n",
    "    txt_content_array = []\n",
    "    \n",
    "    for file_name in os.listdir(directory):\n",
    "        if file_name.endswith('.txt'):\n",
    "            file_path = os.path.join(directory, file_name)\n",
    "            with open(file_path, 'r', encoding='utf-8') as file:\n",
    "                content = file.read()\n",
    "                txt_content_array.append(content)\n",
    "    \n",
    "    return txt_content_array\n",
    "\n",
    "directory_path_outputs = './outputs'\n",
    "\n",
    "outputs = read_from_dir(directory_path_outputs)\n",
    "print(f'Extracted text for {len(outputs)} tiffs')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converted text 0 into object\n",
      "Converted text 100 into object\n",
      "Converted text 200 into object\n",
      "Converted text 300 into object\n",
      "Converted text 400 into object\n",
      "Converted text 500 into object\n",
      "{'NUM': 38, 'NOUN': 219, 'PROPN': 87, 'DET': 97, 'ADP': 135, 'PUNCT': 112, 'CCONJ': 89, 'VERB': 104, 'AUX': 40, 'ADV': 37, 'ADJ': 48, 'PART': 13, 'SCONJ': 20, 'PRON': 31, 'SPACE': 2}\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import spacy\n",
    "from collections import Counter\n",
    "\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "\n",
    "def preprocess_text(text):\n",
    "    text = re.sub(r'[\\n\\r\\t]', ' ', text)\n",
    "    text = re.sub(r'[^\\x00-\\x7F]+', '', text)\n",
    "    doc = nlp(text)\n",
    "    \n",
    "    result = {\n",
    "        \"original_text\": text,\n",
    "        \"sentences\": [],\n",
    "        \"pos_groups\": {},\n",
    "        \"named_entities\": [],\n",
    "        \"dependencies\": [],\n",
    "        \"token_offsets\": [],\n",
    "        \"word_frequency\": {},\n",
    "        \"sentence_lengths\": [],\n",
    "        \"pos_counts\": {}\n",
    "    }\n",
    "    \n",
    "    pos_groups = {\n",
    "        \"NOUN\": [], \"VERB\": [], \"ADJ\": [], \"ADV\": [], \"PROPN\": [],\n",
    "        \"DET\": [], \"AUX\": [], \"PRON\": [], \"ADP\": [], \"NUM\": [],\n",
    "        \"PART\": [], \"PUNCT\": [], \"INTJ\": [], \"X\": []\n",
    "    }\n",
    "    \n",
    "    all_tokens = []\n",
    "    \n",
    "    for sent in doc.sents:\n",
    "        result[\"sentences\"].append(sent.text)\n",
    "        result[\"sentence_lengths\"].append(len(sent))\n",
    "        \n",
    "        for token in sent:\n",
    "            pos = token.pos_\n",
    "            all_tokens.append(token.text)\n",
    "            \n",
    "            if pos in pos_groups:\n",
    "                pos_groups[pos].append(token.text)\n",
    "            \n",
    "            result[\"dependencies\"].append({\n",
    "                \"token\": token.text,\n",
    "                \"dep\": token.dep_,\n",
    "                \"head\": token.head.text\n",
    "            })\n",
    "            result[\"token_offsets\"].append({\n",
    "                \"token\": token.text,\n",
    "                \"start\": token.idx,\n",
    "                \"end\": token.idx + len(token.text)\n",
    "            })\n",
    "    \n",
    "    result[\"pos_groups\"] = pos_groups\n",
    "    result[\"named_entities\"] = [{\"text\": ent.text, \"label\": ent.label_} for ent in doc.ents]\n",
    "    result[\"word_frequency\"] = dict(Counter(all_tokens))\n",
    "    result[\"pos_counts\"] = dict(Counter([token.pos_ for token in doc]))\n",
    "    \n",
    "    return result\n",
    "\n",
    "text_objects = []\n",
    "for i, text in enumerate(outputs):\n",
    "    text_objects.append(preprocess_text(text))\n",
    "    if i % 100 == 0:\n",
    "        print(f'Converted text {i} into object')\n",
    "\n",
    "print(text_objects[0][\"pos_counts\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deed saved to: racist_deeds\\deed_53.txt\n",
      "Deed saved to: racist_deeds\\deed_67.txt\n",
      "Deed saved to: racist_deeds\\deed_95.txt\n",
      "Deed saved to: racist_deeds\\deed_99.txt\n",
      "Deed saved to: racist_deeds\\deed_152.txt\n",
      "Deed saved to: racist_deeds\\deed_182.txt\n",
      "Deed saved to: racist_deeds\\deed_186.txt\n",
      "Deed saved to: racist_deeds\\deed_266.txt\n",
      "Deed saved to: racist_deeds\\deed_281.txt\n",
      "Deed saved to: racist_deeds\\deed_301.txt\n",
      "Deed saved to: racist_deeds\\deed_308.txt\n",
      "Deed saved to: racist_deeds\\deed_309.txt\n",
      "Deed saved to: racist_deeds\\deed_320.txt\n",
      "Deed saved to: racist_deeds\\deed_356.txt\n",
      "Deed saved to: racist_deeds\\deed_371.txt\n",
      "Deed saved to: racist_deeds\\deed_389.txt\n",
      "Deed saved to: racist_deeds\\deed_418.txt\n",
      "Deed saved to: racist_deeds\\deed_432.txt\n",
      "Deed saved to: racist_deeds\\deed_467.txt\n",
      "Deed saved to: racist_deeds\\deed_523.txt\n",
      "Deed saved to: racist_deeds\\deed_531.txt\n",
      "Deed saved to: racist_deeds\\deed_572.txt\n",
      " Keyword Count Texts\n",
      "1 white 19 [i 68 See Duscharge, B. 6 15 9248 Stamp: 25 Kn...\n",
      "0 race 3 [54 signment su Buh 605 Pug: 355 See Discharge...\n",
      "2 Catholic 1 [582 1 Acknowledge Satisfaction and hereby dis...\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "import pandas as pd\n",
    "from bigotry_dict import bigotry_dict\n",
    "\n",
    "def count_keywords_in_text(text, bigotry_dict):\n",
    "    keyword_counts = {}\n",
    "    text_lower = text.lower() # Normalize the text to lowercase\n",
    "\n",
    "    for keyword in bigotry_dict:\n",
    "        keyword_lower = keyword.lower() # Normalize the keyword to lowercase\n",
    "        # Use regular expressions to match only whole words\n",
    "        pattern = r'\\b' + re.escape(keyword_lower) + r'\\b'\n",
    "        matches = re.findall(pattern, text_lower)\n",
    "        count = len(matches) # Count the number of whole-word matches\n",
    "\n",
    "        if count > 0:\n",
    "            # Initialize the keyword count if it's not already present\n",
    "            if keyword_lower not in keyword_counts:\n",
    "                keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}\n",
    "            keyword_counts[keyword_lower]['count'] += 1 # Count the keyword only once per text\n",
    "            keyword_counts[keyword_lower]['texts'].append(text) # Add the text where the keyword appears\n",
    "    \n",
    "    return keyword_counts\n",
    "\n",
    "def save_racist_deed(text, deed_id, output_dir=\"racist_deeds\"):\n",
    "    \"\"\"Saves the deed text to a txt file if racist keywords are found.\"\"\"\n",
    "    os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist\n",
    "    \n",
    "    file_path = os.path.join(output_dir, f\"deed_{deed_id}.txt\")\n",
    "    with open(file_path, 'w') as f:\n",
    "        f.write(text)\n",
    "    \n",
    "    print(f\"Deed saved to: {file_path}\")\n",
    "\n",
    "def process_deeds(text_objects):\n",
    "    total_keyword_counts = {}\n",
    "\n",
    "    for i, text_obj in enumerate(text_objects):\n",
    "        keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)\n",
    "        \n",
    "        racist_deed = False\n",
    "        for keyword_lower, data in keyword_counts.items():\n",
    "            if data['count'] > 0:\n",
    "                racist_deed = True # Mark deed as racist if any keyword is found\n",
    "                if keyword_lower not in total_keyword_counts:\n",
    "                    total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}\n",
    "                total_keyword_counts[keyword_lower]['count'] += 1 # Ensure the keyword is only counted once per deed\n",
    "                total_keyword_counts[keyword_lower]['texts'].extend(data['texts']) # Collect texts\n",
    "\n",
    "        # If any racist keyword is found, save the deed text\n",
    "        if racist_deed:\n",
    "            save_racist_deed(text_obj['original_text'], i)\n",
    "\n",
    "    # Convert the total counts to a pandas DataFrame for easier analysis\n",
    "    keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts'])\n",
    "                               for data in total_keyword_counts.values()],\n",
    "                              columns=['Keyword', 'Count', 'Texts'])\n",
    "\n",
    "    # Sort keywords by count for analysis\n",
    "    keyword_df_sorted = keyword_df.sort_values(by=\"Count\", ascending=False)\n",
    "\n",
    "    # Display the dataframe for analysis (texts associated with each keyword)\n",
    "    print(keyword_df_sorted)\n",
    "\n",
    "process_deeds(text_objects)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
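Both this notebook and the standalone script below import bigotry_dict, which does not appear in the loaded portion of this diff. Purely as a hedged illustration of the expected shape (not the project's actual list), a compatible bigotry_dict module could be as small as a plain list of flagged terms, since count_keywords_in_text only iterates over it and lowercases each entry. The three keywords below are taken from the notebook's printed counts; the real list presumably contains many more.

# bigotry_dict.py -- hypothetical sketch; the module committed to this repo may differ.
bigotry_dict = [
    "white",     # seen in the notebook's keyword-count output
    "race",
    "Catholic",
    # ...presumably many more terms in the real module
]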
@@ -0,0 +1,76 @@
import os
import re
import pandas as pd
from bigotry_dict import bigotry_dict

def count_keywords_in_text(text, bigotry_dict):
    keyword_counts = {}
    text_lower = text.lower() # Normalize the text to lowercase

    for keyword in bigotry_dict:
        keyword_lower = keyword.lower() # Normalize the keyword to lowercase
        # Use regular expressions to match only whole words
        pattern = r'\b' + re.escape(keyword_lower) + r'\b'
        matches = re.findall(pattern, text_lower)
        count = len(matches) # Count the number of whole-word matches

        if count > 0:
            # Initialize the keyword count if it's not already present
            if keyword_lower not in keyword_counts:
                keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': keyword}
            keyword_counts[keyword_lower]['count'] += 1 # Count the keyword only once per text
            keyword_counts[keyword_lower]['texts'].append(text) # Add the text where the keyword appears

    return keyword_counts

def save_racist_deed(text, deed_id, output_dir="racist_deeds"):
    """Saves the deed text to a txt file if racist keywords are found."""
    os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist

    file_path = os.path.join(output_dir, f"deed_{deed_id}.txt")
    with open(file_path, 'w') as f:
        f.write(text)

    print(f"Deed saved to: {file_path}")

# Aggregate keyword counts and check for racist deeds in all text objects
def process_deeds(text_objects):
    total_keyword_counts = {}

    for i, text_obj in enumerate(text_objects):
        keyword_counts = count_keywords_in_text(text_obj['original_text'], bigotry_dict)

        racist_deed = False
        for keyword_lower, data in keyword_counts.items():
            if data['count'] > 0:
                racist_deed = True # Mark deed as racist if any keyword is found
                if keyword_lower not in total_keyword_counts:
                    total_keyword_counts[keyword_lower] = {'count': 0, 'texts': [], 'display_keyword': data['display_keyword']}
                total_keyword_counts[keyword_lower]['count'] += 1 # Ensure the keyword is only counted once per deed
                total_keyword_counts[keyword_lower]['texts'].extend(data['texts']) # Collect texts

        # If any racist keyword is found, save the deed text
        if racist_deed:
            save_racist_deed(text_obj['original_text'], i)

    # Convert the total counts to a pandas DataFrame for easier analysis
    keyword_df = pd.DataFrame([(data['display_keyword'], data['count'], data['texts'])
                               for data in total_keyword_counts.values()],
                              columns=['Keyword', 'Count', 'Texts'])

    # Sort keywords by count for analysis
    keyword_df_sorted = keyword_df.sort_values(by="Count", ascending=False)

    # Display the dataframe for analysis (texts associated with each keyword)
    print(keyword_df_sorted)

# Example usage
text_objects = [
    {"original_text": "This deed restricts African Americans and Chinese people."},
    {"original_text": "This is a deed allowing Italian and Irish immigrants."},
    {"original_text": "This is a regular deed with no discriminatory language."},
    {"original_text": "Grace is welcome in my home."}
]

# Process the deeds and save any racist ones
process_deeds(text_objects)
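Because count_keywords_in_text wraps every keyword in \b word-boundary anchors, substring hits inside longer words are ignored, and the function records each keyword at most once per text regardless of how many matches it finds. A quick illustrative check against the function above (hypothetical input string):

# Whole-word matching demo for count_keywords_in_text (illustrative only).
sample = "The white fence was whitewashed white."
print(count_keywords_in_text(sample, ["white"]))
# 'whitewashed' is not matched; re.findall sees two whole-word hits,
# but the stored count is 1 because the keyword is counted once per text:
# {'white': {'count': 1, 'texts': [sample], 'display_keyword': 'white'}}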
@@ -0,0 +1,37 @@
import os
import zipfile
import importlib.util
from spellcheck import correct_spelling

spec = importlib.util.spec_from_file_location("google_cloud_ocr", "../google_cloud_ocr/google_cloud_ocr.py")
google_cloud_ocr_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(google_cloud_ocr_module)

output_dir = './outputs'

os.makedirs(output_dir, exist_ok=True)

for root, dirs, files in os.walk(r'../../../../mass-sec-state-deeds-data/Books 547-1849/'):
    for file in files:

        if file.endswith('.TIF'):
            tiff_file_path = os.path.join(root, file)

            with open(tiff_file_path, 'rb') as tiff_file:
                try:
                    print(tiff_file_path)
                    extracted_text = google_cloud_ocr_module.google_cloud_ocr(tiff_file)

                    # spell check the extracted text
                    corrected_text = correct_spelling(extracted_text)

                    output_file_name = f"{os.path.splitext(file)[0]}.txt"
                    output_file_path = os.path.join(output_dir, output_file_name)

                    with open(output_file_path, 'w', encoding='utf-8') as output_txt:
                        output_txt.write(corrected_text)  # write the spell-checked text

                except Exception as e:
                    print(f"Error processing {file}: {str(e)}")

print("OCR processing complete. Text files are saved in:", output_dir)