"""Consolidate English -> Tamil dictionary sources into one TSV of word pairs.

Running this script reads ``raw/eng2tamildictionary/dictionary.json``, cleans
each entry, counts how often every (English word, Tamil word) pair occurs and
writes the result to ``consolidated.tsv`` with the columns ENG, LANG, COUNT.
"""
# NOTE: this script was introduced by a patch that also added:
#   .gitignore entries:  *.pyc  .vs/  tmp/  *.tsv
#   git submodules (path, url, pinned commit):
#     raw/eng2tamildictionary
#         https://github.com/linuxkathirvel/eng2tamildictionary
#         a4e06953d6b4d25cb91a34fbf0fa91561e6241dd
#     raw/english-tamil-dictionary-api
#         https://github.com/abuvanth/english-tamil-dictionary-api
#         6c2ec06492b4680ba6c95d8449a91dd3a2b43251
#     raw/dictionary
#         https://github.com/sathia27/dictionary
#         6f504935d6fb3b1c0727018159fef9fe65a33201

import json
import re

import pandas as pd  # only referenced by the disabled sqlite source below

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration.
    def tqdm(iterable, *_args, **_kwargs):
        return iterable

SOURCE_JSON = 'raw/eng2tamildictionary/dictionary.json'
OUTPUT_TSV = 'consolidated.tsv'

# Patterns are compiled once and written as raw strings so that ``\(`` and
# ``\.`` are regex escapes rather than (invalid) string escapes.
_BRACKETED = re.compile(r'\([^)]*?\)')   # "(...)" annotations
_LATIN = re.compile(r'[A-Za-z]\.?')      # English letters inside the Tamil text
_NUMBERED = re.compile(r'-[1-9]')        # numbered bullets such as "-1"

# Maps (english_word, lang_word) -> number of times the pair was seen.
en_lang_pairs_count = {}


def add_pair(en_word, lang_word):
    """Record one occurrence of an (English, Tamil) word pair.

    Both words have double quotes removed, surrounding whitespace stripped and
    trailing periods dropped; the English word is also lower-cased. The pair
    is ignored when either argument is not a string or when either cleaned
    word is shorter than two characters.

    Args:
        en_word: English headword.
        lang_word: Translation in the target language.

    Returns:
        None. The count is accumulated in ``en_lang_pairs_count``.
    """
    if not isinstance(en_word, str) or not isinstance(lang_word, str):
        return
    # Strip again after removing periods so "cat ." and "cat" share one key.
    en_word = en_word.replace('"', '').strip().lower().rstrip('.').strip()
    lang_word = lang_word.replace('"', '').strip().rstrip('.').strip()
    if len(en_word) < 2 or len(lang_word) < 2:
        return
    key = (en_word, lang_word)
    en_lang_pairs_count[key] = en_lang_pairs_count.get(key, 0) + 1


def _clean_record(en_word, lang_words):
    """Strip annotations from one record and split its translations on commas."""
    en_word = _BRACKETED.sub('', en_word)
    lang_words = _BRACKETED.sub('', lang_words)
    lang_words = _LATIN.sub(' ', lang_words)
    lang_words = _NUMBERED.sub(' ', lang_words)
    return en_word, lang_words.split(',')


def load_eng2tamildictionary(path=SOURCE_JSON):
    """Add every pair found in the eng2tamildictionary JSON file at *path*.

    Source: https://github.com/linuxkathirvel/eng2tamildictionary

    Raises:
        KeyError / TypeError: if a record has no "eng"/"tamil" entries and
            does not contain 'word_list' either.
    """
    # Use a context manager so the input file is always closed.
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    for record in tqdm(data, desc='eng2tamildictionary'):
        try:
            en_word, lang_words = record["eng"], record["tamil"]
        except (KeyError, TypeError):
            # Records that carry 'word_list' instead of a pair are skipped;
            # anything else is unexpected and is re-raised.
            if 'word_list' in record:
                continue
            raise

        en_word, translations = _clean_record(en_word, lang_words)
        for lang_word in translations:
            add_pair(en_word, lang_word)


# https://github.com/abuvanth/english-tamil-dictionary-api
# Same data as eng2tamildictionary, so it is ignored.

# https://github.com/sathia27/dictionary
# Bad format. TODO: Clean and parse

# import sqlite3, os
# con = sqlite3.connect('raw/dictionary/word.db', isolation_level=None, detect_types=sqlite3.PARSE_COLNAMES)

# df = pd.read_sql_query("SELECT * FROM words", con)
# os.makedirs('tmp', exist_ok=True)
# df.to_csv("tmp/dict.csv")


def write_tsv(path=OUTPUT_TSV):
    """Write all collected pairs and their counts to *path* as TSV."""
    with open(path, 'w', encoding='utf-8') as out:
        out.write("ENG\tLANG\tCOUNT\n")
        for (en_word, lang_word), count in en_lang_pairs_count.items():
            out.write(f"{en_word}\t{lang_word}\t{count}\n")


def main():
    """Load every enabled source and write the consolidated TSV."""
    load_eng2tamildictionary()
    write_tsv()


if __name__ == "__main__":
    main()