Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring the refactoring #10

Merged
merged 8 commits into from
May 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added portability/__init__.py
Empty file.
File renamed without changes.
4 changes: 2 additions & 2 deletions wordstats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from .word_info import WordInfo
from .language_info import LanguageInfo
from .cognate_info import CognateInfo
from .getchunix import _Getch
from .edit_distance_function_factory import WordDistanceFactory
from portability.getchunix import _Getch
from .edit_distance_function_factory import WordDistance

# Create all tables in the engine. equivalent to "Create Table" in SQL
Base.metadata.create_all(BaseService.engine)
Expand Down
16 changes: 0 additions & 16 deletions wordstats/cognate_automatic_evaluation.py

This file was deleted.

67 changes: 0 additions & 67 deletions wordstats/cognate_files_path.py

This file was deleted.

56 changes: 32 additions & 24 deletions wordstats/cognate_info.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,22 @@
import codecs
import os
from datetime import datetime

from python_translators.translators.glosbe_translator import Translator
from python_translators.translators.glosbe_over_tor_translator import GlosbeOverTorTranslator
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@joelgrondman - note that many of these imports were removed automatically by Python when refactoring the code, since they are not used in the corresponding file.

With SQLAlchemy, I have in the past had code that requires a class to be loaded even if it is not used directly in that file. I hope this is not the case anywhere here!

from python_translators.translation_query import TranslationQuery

from sqlalchemy import Table
import configparser

from wordstats.loading_from_hermit import load_language_from_hermit
from wordstats.file_handling.file_operations import *
from wordstats.file_handling.loading_from_hermit import *
from wordstats.file_handling.cognate_files_path import *

from .word_info import WordInfo, UnknownWordInfo
from .utils.mem_footprint import total_size
from .base_service import BaseService, Base
from .config import MAX_WORDS, SEPARATOR_PRIMARY, SEPARATOR_SECONDARY
from .metrics_computers import *
from .cognate_files_path import *
from .base_service import BaseService
from .config import SEPARATOR_PRIMARY, SEPARATOR_SECONDARY

from .cognate_db import *
from .getchunix import read_single_keypress
from .edit_distance_function_factory import WordDistanceFactory
from .edit_distance_function_factory import WordDistance
from collections import defaultdict

from time import sleep


class CognateInfo(object):
"""
Expand All @@ -37,7 +30,7 @@ class CognateInfo(object):

"""

def __init__(self, primary, secondary, distance_computer_class: WordDistanceFactory, author:str = ""):
def __init__(self, primary, secondary, distance_computer_class: WordDistance, author:str = ""):
"""

either load from file, or compute if needed
Expand All @@ -62,10 +55,15 @@ def best_guess(self):
return best_dict

# generates candidates based on distance function func and word lists
# def apply_distance_metric(self, wordlist1, wordlist2, func):
def generate_candidates(self):
"""
Generates candidates by comparing each possible pair of words in two
language lists

def compute(self):
...

:return:
"""
wordlist1 = list(load_language_from_hermit(self.primary).word_info_dict.keys())
wordlist2 = list(load_language_from_hermit(self.secondary).word_info_dict.keys())

Expand All @@ -76,9 +74,20 @@ def compute(self):
if self.distance_computer.is_candidate(w1, w2):
self.candidates[w1].append(w2)


def compute_translator(self, translator:Translator, save:Boolean = False):

# generate candidates and automatically evaluates candidates to be cognates
# optionally save candidates to database as they are found
def generate_candidates_translator(self, translator:Translator, save:Boolean = False):
"""
Generates candidates by translating words through an API
the translations are stored in candidates
automatically evaluates candidates to be cognates
optionally save candidates to database as they are found

...
:param: A translator. See python_translators repository for options
:param: Boolean for toggling saving candidate/evaluation after each translation
:return:
"""
wordlist = set(load_language_from_hermit(self.primary).word_info_dict.keys())

translator = translator(source_language=self.primary, target_language=self.secondary)
Expand Down Expand Up @@ -134,7 +143,7 @@ def add_to_blacklist(self, primaryWord, secondaryWord):
# ========================

@classmethod
def load_cached(cls, primary, secondary, distance_computer_class: WordDistanceFactory, author:str = ""):
def load_cached(cls, primary, secondary, distance_computer_class: WordDistance, author:str = ""):

new_registry = cls.load_from_db(primary, secondary,
distance_computer_class, author)
Expand All @@ -151,7 +160,6 @@ def load_cached(cls, primary, secondary, distance_computer_class: WordDistanceFa

new_registry = cls(primary, secondary,
distance_computer_class, author)
new_registry.compute() # compute candidates

return new_registry

Expand Down Expand Up @@ -326,7 +334,7 @@ def add_candidate_to_db(self, primaryWord, secondaryWord):
Try to add one cognate pair to candidate db.

:param primaryWord: word from primary language
:param secondaryWord: word from secundary language
:param secondaryWord: word from secondary language
:return: None
"""
try:
Expand All @@ -343,7 +351,7 @@ def add_to_db(self, primaryWord, secondaryWord, whitelist: Boolean):
Try to add one cognate pair to evaluation db.

:param primaryWord: word from primary language
:param secondaryWord: word from secundary language
:param secondaryWord: word from secondary language
:param whitelist: True to add cognate pair to whitelist otherwise blacklist
:return: None
"""
Expand Down
10 changes: 5 additions & 5 deletions wordstats/edit_distance.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .edit_distance_function_factory import WordDistanceFactory
from .edit_distance_function_factory import WordDistance
from nltk.metrics.distance import edit_distance

class LanguageAwareEditDistance(WordDistanceFactory):
class EditDistance(WordDistance):
def __init__(self, primary, secondary, author:str = ""):
super().__init__(primary, secondary, author)
self.method_name = "edit_distance"
Expand All @@ -10,12 +10,12 @@ def __init__(self, primary, secondary, author:str = ""):

def _initialize_distances(self):

# these might change based on the primary secondayr
# these might change based on the primary secondary
self.replace_distance = 1
self.add_distance = 1

def edit_distance_function(self, word1: str, word2: str):

lengthLongest = max(len(word1),len(word2))
length_longest = max(len(word1),len(word2))

return edit_distance(word1, word2)/lengthLongest
return edit_distance(word1, word2)/length_longest
7 changes: 2 additions & 5 deletions wordstats/edit_distance_absolute.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import codecs
import configparser
from .cognate_files_path import *
from .edit_distance_function_factory import WordDistanceFactory
from .edit_distance_function_factory import WordDistance

class WordDistanceAbsolute(WordDistanceFactory):
class WordDistanceAbsolute(WordDistance):
def __init__(self, primary, secondary, author:str = ""):
super().__init__(primary, secondary, author)
self.replace_distance = 2
Expand Down
6 changes: 3 additions & 3 deletions wordstats/edit_distance_function_factory.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from wordstats.cognate_files_path import *
import configparser
from wordstats.file_handling.cognate_files_path import *
from wordstats.file_handling.file_operations import *
from .rules_db import TransformRules
from sqlalchemy import Table
from .base_service import BaseService, Base
Expand All @@ -9,7 +9,7 @@
# abstract class for creating a distance measure between two rules
# support for loading from .cfg file and loading rules is supplied
# abstract methods are specific to the distance measure
class WordDistanceFactory(ABC):
class WordDistance(ABC):

def __init__(self, primary, secondary, author:str = ""):
super().__init__()
Expand Down
7 changes: 2 additions & 5 deletions wordstats/edit_distance_overlap.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import codecs
import configparser
from .cognate_files_path import *
from .edit_distance_function_factory import WordDistanceFactory
from .edit_distance_function_factory import WordDistance

class WordDistanceOverlap(WordDistanceFactory):
class WordDistanceOverlap(WordDistance):
def __init__(self, primary, secondary, author:str = ""):
super().__init__(primary, secondary, author)
self.method_name = "overlap"
Expand Down
36 changes: 0 additions & 36 deletions wordstats/edit_distance_translate.py

This file was deleted.

Empty file.
38 changes: 38 additions & 0 deletions wordstats/file_handling/cognate_files_path.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from wordstats.config import DATA_FOLDER_COGNATES, WHITELIST, BLACKLIST, RULES, CANDIDATES
import os


def path_of_cognate_languages(primary, secondary):
    """
    Build the path of the data folder for one language pair.

    The path starts at the parent of the directory containing this module
    (spelled with a literal ".." segment, not normalised), followed by
    DATA_FOLDER_COGNATES and a folder named by concatenating both
    language identifiers. Nothing is created on disk.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :return: the folder path as a string, without a trailing separator
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    pair_folder = "{0}{1}".format(primary, secondary)
    segments = (
        module_dir,
        "..",
        "{0}".format(DATA_FOLDER_COGNATES),
        pair_folder,
    )
    return os.sep.join(segments)


def path_of_cognate_candidates(primary, secondary, method_name):
    """
    Return the path of the candidates file for a language pair and method.

    Delegates to _path_to_cognate_file with CANDIDATES as the file name,
    so the parent directories are created as a side effect.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :param method_name: name of the distance method, used as a sub-folder
    :return: path of the candidates ".txt" file
    """
    candidates_path = _path_to_cognate_file(
        primary,
        secondary,
        file_name=CANDIDATES,
        method_name=method_name,
    )
    return candidates_path


def path_of_cognate_blacklist(primary, secondary, author: str = ""):
    """
    Return the path of the blacklist file for a language pair.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :param author: optional author name; when non-empty it is appended to
        the file name as "_<author>"
    :return: path of the blacklist ".txt" file
    """
    file_name = BLACKLIST
    if len(author) > 0:
        file_name = BLACKLIST + "_" + author
    return _path_to_cognate_file(primary, secondary, file_name)


def path_of_cognate_whitelist(primary, secondary, author: str = ""):
    """
    Return the path of the whitelist file for a language pair.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :param author: optional author name; when non-empty it is appended to
        the file name as "_<author>"
    :return: path of the whitelist ".txt" file
    """
    file_name = WHITELIST
    if len(author) > 0:
        file_name = WHITELIST + "_" + author
    return _path_to_cognate_file(primary, secondary, file_name)


def path_of_cognate_rules(primary, secondary, method_name, author: str = ""):
    """
    Return the path of the rules file for a language pair and method.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :param method_name: name of the distance method, used as a sub-folder
    :param author: optional author name; when non-empty it is appended to
        the file name as "_<author>"
    :return: path of the rules ".txt" file
    """
    file_name = RULES
    if len(author) > 0:
        file_name = RULES + "_" + author
    return _path_to_cognate_file(primary, secondary, file_name, method_name)


def _path_to_cognate_file(primary, secondary, file_name, method_name=""):
    """
    Build the path of a cognate data file and make sure its folder exists.

    The file lives in the language-pair folder returned by
    path_of_cognate_languages, optionally inside a sub-folder named after
    the distance method, and always gets a ".txt" extension.

    :param primary: identifier of the primary language
    :param secondary: identifier of the secondary language
    :param file_name: base name of the file, without extension
    :param method_name: optional sub-folder name; an empty (or None) value
        places the file directly in the language-pair folder
    :return: the full path of the file as a string
    """
    file_path = path_of_cognate_languages(primary, secondary) + os.sep
    # Test by value, not identity: `is not ""` relies on string interning
    # and raises a SyntaxWarning on modern Python.
    if method_name:
        file_path += method_name + os.sep
    file_path += file_name + ".txt"

    # Side effect: create any missing parent directories so callers can
    # open the returned path for writing straight away.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    return file_path
26 changes: 26 additions & 0 deletions wordstats/file_handling/file_operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import codecs


# useful methods for handling files

def load_from_path(path):
    """
    Return the UTF-8 decoded content of the file at ``path``.

    If the file does not exist, a notice is printed, an empty file is
    created at ``path`` and an empty string is returned.

    :param path: path of the file to read, as a string
    :return: the whole file content as a string ("" for a missing file)
    """
    try:
        with codecs.open(path, encoding="utf8") as file:
            return file.read()

    except FileNotFoundError:
        print(path + " not found, creating empty file.")
        # Create the file inside a context manager so the handle is closed
        # immediately instead of being leaked until garbage collection.
        with codecs.open(path, encoding="utf8", mode="w"):
            pass
        # A freshly created file is empty; no need to read it back.
        return ""


def save_to_file(path, content):
    """
    Write ``content`` to the file at ``path`` as UTF-8.

    The file is opened in "w" mode, so any existing content is replaced.

    :param path: path of the file to write
    :param content: text to store in the file
    :return: None
    """
    with codecs.open(path, mode="w", encoding="utf8") as target:
        target.write(content)


def append_to_file(path, content):
    """
    Append ``content`` followed by a newline to the file at ``path``.

    The file is opened in "a" mode with UTF-8 encoding, so it is created
    if it does not exist and existing content is kept.

    :param path: path of the file to append to
    :param content: a single line of text, without trailing newline
    :return: None
    """
    with codecs.open(path, encoding="utf8", mode="a") as words_file:
        # write() emits the string in one call; writelines() on a str
        # would iterate over it character by character.
        words_file.write(content + "\n")
Loading