Skip to content

Commit

Permalink
typing and comments
Browse files Browse the repository at this point in the history
  • Loading branch information
e-gun committed Apr 17, 2019
1 parent be5cfee commit bcc499c
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 200 deletions.
9 changes: 6 additions & 3 deletions builder/corpusbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import time
from multiprocessing import Manager
from os import path
from typing import List

import builder.dbinteraction.dbhelperfunctions
import builder.dbinteraction.dbprepsubstitutions
import builder.parsers.betacodefontshifts
from builder.builderclasses import dbAuthor
from builder.dbinteraction import dbloading
from builder.dbinteraction.connection import setconnection
from builder.dbinteraction.dbhelperfunctions import resetauthorsandworksdbs
Expand Down Expand Up @@ -293,7 +295,7 @@ def addoneauthor(authordict, language, uidprefix, datapath, dataprefix, dbconnec
return success


def thecollectedworksof(authorobject, language, datapath, dbconnection, debugoutput=False, debugnewlines=True, skipdbload=False):
def thecollectedworksof(authorobject: dbAuthor, language: str, datapath: str, dbconnection, debugoutput=False, debugnewlines=True, skipdbload=False):
"""
	give me an authorobject and i will build you a corpus in three stages
[a] initial parsing of original files
Expand Down Expand Up @@ -353,7 +355,7 @@ def buildauthorobject(authortabnumber, language, datapath, uidprefix, dataprefix
return authorobj


def initialworkparsing(authorobject, language, datapath, debugoutput=False, debugnewlines=True):
def initialworkparsing(authorobject, language, datapath, debugoutput=False, debugnewlines=True) -> str:
"""
grab a raw file and start cleaning it up; the order of files and of items within files usually matters
Expand Down Expand Up @@ -402,7 +404,7 @@ def initialworkparsing(authorobject, language, datapath, debugoutput=False, debu
return thetext


def secondaryworkparsing(authorobject, thetext, debugoutput=False, debugnewlines=True):
def secondaryworkparsing(authorobject, thetext: str, debugoutput=False, debugnewlines=True) -> List[str]:
"""
the next big step is turning the datastream into a citeable text
Expand Down Expand Up @@ -434,6 +436,7 @@ def secondaryworkparsing(authorobject, thetext, debugoutput=False, debugnewlines

count = 1
for f in functionlist:
# note that we are shifting types in the course of this: str -> str; str -> list; list -> list
thetext = f(thetext)

if debugoutput:
Expand Down
6 changes: 3 additions & 3 deletions builder/dbinteraction/dbdataintoobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
from builder.dbinteraction.dbhelperfunctions import resultiterator


def dbfetchauthorobject(uid, cursor):
def dbfetchauthorobject(uid, dbcursor):
# only call this AFTER you have built all of the work objects so that they can be placed into it

query = 'SELECT * from authors where universalid = %s'
data = (uid,)
cursor.execute(query, data)
dbcursor.execute(query, data)
try:
results = cursor.fetchone()
results = dbcursor.fetchone()
except:
# note that there is no graceful way out of this: you have to have an authorobject in the end
print('failed to find the requested author:', query, data)
Expand Down
21 changes: 11 additions & 10 deletions builder/parsers/copticsubstitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

import re

def replacecoptic(texttoclean):

def replacecoptic(texttoclean: str) -> str:
"""
:param texttoclean:
Expand All @@ -24,7 +25,7 @@ def replacecoptic(texttoclean):
return cleaned


def copticprobe(match):
def copticprobe(match: re.Match) -> str:
"""
:param match:
Expand All @@ -47,10 +48,10 @@ def copticprobe(match):
return newcoptic


def copticuppercases(val):
def copticuppercases(toreplace: str) -> str:
"""
:param val:
:param toreplace:
:return:
"""

Expand Down Expand Up @@ -89,15 +90,15 @@ def copticuppercases(val):
}

try:
substitute = substitutions[val]
substitute = substitutions[toreplace]
except KeyError:
# print('coptic capital confusion:', val)
substitute = val
# print('coptic capital confusion:', toreplace)
substitute = toreplace

return substitute


def copticlowercases(val):
def copticlowercases(toreplace: str) -> str:

substitutions = {
'A': u'\u2c81',
Expand Down Expand Up @@ -135,10 +136,10 @@ def copticlowercases(val):
}

try:
substitute = substitutions[val]
substitute = substitutions[toreplace]
except KeyError:
# print('coptic lowercase confusion:', val)
substitute = val
substitute = toreplace

return substitute

Expand Down
54 changes: 30 additions & 24 deletions builder/parsers/greekcanonfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import configparser
import re
from typing import List

from builder import file_io
from builder.dbinteraction.dbdataintoobjects import loadallauthorsasobjects, loadallworksasobjects
Expand All @@ -21,11 +22,15 @@
tlg = config['io']['tlg']


def languagesubstitutes(opentag, foundtext, closetag):
def languagesubstitutes(opentag: str, foundtext: str, closetag: str) -> str:
"""
tricky because greek is not turned off properly
:param matchgroup:
:return:
:param opentag:
:param foundtext:
:param closetag:
:return:
"""

clean = re.sub(r'(&1){0,1}\[2', r'⟨', foundtext)
Expand Down Expand Up @@ -56,7 +61,7 @@ def languagesubstitutes(opentag, foundtext, closetag):
return clean


def loadgkcanon(canonfile):
def loadgkcanon(canonfile: str):
"""
	this is surprisingly slow at the end of a build
Expand Down Expand Up @@ -96,7 +101,7 @@ def loadgkcanon(canonfile):
return


def gkcanoncleaner(txt):
def gkcanoncleaner(txt: str) -> str:
"""
tidy up the greek canon file
Expand Down Expand Up @@ -236,7 +241,7 @@ def gkcanoncleaner(txt):
return txt


def temptableauthorline(newauthorinfo, allauthors):
def temptableauthorline(newauthorinfo: str, allauthors: dict) -> List[str]:
"""
prepare a line of author data to send to the temporary table
Expand All @@ -252,26 +257,26 @@ def temptableauthorline(newauthorinfo, allauthors):

try:
a = 'gr' + au.group(1)
except:
except AttributeError:
a = 'gr0000'

short = re.search(sh, newauthorinfo)
try:
s = re.sub('/s+$', '', short.group(1))
s = re.sub(percents, percentsubstitutes, s)
except:
s = ''
except AttributeError:
s = str()

try:
ao = allauthors[a]
name = ao.shortname
except:
name = ''
name = str()

try:
name = re.sub('/s+$', '', name)
except:
name = ''
name = str()

if len(name) < 1:
name = s
Expand Down Expand Up @@ -325,20 +330,21 @@ def temptableworkline(newworkinfo, allworks):

try:
count = int(re.sub(r'[^\d]', '', count.group(1)))
except:
except AttributeError:
# AttributeError: 'NoneType' object has no attribute 'group'
count = -1

try:
n = name.group(1)
if n[0] == '1':
# why are these still here?
n = n[1:]
except:
except AttributeError:
if work.group(1) == '0058w001':
n = 'Poliorcetica' # 0058w001 does not have a name with this version of the parser....: 'wrk &1Poliorcetica&'
else:
# print('no name for',work.group(1))
n = ''
n = str()

if re.search(r'\[Sp\.\]', n) is not None:
authentic = False
Expand All @@ -347,17 +353,17 @@ def temptableworkline(newworkinfo, allworks):

try:
p = pub.group(1)
except:
p = ''
except AttributeError:
p = str()

p = re.sub(percents, percentsubstitutes, p)
p = re.sub(ands, andsubstitutes, p)
p = re.sub(r' $', '', p)

try:
g = genre.group(1)
except:
g = ''
except AttributeError:
g = str()

g = re.sub(percents, percentsubstitutes, g)
g = re.sub(r' $', '', g)
Expand All @@ -366,13 +372,13 @@ def temptableworkline(newworkinfo, allworks):

try:
tr = trans.group(1)
except:
tr = ''
except AttributeError:
tr = str()

try:
wt = wtype.group(1)
except:
wt = ''
except AttributeError:
wt = str()

wt = re.sub(percents, percentsubstitutes, wt)
wt = re.sub(r' $', '', wt)
Expand Down Expand Up @@ -421,7 +427,7 @@ def temptableworkline(newworkinfo, allworks):
return newdata


def peekatcanon(workdbname):
def peekatcanon(workdbname: str) -> List[str]:
"""
an emergency appeal to the canon for a work's structure
Expand All @@ -441,7 +447,7 @@ def peekatcanon(workdbname):
# careful - structure set to {0: 'Volumépagéline'} [gr0598]
allauthors = loadallauthorsasobjects()
txt = gkcanoncleaner(txt, allauthors)
structure = []
structure = list()
for line in txt:
if line[0:6] == '\t<work':
if re.search(workdbname[2:],line) is not None:
Expand Down
4 changes: 2 additions & 2 deletions builder/parsers/latinsubstitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import re


def latindiacriticals(texttoclean):
def latindiacriticals(texttoclean: str) -> str:
"""
find text with latin diacritical marks
Expand All @@ -26,7 +26,7 @@ def latindiacriticals(texttoclean):
return texttoclean


def latinsubstitutes(matchgroup):
def latinsubstitutes(matchgroup) -> str:

val = matchgroup.group(0)

Expand Down
10 changes: 5 additions & 5 deletions builder/parsers/lexica.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# lexica parser helpers
#

def greekwithoutvowellengths(betagreek):
def greekwithoutvowellengths(betagreek: str) -> str:
"""
quick vowel len stripper that then sends you to greek conversion
Expand Down Expand Up @@ -60,7 +60,7 @@ def greekwithvowellengths(ttc):
return ttc


def latinvowellengths(texttoclean):
def latinvowellengths(texttoclean: str) -> str:
"""
now you have a new problem: matching vowel lengths when the TXT files do not have that information
only send items this way whose transformation will prevent successful searches
Expand Down Expand Up @@ -93,14 +93,14 @@ def latinvowellengths(texttoclean):
return texttoclean


def betaconvertandsave(convertme):
def betaconvertandsave(convertme: re.Match) -> str:
	"""
	turn the betacode greek captured by a regex match into unicode greek
	and reattach the trailing non-greek portion untouched

	:param convertme: match whose group(1) is betacode greek and group(2) is non-greek text
	:return: unicode greek followed by the non-greek remainder
	"""
	greekportion = convertme.group(1)
	nongreektail = convertme.group(2)
	# betacode conversion expects uppercase input
	return replacegreekbetacode(greekportion.upper()) + nongreektail


def lsjgreekswapper(match):
def lsjgreekswapper(match: re.Match) -> str:
"""
greekfinder in mpgreekdictionaryinsert() will find 5 things:
match1 + match 3 + match 4 reassembles the markup block
Expand Down Expand Up @@ -128,7 +128,7 @@ def lsjgreekswapper(match):
return substitute


def translationsummary(fullentry, translationlabel):
def translationsummary(fullentry: str, translationlabel: str) -> str:
"""
sample:
Expand Down
Loading

0 comments on commit bcc499c

Please sign in to comment.