Skip to content

Commit

Permalink
typing and comments
Browse files Browse the repository at this point in the history
  • Loading branch information
e-gun committed Apr 17, 2019
1 parent be5cfee commit bcc499c
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 200 deletions.
9 changes: 6 additions & 3 deletions builder/corpusbuilder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import time
from multiprocessing import Manager
from os import path
from typing import List

import builder.dbinteraction.dbhelperfunctions
import builder.dbinteraction.dbprepsubstitutions
import builder.parsers.betacodefontshifts
from builder.builderclasses import dbAuthor
from builder.dbinteraction import dbloading
from builder.dbinteraction.connection import setconnection
from builder.dbinteraction.dbhelperfunctions import resetauthorsandworksdbs
Expand Down Expand Up @@ -293,7 +295,7 @@ def addoneauthor(authordict, language, uidprefix, datapath, dataprefix, dbconnec
return success


def thecollectedworksof(authorobject, language, datapath, dbconnection, debugoutput=False, debugnewlines=True, skipdbload=False):
def thecollectedworksof(authorobject: dbAuthor, language: str, datapath: str, dbconnection, debugoutput=False, debugnewlines=True, skipdbload=False):
"""
	give me an authorobject and i will build you a corpus in three stages
[a] initial parsing of original files
Expand Down Expand Up @@ -353,7 +355,7 @@ def buildauthorobject(authortabnumber, language, datapath, uidprefix, dataprefix
return authorobj


def initialworkparsing(authorobject, language, datapath, debugoutput=False, debugnewlines=True):
def initialworkparsing(authorobject, language, datapath, debugoutput=False, debugnewlines=True) -> str:
"""
grab a raw file and start cleaning it up; the order of files and of items within files usually matters
Expand Down Expand Up @@ -402,7 +404,7 @@ def initialworkparsing(authorobject, language, datapath, debugoutput=False, debu
return thetext


def secondaryworkparsing(authorobject, thetext, debugoutput=False, debugnewlines=True):
def secondaryworkparsing(authorobject, thetext: str, debugoutput=False, debugnewlines=True) -> List[str]:
"""
the next big step is turning the datastream into a citeable text
Expand Down Expand Up @@ -434,6 +436,7 @@ def secondaryworkparsing(authorobject, thetext, debugoutput=False, debugnewlines

count = 1
for f in functionlist:
# note that we are shifting types in the course of this: str -> str; str -> list; list -> list
thetext = f(thetext)

if debugoutput:
Expand Down
6 changes: 3 additions & 3 deletions builder/dbinteraction/dbdataintoobjects.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@
from builder.dbinteraction.dbhelperfunctions import resultiterator


def dbfetchauthorobject(uid, cursor):
def dbfetchauthorobject(uid, dbcursor):
# only call this AFTER you have built all of the work objects so that they can be placed into it

query = 'SELECT * from authors where universalid = %s'
data = (uid,)
cursor.execute(query, data)
dbcursor.execute(query, data)
try:
results = cursor.fetchone()
results = dbcursor.fetchone()
except:
# note that there is no graceful way out of this: you have to have an authorobject in the end
print('failed to find the requested author:', query, data)
Expand Down
21 changes: 11 additions & 10 deletions builder/parsers/copticsubstitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

import re

def replacecoptic(texttoclean):

def replacecoptic(texttoclean: str) -> str:
"""
:param texttoclean:
Expand All @@ -24,7 +25,7 @@ def replacecoptic(texttoclean):
return cleaned


def copticprobe(match):
def copticprobe(match: re.Match) -> str:
"""
:param match:
Expand All @@ -47,10 +48,10 @@ def copticprobe(match):
return newcoptic


def copticuppercases(val):
def copticuppercases(toreplace: str) -> str:
"""
:param val:
:param toreplace:
:return:
"""

Expand Down Expand Up @@ -89,15 +90,15 @@ def copticuppercases(val):
}

try:
substitute = substitutions[val]
substitute = substitutions[toreplace]
except KeyError:
# print('coptic capital confusion:', val)
substitute = val
# print('coptic capital confusion:', toreplace)
substitute = toreplace

return substitute


def copticlowercases(val):
def copticlowercases(toreplace: str) -> str:

substitutions = {
'A': u'\u2c81',
Expand Down Expand Up @@ -135,10 +136,10 @@ def copticlowercases(val):
}

try:
substitute = substitutions[val]
substitute = substitutions[toreplace]
except KeyError:
# print('coptic lowercase confusion:', val)
substitute = val
substitute = toreplace

return substitute

Expand Down
54 changes: 30 additions & 24 deletions builder/parsers/greekcanonfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import configparser
import re
from typing import List

from builder import file_io
from builder.dbinteraction.dbdataintoobjects import loadallauthorsasobjects, loadallworksasobjects
Expand All @@ -21,11 +22,15 @@
tlg = config['io']['tlg']


def languagesubstitutes(opentag, foundtext, closetag):
def languagesubstitutes(opentag: str, foundtext: str, closetag: str) -> str:
"""
tricky because greek is not turned off properly
:param matchgroup:
:return:
:param opentag:
:param foundtext:
:param closetag:
:return:
"""

clean = re.sub(r'(&1){0,1}\[2', r'⟨', foundtext)
Expand Down Expand Up @@ -56,7 +61,7 @@ def languagesubstitutes(opentag, foundtext, closetag):
return clean


def loadgkcanon(canonfile):
def loadgkcanon(canonfile: str):
"""
	this is surprisingly slow at the end of a build
Expand Down Expand Up @@ -96,7 +101,7 @@ def loadgkcanon(canonfile):
return


def gkcanoncleaner(txt):
def gkcanoncleaner(txt: str) -> str:
"""
tidy up the greek canon file
Expand Down Expand Up @@ -236,7 +241,7 @@ def gkcanoncleaner(txt):
return txt


def temptableauthorline(newauthorinfo, allauthors):
def temptableauthorline(newauthorinfo: str, allauthors: dict) -> List[str]:
"""
prepare a line of author data to send to the temporary table
Expand All @@ -252,26 +257,26 @@ def temptableauthorline(newauthorinfo, allauthors):

try:
a = 'gr' + au.group(1)
except:
except AttributeError:
a = 'gr0000'

short = re.search(sh, newauthorinfo)
try:
s = re.sub('/s+$', '', short.group(1))
s = re.sub(percents, percentsubstitutes, s)
except:
s = ''
except AttributeError:
s = str()

try:
ao = allauthors[a]
name = ao.shortname
except:
name = ''
name = str()

try:
name = re.sub('/s+$', '', name)
except:
name = ''
name = str()

if len(name) < 1:
name = s
Expand Down Expand Up @@ -325,20 +330,21 @@ def temptableworkline(newworkinfo, allworks):

try:
count = int(re.sub(r'[^\d]', '', count.group(1)))
except:
except AttributeError:
# AttributeError: 'NoneType' object has no attribute 'group'
count = -1

try:
n = name.group(1)
if n[0] == '1':
# why are these still here?
n = n[1:]
except:
except AttributeError:
if work.group(1) == '0058w001':
n = 'Poliorcetica' # 0058w001 does not have a name with this version of the parser....: 'wrk &1Poliorcetica&'
else:
# print('no name for',work.group(1))
n = ''
n = str()

if re.search(r'\[Sp\.\]', n) is not None:
authentic = False
Expand All @@ -347,17 +353,17 @@ def temptableworkline(newworkinfo, allworks):

try:
p = pub.group(1)
except:
p = ''
except AttributeError:
p = str()

p = re.sub(percents, percentsubstitutes, p)
p = re.sub(ands, andsubstitutes, p)
p = re.sub(r' $', '', p)

try:
g = genre.group(1)
except:
g = ''
except AttributeError:
g = str()

g = re.sub(percents, percentsubstitutes, g)
g = re.sub(r' $', '', g)
Expand All @@ -366,13 +372,13 @@ def temptableworkline(newworkinfo, allworks):

try:
tr = trans.group(1)
except:
tr = ''
except AttributeError:
tr = str()

try:
wt = wtype.group(1)
except:
wt = ''
except AttributeError:
wt = str()

wt = re.sub(percents, percentsubstitutes, wt)
wt = re.sub(r' $', '', wt)
Expand Down Expand Up @@ -421,7 +427,7 @@ def temptableworkline(newworkinfo, allworks):
return newdata


def peekatcanon(workdbname):
def peekatcanon(workdbname: str) -> List[str]:
"""
an emergency appeal to the canon for a work's structure
Expand All @@ -441,7 +447,7 @@ def peekatcanon(workdbname):
# careful - structure set to {0: 'Volumépagéline'} [gr0598]
allauthors = loadallauthorsasobjects()
txt = gkcanoncleaner(txt, allauthors)
structure = []
structure = list()
for line in txt:
if line[0:6] == '\t<work':
if re.search(workdbname[2:],line) is not None:
Expand Down
4 changes: 2 additions & 2 deletions builder/parsers/latinsubstitutions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import re


def latindiacriticals(texttoclean):
def latindiacriticals(texttoclean: str) -> str:
"""
find text with latin diacritical marks
Expand All @@ -26,7 +26,7 @@ def latindiacriticals(texttoclean):
return texttoclean


def latinsubstitutes(matchgroup):
def latinsubstitutes(matchgroup) -> str:

val = matchgroup.group(0)

Expand Down
10 changes: 5 additions & 5 deletions builder/parsers/lexica.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# lexica parser helpers
#

def greekwithoutvowellengths(betagreek):
def greekwithoutvowellengths(betagreek: str) -> str:
"""
quick vowel len stripper that then sends you to greek conversion
Expand Down Expand Up @@ -60,7 +60,7 @@ def greekwithvowellengths(ttc):
return ttc


def latinvowellengths(texttoclean):
def latinvowellengths(texttoclean: str) -> str:
"""
now you have a new problem: matching vowel lengths when the TXT files do not have that information
only send items this way whose transformation will prevent successful searches
Expand Down Expand Up @@ -93,14 +93,14 @@ def latinvowellengths(texttoclean):
return texttoclean


def betaconvertandsave(convertme):
def betaconvertandsave(convertme: re.Match) -> str:
	"""
	turn the betacode greek captured by a regex match into unicode greek
	and reattach the trailing non-greek portion untouched

	:param convertme: match whose group(1) is betacode greek and group(2) is non-greek text
	:return: unicode greek followed by the non-greek remainder
	"""
	greekportion = convertme.group(1)
	nongreektail = convertme.group(2)
	# betacode conversion expects uppercase input
	return replacegreekbetacode(greekportion.upper()) + nongreektail


def lsjgreekswapper(match):
def lsjgreekswapper(match: re.Match) -> str:
"""
greekfinder in mpgreekdictionaryinsert() will find 5 things:
match1 + match 3 + match 4 reassembles the markup block
Expand Down Expand Up @@ -128,7 +128,7 @@ def lsjgreekswapper(match):
return substitute


def translationsummary(fullentry, translationlabel):
def translationsummary(fullentry: str, translationlabel: str) -> str:
"""
sample:
Expand Down
Loading

0 comments on commit bcc499c

Please sign in to comment.