Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Warning cleanup. #42

Merged
merged 9 commits into from
Jun 20, 2019
14 changes: 4 additions & 10 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,6 @@ before_install:
jython -c "print ''";
jython -c "import sys; print sys.version"
fi
if [[ $TRAVIS_OS_NAME == 'osx' ]]; then
brew update
brew upgrade
brew upgrade python
brew install python3
python3 --version
fi

install:
- |
Expand All @@ -55,7 +48,8 @@ install:
fi
$PIP install wheel
$PIP install setuptools
$PIP install ply pep8 mako
$PIP install ply mako
$PIP install pycodestyle
if [ "$MYPYTHON" != "jython" ]; then
$PIP install --upgrade pytest pytest-cov codecov
fi
Expand All @@ -67,12 +61,12 @@ script:
$MYPYTHON -c "from pyoracc import _generate_parsetab; _generate_parsetab()"
echo "Running tests"
if [ "$MYPYTHON" == "jython" ]; then
py.test
pytest
else
pytest --cov=pyoracc
fi

- pep8 --exclude=parsetab.py .
- pycodestyle --exclude=parsetab.py

after_success:
- |
Expand Down
74 changes: 37 additions & 37 deletions pyoracc/atf/common/atflex.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,16 @@ def resolve_keyword(self, value, source, fallback=None, extra=None):

states = AtfLexicon.STATES

t_AMPERSAND = r'\&'
t_HASH = r'\#'
t_EXCLAIM = r'\!'
t_AMPERSAND = r'&'
t_HASH = r'#'
t_EXCLAIM = r'!'
t_QUERY = r'\?'
t_STAR = r'\*'
t_DOLLAR = r'\$'
t_MINUS = r'\-'
t_FROM = r'\<\<'
t_TO = r'\>\>'
t_COMMA = r'\,'
t_MINUS = r'-'
t_FROM = r'<<'
t_TO = r'>>'
t_COMMA = r','
t_PARBAR = r'\|\|'

t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)'
Expand All @@ -88,22 +88,22 @@ def t_INITIAL_transctrl_WHITESPACE(self, t):
# NO TOKEN

def t_MULTILINGUAL(self, t):
r'\=\='
r'=='
t.lexer.push_state("text")
return t

def t_EQUALBRACE(self, t):
r'^\=\{'
r'^=\{'
t.lexer.push_state('text')
return t

def t_EQUALS(self, t):
r'\='
r'='
t.lexer.push_state('flagged')
return t

def t_INITIAL_parallel_labeled_COMMENT(self, t):
r'^\#+(?![a-zA-Z]+\:)'
r'^#+(?![a-zA-Z]+:)'
# Negative lookahead to veto protocols as comments
t.lexer.push_state('absorb')
return t
Expand All @@ -121,7 +121,7 @@ def t_NEWLINE(self, t):
return t

def t_INITIAL_parallel_labeled_ATID(self, t):
r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
r'^@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
t.value = t.value[1:]
t.lexpos += 1
t.type = self.resolve_keyword(t.value,
Expand Down Expand Up @@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t):
return t

def t_labeled_OPENR(self, t):
r'\@\('
r'@\('
t.lexer.push_state("para")
t.lexer.push_state("transctrl")
return t

def t_INITIAL_parallel_labeled_HASHID(self, t):
r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
r'#[a-zA-Z][a-zA-Z0-9\[\]]+:'
# Note that \:? absorbs a trailing colon in protocol keywords
t.value = t.value[1:-1]
t.lexpos += 1
Expand Down Expand Up @@ -213,19 +213,19 @@ def t_INITIAL_parallel_labeled_HASHID(self, t):
return t

def t_LINELABEL(self, t):
r'^[^\ \t\n]*\.'
r'^[^ \t\n]*\.'
t.value = t.value[:-1]
t.lexer.push_state('text')
return t

def t_score_SCORELABEL(self, t):
r'^[^.:\ \t\#][^.:\ \t]*\:'
r'^[^.: \t#][^.: \t]*:'
t.value = t.value[:-1]
t.lexer.push_state('text')
return t

def t_ID(self, t):
u'[a-zA-Z0-9][a-zA-Z\'\u2019\xb4\/\.0-9\:\-\[\]_\u2080-\u2089]*'
r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\xb4\u2080-\u2089]*'
t.value = t.value.replace(u'\u2019', "'")
t.value = t.value.replace(u'\xb4', "'")
t.type = self.resolve_keyword(t.value,
Expand Down Expand Up @@ -271,7 +271,7 @@ def t_flagged_text_lemmatize_transctrl_nonequals_absorb_NEWLINE(self, t):
# Unicode 2032 is PRIME
# All of these could be used as prime
def t_transctrl_ID(self, t):
u'[a-zA-Z0-9][a-zA-Z\'\u2019\u2032\u02CA\xb4\/\.0-9\:\-\[\]_' \
r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\u2032\u02CA\xb4' \
u'\u2080-\u2089]*'
t.value = t.value.replace(u'\u2019', "'")
t.value = t.value.replace(u'\u2032', "'")
Expand Down Expand Up @@ -306,7 +306,7 @@ def t_transctrl_ID(self, t):
t_parallel_QUERY = r'\?'

def t_parallel_LINELABEL(self, t):
r'^([^\.\ \t]*)\.[\ \t]*'
r'^([^. \t]*)\.[ \t]*'
t.value = t.value.strip(" \t.")
return t

Expand All @@ -315,7 +315,7 @@ def t_parallel_labeled_DOLLAR(self, t):
t.lexer.push_state("absorb")
return t

t_transctrl_MINUS = r'\-\ '
t_transctrl_MINUS = r'- '

def t_transctrl_CLOSER(self, t):
r'\)'
Expand Down Expand Up @@ -347,12 +347,12 @@ def t_labeled_NEWLINE(self, t):
# Flag characters (#! etc ) don't apply in translations
# But reference anchors ^1^ etc do.
# lines beginning with a space are continuations
white = r'[\ \t]*'
white = r'[ \t]*'
# translation_regex1 and translation_regex2 are identical apart from the
# fact that the first character may not be a ?
# We are looking for a string that does not start with ? it may include
# newlines if they are followed by a whitespace.
translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))'
translation_regex1 = r'([^?\^\n\r]|([\n\r](?=[ \t])))'
translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*'
translation_regex = white + translation_regex1 + translation_regex2 + white

Expand All @@ -366,7 +366,7 @@ def t_parallel_interlinear_ID(self, t):
return t

def t_parallel_labeled_AMPERSAND(self, t):
r'\&'
r'&'
# New document, so leave translation state
t.lexer.pop_state()
return t
Expand All @@ -383,9 +383,9 @@ def t_parallel_labeled_AMPERSAND(self, t):
# Used for states where only flag# characters! and ^1^ references
# Are separately tokenised

nonflagnonwhite = r'[^\ \t\#\!\^\*\?\n\r\=]'
internalonly = r'[^\n\^\r\=]'
nonflag = r'[^\ \t\#\!\^\*\?\n\r\=]'
nonflagnonwhite = r'[^ \t#!\^*?\n\r=]'
internalonly = r'[^\n\^\r=]'
nonflag = r'[^ \t#!\^*?\n\r=]'
many_int_then_nonflag = '(' + internalonly + '*' + nonflag + '+' + ')'
many_nonflag = nonflag + '*'
intern_or_nonflg = '(' + many_int_then_nonflag + '|' + many_nonflag + ')'
Expand All @@ -399,17 +399,17 @@ def t_flagged_ID(self, t):
t.value = t.value.strip()
return t

t_flagged_HASH = r'\#'
t_flagged_EXCLAIM = r'\!'
t_flagged_HASH = r'#'
t_flagged_EXCLAIM = r'!'
t_flagged_QUERY = r'\?'
t_flagged_STAR = r'\*'
t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*'
t_flagged_EQUALS = r'\='
t_flagged_parallel_para_HAT = r'[ \t]*\^[ \t]*'
t_flagged_EQUALS = r'='
# --- Rules for paragraph state----------------------------------
# Free text, ended by double new line

terminates_para = \
"(\#|\@[^i][^\{]|\&|\Z|(^[0-9]+[\'\u2019\u2032\u02CA\xb4]?\.))"
r'(#|@[^i][^{]|&|\Z|(^[0-9]+' u'[\'\u2019\u2032\u02CA\xb4]\\.))'

@lex.TOKEN(r'([^\^\n\r]|(\r?\n(?!\s*\r?\n)(?!' +
terminates_para + ')))+')
Expand Down Expand Up @@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t):
# --- RULES FOR THE nonequals STATE -----
# Absorb everything except an equals
def t_nonequals_ID(self, t):
r'[^\=\n\r]+'
r'[^=\n\r]+'
t.value = t.value.strip()
return t

t_nonequals_EQUALS = r'\='
t_nonequals_EQUALS = r'='

# --- RULES FOR THE absorb STATE -----
# Absorb everything
Expand All @@ -455,15 +455,15 @@ def t_absorb_ID(self, t):
return t

# --- RULES FOR THE text STATE ----
t_text_ID = r'[^\ \t \n\r]+'
t_text_ID = r'[^ \t\n\r]+'

def t_text_SPACE(self, t):
r'[\ \t]'
r'[ \t]'
# No token generated

# --- RULES FOR THE lemmatize STATE
t_lemmatize_ID = r'[^\;\n\r]+'
t_lemmatize_SEMICOLON = r'\;[\ \t]*'
t_lemmatize_ID = r'[^;\n\r]+'
t_lemmatize_SEMICOLON = r';[ \t]*'

# Error handling rule
def t_ANY_error(self, t):
Expand Down
43 changes: 29 additions & 14 deletions pyoracc/wrapper/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,14 @@
from pyoracc.atf.common.atffile import check_atf


def check_atf_message((segpathname, atftype, verbose)):
click.echo('\n Info: Parsing {0}.'.format(segpathname))
def check_atf_message(args):
segpathname, atftype, verbose = args
if verbose:
click.echo('\n Info: Parsing {0}.'.format(segpathname))
try:
check_atf(segpathname, atftype, verbose)
click.echo('Info: Correctly parsed {0}.'.format(segpathname))
if verbose:
click.echo('Info: Correctly parsed {0}.'.format(segpathname))
except (SyntaxError, IndexError, AttributeError,
UnicodeDecodeError) as e:
click.echo("Info: Failed with message: {0} in {1}"
Expand All @@ -36,15 +39,18 @@ def check_and_process(pathname, atftype, whole, verbose=False):
if verbose:
click.echo('Info: Segmented into {0}.'.format(outfolder))

files = map(lambda f: os.path.join(outfolder, f), os.listdir(outfolder))
count_files = len(files)
files = map(lambda f: os.path.join(outfolder, f),
os.listdir(outfolder))
count_files = len(list(files))
atftypelist = [atftype]*count_files
verboselist = [verbose]*count_files
pool.map(check_atf_message, zip(files, atftypelist, verboselist))
pool.map(check_atf_message,
zip(files, atftypelist, verboselist))
pool.close()
else:
check_atf_message((pathname, atftype, verbose))
click.echo('Info: Finished parsing {0}.'.format(pathname))
if verbose:
click.echo('Info: Finished parsing {0}.'.format(pathname))
return 1
except (SyntaxError, IndexError, AttributeError,
UnicodeDecodeError) as e:
Expand All @@ -62,7 +68,7 @@ def check_and_process(pathname, atftype, whole, verbose=False):
prompt=True, required=True,
help='Input the atf file type.')
@click.option('--whole', '-w', default=False, required=False, is_flag=True,
help='Disables the segmentation of the atf file and run as a whole.')
help='Disable segmentation of the atf file and run as a whole.')
@click.option('--verbose', '-v', default=False, required=False, is_flag=True,
help='Enables verbose mode.')
@click.version_option()
Expand All @@ -73,8 +79,8 @@ def main(input_path, atf_type, whole, verbose):
failures = 0
successes = 0
with click.progressbar(os.listdir(input_path),
label='Info: Checking the files') as bar:
for index, f in enumerate(bar):
label='Info: Checking the files') as entries:
for f in entries:
pathname = os.path.join(input_path, f)
try:
check_and_process(pathname, atf_type, whole, verbose)
Expand All @@ -86,12 +92,21 @@ def main(input_path, atf_type, whole, verbose):
click.echo("Info: Failed with message: {0} in {1}"
.format(e, pathname))
finally:
try:
click.echo("Failed with {0} out of {1} ({2}%)"
.format(failures, failures + successes, failures * 100.0 / (failures + successes)))
except ZeroDivisionError:
total = failures + successes
if not total:
click.echo("Empty files to process")
elif failures:
click.echo("Failed with {0} out of {1} ({2}%)"
.format(failures,
total,
failures * 100.0 / total))
else:
click.echo("All {0} passed!".format(successes))
else:
check_and_process(input_path, atf_type, whole, verbose)
tsend = time.time()
click.echo("Total time taken: {0} minutes".format((tsend-tsbegin)/60.0))


if __name__ == '__main__':
main()
8 changes: 5 additions & 3 deletions pyoracc/wrapper/segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@


ts = time.time()
OUTPUT_FOLDER = 'segment'+str(ts)
OUTPUT_FOLDER = '-segment-'+str(ts)


class Segmentor:
def __init__(self, inputFile, verbose):
self.inputFileName = inputFile
self.outfolder = os.path.join(os.path.dirname(self.inputFileName), "..",
os.path.basename(self.inputFileName)+OUTPUT_FOLDER)
folderName = os.path.basename(self.inputFileName) + OUTPUT_FOLDER
self.outfolder = os.path.join(os.path.dirname(self.inputFileName),
"..",
folderName)
self.verbose = verbose
self.__reset__()

Expand Down