diff --git a/.travis.yml b/.travis.yml index 5020226..e6fade6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,13 +38,6 @@ before_install: jython -c "print ''"; jython -c "import sys; print sys.version" fi - if [[ $TRAVIS_OS_NAME == 'osx' ]]; then - brew update - brew upgrade - brew upgrade python - brew install python3 - python3 --version - fi install: - | @@ -55,7 +48,8 @@ install: fi $PIP install wheel $PIP install setuptools - $PIP install ply pep8 mako + $PIP install ply mako + $PIP install pycodestyle if [ "$MYPYTHON" != "jython" ]; then $PIP install --upgrade pytest pytest-cov codecov fi @@ -67,12 +61,12 @@ script: $MYPYTHON -c "from pyoracc import _generate_parsetab; _generate_parsetab()" echo "Running tests" if [ "$MYPYTHON" == "jython" ]; then - py.test + pytest else pytest --cov=pyoracc fi - - pep8 --exclude=parsetab.py . + - pycodestyle --exclude=parsetab.py . after_success: - | diff --git a/pyoracc/atf/common/atflex.py b/pyoracc/atf/common/atflex.py index e5bf3c0..0e7623c 100644 --- a/pyoracc/atf/common/atflex.py +++ b/pyoracc/atf/common/atflex.py @@ -69,16 +69,16 @@ def resolve_keyword(self, value, source, fallback=None, extra=None): states = AtfLexicon.STATES - t_AMPERSAND = r'\&' - t_HASH = r'\#' - t_EXCLAIM = r'\!' + t_AMPERSAND = r'&' + t_HASH = r'#' + t_EXCLAIM = r'!' t_QUERY = r'\?' 
t_STAR = r'\*' t_DOLLAR = r'\$' - t_MINUS = r'\-' - t_FROM = r'\<\<' - t_TO = r'\>\>' - t_COMMA = r'\,' + t_MINUS = r'-' + t_FROM = r'<<' + t_TO = r'>>' + t_COMMA = r',' t_PARBAR = r'\|\|' t_INITIAL_transctrl_PARENTHETICALID = r'\([^\n\r]*\)' @@ -88,22 +88,22 @@ def t_INITIAL_transctrl_WHITESPACE(self, t): # NO TOKEN def t_MULTILINGUAL(self, t): - r'\=\=' + r'==' t.lexer.push_state("text") return t def t_EQUALBRACE(self, t): - r'^\=\{' + r'^=\{' t.lexer.push_state('text') return t def t_EQUALS(self, t): - r'\=' + r'=' t.lexer.push_state('flagged') return t def t_INITIAL_parallel_labeled_COMMENT(self, t): - r'^\#+(?![a-zA-Z]+\:)' + r'^#+(?![a-zA-Z]+:)' # Negative lookahead to veto protocols as comments t.lexer.push_state('absorb') return t @@ -121,7 +121,7 @@ def t_NEWLINE(self, t): return t def t_INITIAL_parallel_labeled_ATID(self, t): - r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?' + r'^@[a-zA-Z][a-zA-Z0-9\[\]]*\+?' t.value = t.value[1:] t.lexpos += 1 t.type = self.resolve_keyword(t.value, @@ -171,13 +171,13 @@ def t_INITIAL_parallel_labeled_ATID(self, t): return t def t_labeled_OPENR(self, t): - r'\@\(' + r'@\(' t.lexer.push_state("para") t.lexer.push_state("transctrl") return t def t_INITIAL_parallel_labeled_HASHID(self, t): - r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:' + r'#[a-zA-Z][a-zA-Z0-9\[\]]+:' # Note that \:? absorbs a trailing colon in protocol keywords t.value = t.value[1:-1] t.lexpos += 1 @@ -213,19 +213,19 @@ def t_INITIAL_parallel_labeled_HASHID(self, t): return t def t_LINELABEL(self, t): - r'^[^\ \t\n]*\.' + r'^[^ \t\n]*\.' 
t.value = t.value[:-1] t.lexer.push_state('text') return t def t_score_SCORELABEL(self, t): - r'^[^.:\ \t\#][^.:\ \t]*\:' + r'^[^.: \t#][^.: \t]*:' t.value = t.value[:-1] t.lexer.push_state('text') return t def t_ID(self, t): - u'[a-zA-Z0-9][a-zA-Z\'\u2019\xb4\/\.0-9\:\-\[\]_\u2080-\u2089]*' + r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\xb4\u2080-\u2089]*' t.value = t.value.replace(u'\u2019', "'") t.value = t.value.replace(u'\xb4', "'") t.type = self.resolve_keyword(t.value, @@ -271,7 +271,7 @@ def t_flagged_text_lemmatize_transctrl_nonequals_absorb_NEWLINE(self, t): # Unicode 2032 is PRIME # All of these could be used as prime def t_transctrl_ID(self, t): - u'[a-zA-Z0-9][a-zA-Z\'\u2019\u2032\u02CA\xb4\/\.0-9\:\-\[\]_' \ + r'[a-zA-Z0-9][a-zA-Z0-9/.:_\-\[\]' u'\'\u2019\u2032\u02CA\xb4' \ u'\u2080-\u2089]*' t.value = t.value.replace(u'\u2019', "'") t.value = t.value.replace(u'\u2032', "'") @@ -306,7 +306,7 @@ def t_transctrl_ID(self, t): t_parallel_QUERY = r'\?' def t_parallel_LINELABEL(self, t): - r'^([^\.\ \t]*)\.[\ \t]*' + r'^([^. \t]*)\.[ \t]*' t.value = t.value.strip(" \t.") return t @@ -315,7 +315,7 @@ def t_parallel_labeled_DOLLAR(self, t): t.lexer.push_state("absorb") return t - t_transctrl_MINUS = r'\-\ ' + t_transctrl_MINUS = r'- ' def t_transctrl_CLOSER(self, t): r'\)' @@ -347,12 +347,12 @@ def t_labeled_NEWLINE(self, t): # Flag characters (#! etc ) don't apply in translations # But reference anchors ^1^ etc do. # lines beginning with a space are continuations - white = r'[\ \t]*' + white = r'[ \t]*' # translation_regex1 and translation_regex2 are identical appart from the # fact that the first character may not be a ? # We are looking for a string that does not start with ? it may include # newlines if they are followed by a whitespace. 
- translation_regex1 = r'([^\?\^\n\r]|([\n\r](?=[ \t])))' + translation_regex1 = r'([^?\^\n\r]|([\n\r](?=[ \t])))' translation_regex2 = r'([^\^\n\r]|([\n\r](?=[ \t])))*' translation_regex = white + translation_regex1 + translation_regex2 + white @@ -366,7 +366,7 @@ def t_parallel_interlinear_ID(self, t): return t def t_parallel_labeled_AMPERSAND(self, t): - r'\&' + r'&' # New document, so leave translation state t.lexer.pop_state() return t @@ -383,9 +383,9 @@ def t_parallel_labeled_AMPERSAND(self, t): # Used for states where only flag# characters! and ^1^ references # Are separately tokenised - nonflagnonwhite = r'[^\ \t\#\!\^\*\?\n\r\=]' - internalonly = r'[^\n\^\r\=]' - nonflag = r'[^\ \t\#\!\^\*\?\n\r\=]' + nonflagnonwhite = r'[^ \t#!\^*?\n\r=]' + internalonly = r'[^\n\^\r=]' + nonflag = r'[^ \t#!\^*?\n\r=]' many_int_then_nonflag = '(' + internalonly + '*' + nonflag + '+' + ')' many_nonflag = nonflag + '*' intern_or_nonflg = '(' + many_int_then_nonflag + '|' + many_nonflag + ')' @@ -399,17 +399,17 @@ def t_flagged_ID(self, t): t.value = t.value.strip() return t - t_flagged_HASH = r'\#' - t_flagged_EXCLAIM = r'\!' + t_flagged_HASH = r'#' + t_flagged_EXCLAIM = r'!' t_flagged_QUERY = r'\?' t_flagged_STAR = r'\*' - t_flagged_parallel_para_HAT = r'[\ \t]*\^[\ \t]*' - t_flagged_EQUALS = r'\=' + t_flagged_parallel_para_HAT = r'[ \t]*\^[ \t]*' + t_flagged_EQUALS = r'=' # --- Rules for paragaph state---------------------------------- # Free text, ended by double new line terminates_para = \ - "(\#|\@[^i][^\{]|\&|\Z|(^[0-9]+[\'\u2019\u2032\u02CA\xb4]?\.))" + r'(#|@[^i][^{]|&|\Z|(^[0-9]+' u'[\'\u2019\u2032\u02CA\xb4]?\\.))' @lex.TOKEN(r'([^\^\n\r]|(\r?\n(?!\s*\r?\n)(?!' 
+ terminates_para + ')))+') @@ -441,11 +441,11 @@ def t_para_MAGICNEWLINE(self, t): # --- RULES FOR THE nonequals STATE ----- # Absorb everything except an equals def t_nonequals_ID(self, t): - r'[^\=\n\r]+' + r'[^=\n\r]+' t.value = t.value.strip() return t - t_nonequals_EQUALS = r'\=' + t_nonequals_EQUALS = r'=' # --- RULES FOR THE absorb STATE ----- # Absorb everything @@ -455,15 +455,15 @@ def t_absorb_ID(self, t): return t # --- RULES FOR THE text STATE ---- - t_text_ID = r'[^\ \t \n\r]+' + t_text_ID = r'[^ \t\n\r]+' def t_text_SPACE(self, t): - r'[\ \t]' + r'[ \t]' # No token generated # --- RULES FOR THE lemmatize STATE - t_lemmatize_ID = r'[^\;\n\r]+' - t_lemmatize_SEMICOLON = r'\;[\ \t]*' + t_lemmatize_ID = r'[^;\n\r]+' + t_lemmatize_SEMICOLON = r';[ \t]*' # Error handling rule def t_ANY_error(self, t): diff --git a/pyoracc/wrapper/cli.py b/pyoracc/wrapper/cli.py index c4ec969..c6ae59a 100644 --- a/pyoracc/wrapper/cli.py +++ b/pyoracc/wrapper/cli.py @@ -10,11 +10,14 @@ from pyoracc.atf.common.atffile import check_atf -def check_atf_message((segpathname, atftype, verbose)): - click.echo('\n Info: Parsing {0}.'.format(segpathname)) +def check_atf_message(args): + segpathname, atftype, verbose = args + if verbose: + click.echo('\n Info: Parsing {0}.'.format(segpathname)) try: check_atf(segpathname, atftype, verbose) - click.echo('Info: Correctly parsed {0}.'.format(segpathname)) + if verbose: + click.echo('Info: Correctly parsed {0}.'.format(segpathname)) except (SyntaxError, IndexError, AttributeError, UnicodeDecodeError) as e: click.echo("Info: Failed with message: {0} in {1}" @@ -36,15 +39,18 @@ def check_and_process(pathname, atftype, whole, verbose=False): if verbose: click.echo('Info: Segmented into {0}.'.format(outfolder)) - files = map(lambda f: os.path.join(outfolder, f), os.listdir(outfolder)) - count_files = len(files) + files = list(map(lambda f: os.path.join(outfolder, f), + os.listdir(outfolder))) + count_files = len(files) atftypelist = 
[atftype]*count_files verboselist = [verbose]*count_files - pool.map(check_atf_message, zip(files, atftypelist, verboselist)) + pool.map(check_atf_message, + zip(files, atftypelist, verboselist)) pool.close() else: check_atf_message((pathname, atftype, verbose)) - click.echo('Info: Finished parsing {0}.'.format(pathname)) + if verbose: + click.echo('Info: Finished parsing {0}.'.format(pathname)) return 1 except (SyntaxError, IndexError, AttributeError, UnicodeDecodeError) as e: @@ -62,7 +68,7 @@ def check_and_process(pathname, atftype, whole, verbose=False): prompt=True, required=True, help='Input the atf file type.') @click.option('--whole', '-w', default=False, required=False, is_flag=True, - help='Disables the segmentation of the atf file and run as a whole.') + help='Disable segmentation of the atf file and run as a whole.') @click.option('--verbose', '-v', default=False, required=False, is_flag=True, help='Enables verbose mode.') @click.version_option() @@ -73,8 +79,8 @@ def main(input_path, atf_type, whole, verbose): failures = 0 successes = 0 with click.progressbar(os.listdir(input_path), - label='Info: Checking the files') as bar: - for index, f in enumerate(bar): + label='Info: Checking the files') as entries: + for f in entries: pathname = os.path.join(input_path, f) try: check_and_process(pathname, atf_type, whole, verbose) @@ -86,12 +92,21 @@ def main(input_path, atf_type, whole, verbose): click.echo("Info: Failed with message: {0} in {1}" .format(e, pathname)) finally: - try: - click.echo("Failed with {0} out of {1} ({2}%)" - .format(failures, failures + successes, failures * 100.0 / (failures + successes))) - except ZeroDivisionError: + total = failures + successes + if not total: click.echo("Empty files to process") + elif failures: + click.echo("Failed with {0} out of {1} ({2}%)" + .format(failures, + total, + failures * 100.0 / total)) + else: + click.echo("All {0} passed!".format(successes)) else: check_and_process(input_path, atf_type, whole, 
verbose) tsend = time.time() click.echo("Total time taken: {0} minutes".format((tsend-tsbegin)/60.0)) + + +if __name__ == '__main__': + main() diff --git a/pyoracc/wrapper/segment.py b/pyoracc/wrapper/segment.py index 2227f53..937fb2d 100644 --- a/pyoracc/wrapper/segment.py +++ b/pyoracc/wrapper/segment.py @@ -6,14 +6,16 @@ ts = time.time() -OUTPUT_FOLDER = 'segment'+str(ts) +OUTPUT_FOLDER = '-segment-'+str(ts) class Segmentor: def __init__(self, inputFile, verbose): self.inputFileName = inputFile - self.outfolder = os.path.join(os.path.dirname(self.inputFileName), "..", - os.path.basename(self.inputFileName)+OUTPUT_FOLDER) + folderName = os.path.basename(self.inputFileName) + OUTPUT_FOLDER + self.outfolder = os.path.join(os.path.dirname(self.inputFileName), + "..", + folderName) self.verbose = verbose self.__reset__()