@@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get predictions from the log file of SEMPRE."""

import sys, os, shutil, re, argparse

PATTERN = re.compile(r'Pred@0000: '
                     r'\(derivation \(formula (.*)\)\) '
                     r'\(value (.*)\) '
                     r'\(type (.*)\)\) \[score=(.*), prob=(.*), comp=(.*)\]')
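
# A hypothetical log line shaped to match PATTERN (not taken from an actual
# SEMPRE log; real formula and type strings are longer):
#   Pred@0000: (derivation (formula (number 42))) (value (list (number 42))) (type NumberValue)) [score=1.0, prob=0.8, comp=true]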

def lisptree_to_python_object(charbuffer):
    """Convert the lisptree to a Python object.

    Args:
        charbuffer: REVERSED list of characters of the lisptree string.
            Characters will be consumed from the list.
    """
    c = charbuffer.pop()
    if c == '(':
        answer = []
        while charbuffer[-1] != ')':
            if charbuffer[-1] == ' ':
                charbuffer.pop()
            else:
                answer.append(lisptree_to_python_object(charbuffer))
        assert charbuffer.pop() == ')'
        return answer
    elif c == '"':
        answer = []
        while charbuffer[-1] != '"':
            c = charbuffer.pop()
            if c == '\\':
                answer.append(charbuffer.pop())
            else:
                answer.append(c)
        assert charbuffer.pop() == '"'
        return ''.join(answer)
    else:
        answer = [c if c != '\\' else charbuffer.pop()]
        while charbuffer[-1] not in (' ', ')'):
            c = charbuffer.pop()
            if c == '\\':
                answer.append(charbuffer.pop())
            else:
                assert c != '('
                answer.append(c)
        return ''.join(answer)
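
# A minimal sketch of the parser (hypothetical input): reversing the
# characters of
#   u'(list (number 42))'
# and feeding them to this function yields [u'list', [u'number', u'42']].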

def lisptree_to_values(tree):
    assert tree.startswith('(list ') and tree.endswith(')')
    tree = lisptree_to_python_object(list(tree.decode('utf8'))[::-1])
    assert tree[0] == 'list'
    answer = []
    for subtree in tree[1:]:
        if subtree[0] == 'number':
            answer.append(float(subtree[1]))
        elif subtree[0] == 'date':
            answer.append('{}-{}-{}'.format(
                int(subtree[1]) if subtree[1] != '-1' else 'xx',
                int(subtree[2]) if subtree[2] != '-1' else 'xx',
                int(subtree[3]) if subtree[3] != '-1' else 'xx'))
        else:
            assert subtree[0] == 'name'
            answer.append(re.sub(r'\s+', ' ', subtree[2]).strip())
    return '\t'.join(unicode(x) for x in answer)
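
# For example (hypothetical values), a denotation string such as
#   '(list (number 42) (date 2014 -1 -1))'
# becomes the tab-separated string u'42.0\t2014-xx-xx'.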

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='log file')
    parser.add_argument('iteration', help='iteration to extract')
    args = parser.parse_args()

    prefix = 'iter=%s:' % args.iteration
    ex_id = None
    with open(args.infile) as fin:
        for line in fin:
            line = line.strip()
            if line.startswith(prefix):
                if ex_id is not None:
                    # No prediction for the previous example
                    print ex_id
                ex_id = line.split()[3]
            elif ex_id is not None and line.startswith('Pred@0000:'):
                match = PATTERN.match(line)
                formula, denotation, deno_type, score, prob, comp = match.groups()
                denotation = lisptree_to_values(denotation)
                print u'{}\t{}'.format(ex_id, denotation)
                ex_id = None
    if ex_id is not None:
        print '\t'.join([ex_id, 'None'])


if __name__ == '__main__':
    main()
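
# Hypothetical invocation (the script filename and log path are assumed):
#   python get-predictions.py state/execs/0.exec/log 5 > predictions.tsv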
@@ -0,0 +1,205 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert HTML table into CSV / TSV / pretty-printed table."""

import sys, os, re, argparse, json
from codecs import open
from collections import defaultdict
from weblib.table import Table
from itertools import izip_longest

################ Dump CSV

def simple_normalize_text(text):
    return text.replace('\\', '\\\\').replace('"', r'\"').replace('\n', r'\\n').replace(u'\xa0', ' ').strip()

def dump_csv(rows, fout):
    for row in rows:
        fout.write(','.join('"%s"' % simple_normalize_text(x[1]) for x in row) + '\n')

def tab_normalize_text(text):
    # Pass flags as a keyword argument: the fourth positional argument
    # of re.sub is the replacement count, not the flags.
    return re.sub(r'\s+', ' ',
                  text.replace('\\', '\\\\').replace('|', r'\p').replace('\n', r'\n'),
                  flags=re.U).strip()

def dump_tsv(rows, fout):
    for row in rows:
        fout.write('\t'.join('%s' % tab_normalize_text(x[1]) for x in row) + '\n')

def table_normalize_text(text):
    return re.sub(r'\s+', ' ', text, flags=re.U).strip()

def dump_table(rows, fout):
    widths = defaultdict(int)
    for row in rows:
        for i, cell in enumerate(row):
            widths[i] = max(widths[i], len(table_normalize_text(cell[1])) + 1)
    for row in rows:
        fout.write('|')
        for i, cell in enumerate(row):
            # Left-align each cell, padded to the column width
            fout.write((' %-' + str(widths[i]) + 's') % table_normalize_text(cell[1]))
            fout.write('|')
        fout.write('\n')

################ More table normalization

def debug_print(stuff):
    for x in stuff:
        print >> sys.stderr, [simple_normalize_text(y[1]) for y in x]

def transpose(rows):
    cols = []
    n = max(len(row) for row in rows)
    for i in xrange(n):
        col = []
        for row in rows:
            try:
                col.append(row[i])
            except LookupError:
                col.append(('', ''))
        cols.append(col)
    return cols
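
# Sketch with hypothetical cells: a ragged table is padded with empty
# ('', '') cells so that every column gets one cell per row, e.g.
#   transpose([[('th', 'A'), ('th', 'B')], [('td', '1')]])
# returns [[('th', 'A'), ('td', '1')], [('th', 'B'), ('', '')]].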

def anti_transpose(cols):
    # All columns must have equal length
    assert len(set(len(col) for col in cols)) == 1
    rows = []
    n = len(cols[0])
    for i in xrange(n):
        row = []
        for col in cols:
            if col[i] is not None:
                row.append(col[i])
            else:
                row.append(('', ''))
        rows.append(row)
    return rows

def remove_full_rowspans(rows):
    """Remove rows in which all cells have the same text."""
    return [row for row in rows if len(set(row)) > 1]

def remove_empty_columns(orig_cols):
    """Remove columns with <= 1 non-empty cells."""
    cols = []
    for col in orig_cols:
        non_empty = sum(bool(cell[1]) for cell in col)
        if non_empty >= 2:
            cols.append(col)
    return cols

#### Merge columns

def are_mergeable(col1, col2):
    assert len(col1) == len(col2)
    merged = []
    for i in xrange(len(col1)):
        c1, c2 = col1[i], col2[i]
        if not c1[1]:
            merged.append(c2)
        elif not c2[1] or c1 == c2:
            merged.append(c1)
        else:
            return None
    return merged

def merge_similar_columns(orig_cols):
    """Merge adjacent columns whose non-empty cells do not conflict."""
    i = 0
    while i + 1 < len(orig_cols):
        merged = are_mergeable(orig_cols[i], orig_cols[i+1])
        if merged is not None:
            orig_cols[i:i+2] = [merged]
        else:
            i += 1
    return orig_cols
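
# Sketch with hypothetical cells: adjacent columns merge when their
# non-empty cells never conflict, e.g.
#   merge_similar_columns([[('td', 'x'), ('td', '')],
#                          [('td', ''), ('td', 'y')]])
# returns the single merged column [[('td', 'x'), ('td', 'y')]].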

#### Merge header rows

def merge_header_rows(orig_rows):
    """Merge all header rows together."""
    header_rows, body_rows = [], []
    still_header = True
    for row in orig_rows:
        if not still_header or any(cell[0] == 'td' for cell in row):
            still_header = False
            body_rows.append(row)
        else:
            header_rows.append(row)
    if len(header_rows) < 2 or not body_rows:
        return orig_rows
    # Merge header rows with '\n'
    header_cols = transpose(header_rows)
    header_row = []
    for col in header_cols:
        texts = [None]
        for cell in col:
            if cell[1] != texts[-1]:
                texts.append(cell[1])
        header_row.append(('th', '\n'.join(texts[1:])))
    return [header_row] + body_rows
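
# Sketch with hypothetical cells: consecutive <th> rows collapse into one
# header row, joining distinct texts with '\n', e.g.
#   merge_header_rows([[('th', 'Year'), ('th', 'Score')],
#                      [('th', 'Year'), ('th', '(pts)')],
#                      [('td', '2001'), ('td', '5')]])
# returns [[('th', 'Year'), ('th', 'Score\n(pts)')],
#          [('td', '2001'), ('td', '5')]].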

################ Main function

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
                        help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
                        help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
                        help='also print out tsv')
    parser.add_argument('--human', action='store_true',
                        help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
                        help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
                        help='keep hidden texts as is')
    args = parser.parse_args()
    assert not args.tsv or (args.outfile and args.outfile.endswith('.csv'))

    with open(args.turk_json) as fin:
        metadata = json.load(fin)

    # Get the path to the HTML file from the metadata path.
    # This is kind of hacky.
    match = re.match(r'^(?:json|page)/(\d+)-(?:json|page)/(\d+)\.json$', args.turk_json)
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()
        table = Table.get_wikitable(raw, metadata['tableIndex'],
                                    normalization=Table.NORM_DUPLICATE,
                                    remove_hidden=(not args.keep_hidden))
        if args.html:
            raw_table = Table.get_wikitable(raw, metadata['tableIndex'],
                                            remove_hidden=False).table

    rows = table.rows
    # rows = list of rows; row = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)
    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        stem = re.sub(r'\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                print >> fout, unicode(raw_table)

if __name__ == '__main__':
    main()
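
# Hypothetical invocation (the script filename and data paths are assumed;
# the -j path must match the regex in main()):
#   python table-to-csv.py -j json/200-json/42.json -o 42.csv --tsv --human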