Skip to content

Commit

Permalink
Removed duplicated files + Added 2 scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Panupong Pasupat committed Feb 16, 2017
1 parent 00d56bb commit 0dc9ff1
Show file tree
Hide file tree
Showing 22 changed files with 300 additions and 141,520 deletions.
2,831 changes: 0 additions & 2,831 deletions data/random-split-seed-1-test.examples

This file was deleted.

2,831 changes: 0 additions & 2,831 deletions data/random-split-seed-1-test.tsv

This file was deleted.

11,321 changes: 0 additions & 11,321 deletions data/random-split-seed-1-train.examples

This file was deleted.

11,321 changes: 0 additions & 11,321 deletions data/random-split-seed-1-train.tsv

This file was deleted.

2,838 changes: 0 additions & 2,838 deletions data/random-split-seed-2-test.examples

This file was deleted.

2,838 changes: 0 additions & 2,838 deletions data/random-split-seed-2-test.tsv

This file was deleted.

11,314 changes: 0 additions & 11,314 deletions data/random-split-seed-2-train.examples

This file was deleted.

11,314 changes: 0 additions & 11,314 deletions data/random-split-seed-2-train.tsv

This file was deleted.

2,838 changes: 0 additions & 2,838 deletions data/random-split-seed-3-test.examples

This file was deleted.

2,838 changes: 0 additions & 2,838 deletions data/random-split-seed-3-test.tsv

This file was deleted.

11,314 changes: 0 additions & 11,314 deletions data/random-split-seed-3-train.examples

This file was deleted.

11,314 changes: 0 additions & 11,314 deletions data/random-split-seed-3-train.tsv

This file was deleted.

2,831 changes: 0 additions & 2,831 deletions data/random-split-seed-4-test.examples

This file was deleted.

2,831 changes: 0 additions & 2,831 deletions data/random-split-seed-4-test.tsv

This file was deleted.

11,321 changes: 0 additions & 11,321 deletions data/random-split-seed-4-train.examples

This file was deleted.

11,321 changes: 0 additions & 11,321 deletions data/random-split-seed-4-train.tsv

This file was deleted.

2,836 changes: 0 additions & 2,836 deletions data/random-split-seed-5-test.examples

This file was deleted.

2,836 changes: 0 additions & 2,836 deletions data/random-split-seed-5-test.tsv

This file was deleted.

11,316 changes: 0 additions & 11,316 deletions data/random-split-seed-5-train.examples

This file was deleted.

11,316 changes: 0 additions & 11,316 deletions data/random-split-seed-5-train.tsv

This file was deleted.

95 changes: 95 additions & 0 deletions get-predictions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get predictions from the log file of SEMPRE."""

import sys, os, shutil, re, argparse

# Matches a SEMPRE top-prediction log line and captures, in order:
# (1) formula, (2) denotation value, (3) denotation type,
# (4) score, (5) probability, (6) compatibility.
PATTERN = re.compile(
    r'Pred@0000: '
    r'\(derivation \(formula (.*)\)\) '
    r'\(value (.*)\) '
    r'\(type (.*)\)\) \[score=(.*), prob=(.*), comp=(.*)\]')

def lisptree_to_python_object(charbuffer):
    """Parse one lisptree term off the end of *charbuffer*.

    Args:
        charbuffer: REVERSED list of characters of the lisptree string.
            Characters will be consumed from the list.
    Returns:
        A nested structure of lists (for parenthesized groups) and
        strings (for quoted strings and bare atoms).
    """
    head = charbuffer.pop()
    if head == '(':
        # Parenthesized list: parse children until the matching ')'.
        children = []
        while charbuffer[-1] != ')':
            if charbuffer[-1] == ' ':
                charbuffer.pop()
                continue
            children.append(lisptree_to_python_object(charbuffer))
        assert charbuffer.pop() == ')'
        return children
    if head == '"':
        # Quoted string: read until the closing quote, honoring backslash escapes.
        chars = []
        while charbuffer[-1] != '"':
            ch = charbuffer.pop()
            chars.append(charbuffer.pop() if ch == '\\' else ch)
        assert charbuffer.pop() == '"'
        return ''.join(chars)
    # Bare atom: read until a space or ')' (the delimiter stays on the buffer).
    chars = [charbuffer.pop() if head == '\\' else head]
    while charbuffer[-1] not in (' ', ')'):
        ch = charbuffer.pop()
        if ch == '\\':
            chars.append(charbuffer.pop())
        else:
            assert ch != '('
            chars.append(ch)
    return ''.join(chars)

def lisptree_to_values(tree):
    """Convert a SEMPRE '(list ...)' denotation string to tab-separated values.

    Args:
        tree: lisptree string of the form '(list <value> ...)' where each
            value is a (number ...), (date ...), or (name ...) subtree.
    Returns:
        A unicode string with one field per value, joined by tabs.

    Fix: use a raw string for the regex pattern -- the original '\s+' is an
    invalid escape sequence in a plain string literal (deprecated and slated
    to become a syntax error).
    """
    assert tree.startswith('(list ') and tree.endswith(')')
    # Decode to unicode before parsing so multi-byte text survives (Python 2 str input).
    tree = lisptree_to_python_object(list(tree.decode('utf8'))[::-1])
    assert tree[0] == 'list'
    answer = []
    for subtree in tree[1:]:
        if subtree[0] == 'number':
            answer.append(float(subtree[1]))
        elif subtree[0] == 'date':
            # -1 marks a missing date component; render it as 'xx'.
            answer.append('{}-{}-{}'.format(
                int(subtree[1]) if subtree[1] != '-1' else 'xx',
                int(subtree[2]) if subtree[2] != '-1' else 'xx',
                int(subtree[3]) if subtree[3] != '-1' else 'xx'))
        else:
            assert subtree[0] == 'name'
            # (name <id> <text>): keep the human-readable text, squeeze whitespace.
            answer.append(re.sub(r'\s+', ' ', subtree[2]).strip())
    return '\t'.join(unicode(x) for x in answer)

def main():
    """Extract predicted denotations for one training iteration from a SEMPRE log.

    For each example of the requested iteration, prints one line to stdout:
    the example id followed by a TAB and the tab-joined denotation values
    when a Pred@0000 line was found, or a no-prediction marker otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('infile', help='log file')
    parser.add_argument('iteration', help='iteration to extract')
    args = parser.parse_args()

    # Header lines of the requested iteration start with 'iter=<k>:'.
    prefix = 'iter=%s:' % args.iteration
    # Id of the example whose prediction we are still waiting for (None = none pending).
    ex_id = None
    with open(args.infile) as fin:
        for line in fin:
            line = line.strip()
            if line.startswith(prefix):
                if ex_id is not None:
                    # No prediction for the previous example
                    # NOTE(review): prints only the id here, but 'id<TAB>None'
                    # at EOF below -- possibly inconsistent; confirm the
                    # downstream consumer's expected format.
                    print ex_id
                # The example id is the 4th whitespace-separated token of the header.
                ex_id = line.split()[3]
            elif ex_id is not None and line.startswith('Pred@0000:'):
                match = PATTERN.match(line)
                formula, denotation, deno_type, score, prob, comp = match.groups()
                denotation = lisptree_to_values(denotation)
                print u'{}\t{}'.format(ex_id, denotation)
                ex_id = None
    if ex_id is not None:
        # The last example in the log had no prediction.
        print '\t'.join([ex_id, 'None'])

if __name__ == '__main__':
    main()

205 changes: 205 additions & 0 deletions table-to-csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Convert HTML table into CSV / TSV / pretty-printed table."""

import sys, os, re, argparse, json
from codecs import open
from collections import defaultdict
from weblib.table import Table
from itertools import izip_longest

################ Dump CSV

def simple_normalize_text(text):
    """Escape a cell's text for quoted CSV output and strip outer whitespace."""
    replacements = [
        ('\\', '\\\\'),   # escape backslashes first so later escapes survive
        ('"', r'\"'),
        ('\n', r'\\n'),
        (u'\xa0', ' '),   # non-breaking space -> plain space
    ]
    for old, new in replacements:
        text = text.replace(old, new)
    return text.strip()

def dump_csv(rows, fout):
    """Write rows as double-quoted, comma-separated lines to fout."""
    for row in rows:
        cells = ['"%s"' % simple_normalize_text(cell[1]) for cell in row]
        fout.write(','.join(cells) + '\n')

def tab_normalize_text(text):
    """Escape a cell's text for TSV output and collapse whitespace runs.

    Bug fix: the original passed ``re.U`` as re.sub's positional ``count``
    argument, which silently capped the substitution at 32 replacements and
    never enabled the Unicode flag; pass it via ``flags=`` instead.
    """
    escaped = text.replace('\\', '\\\\').replace('|', r'\p').replace('\n', r'\n')
    return re.sub(r'\s+', ' ', escaped, flags=re.U).strip()

def dump_tsv(rows, fout):
    """Write rows as tab-separated lines to fout."""
    for row in rows:
        fields = [tab_normalize_text(cell[1]) for cell in row]
        fout.write('\t'.join(fields) + '\n')

def table_normalize_text(text):
    """Collapse whitespace runs to single spaces and strip the ends.

    Bug fix: ``re.U`` was being passed as re.sub's positional ``count``
    argument (limiting replacements to 32 and ignoring the flag); pass it
    as ``flags=`` instead.
    """
    return re.sub(r'\s+', ' ', text, flags=re.U).strip()

def dump_table(rows, fout):
    """Pretty-print rows as a pipe-delimited, column-aligned text table."""
    # First pass: widest normalized text per column (plus one padding space).
    widths = defaultdict(int)
    for row in rows:
        for col, cell in enumerate(row):
            widths[col] = max(widths[col], len(table_normalize_text(cell[1])) + 1)
    # Second pass: emit each cell left-justified to its column's width.
    for row in rows:
        pieces = ['|']
        for col, cell in enumerate(row):
            pieces.append((' %-' + str(widths[col]) + 's') % table_normalize_text(cell[1]))
            pieces.append('|')
        pieces.append('\n')
        fout.write(''.join(pieces))

################ More table normalization

def debug_print(stuff):
    """Dump the normalized cell texts of each row to stderr (debugging aid)."""
    for row in stuff:
        texts = [simple_normalize_text(cell[1]) for cell in row]
        sys.stderr.write(str(texts) + '\n')

def transpose(rows):
    """Turn a list of rows into a list of columns.

    Short rows are padded with ('', '') cells so every column has the
    same length (= the number of rows).
    """
    width = max(len(row) for row in rows)
    padded = [list(row) + [('', '')] * (width - len(row)) for row in rows]
    return [list(col) for col in zip(*padded)]

def anti_transpose(cols):
    """Inverse of transpose(): turn equal-length columns back into rows,
    substituting ('', '') for any None cell."""
    # All col in cols must have equal length.
    assert len(set(len(col) for col in cols)) == 1
    return [[cell if cell is not None else ('', '') for cell in group]
            for group in zip(*cols)]

def remove_full_rowspans(rows):
    """Drop rows whose cells are all identical (full-width rowspans)."""
    kept = []
    for row in rows:
        if len(set(row)) > 1:
            kept.append(row)
    return kept

def remove_empty_columns(orig_cols):
    """Remove columns with <= 1 non-empty cells."""
    return [col for col in orig_cols
            if sum(1 for cell in col if cell[1]) >= 2]

#### Merge columns

def are_mergeable(col1, col2):
    """Return the merged column when col1 and col2 can be combined
    cell-by-cell (one side empty, or both cells equal); else None."""
    assert len(col1) == len(col2)
    merged = []
    for left, right in zip(col1, col2):
        if not left[1]:
            merged.append(right)
        elif not right[1] or left == right:
            merged.append(left)
        else:
            return None
    return merged

def merge_similar_columns(orig_cols):
    """Repeatedly merge adjacent mergeable columns (mutates and returns the list)."""
    pos = 0
    while pos + 1 < len(orig_cols):
        combined = are_mergeable(orig_cols[pos], orig_cols[pos + 1])
        if combined is None:
            pos += 1
        else:
            # Replace the pair with the merged column; stay at pos so the
            # merged column can absorb further neighbors.
            orig_cols[pos:pos + 2] = [combined]
    return orig_cols

#### Merge header rows

def merge_header_rows(orig_rows):
    """Collapse multiple leading header rows into a single header row.

    The header prefix is every row before the first row that contains a
    'td' cell. If there are fewer than 2 header rows, or no body rows,
    the input is returned unchanged. Otherwise each header column's
    distinct consecutive texts are joined with newlines.
    """
    body_start = len(orig_rows)
    for idx, row in enumerate(orig_rows):
        if any(cell[0] == 'td' for cell in row):
            body_start = idx
            break
    header_rows = orig_rows[:body_start]
    body_rows = orig_rows[body_start:]
    if len(header_rows) < 2 or not body_rows:
        return orig_rows
    # Merge header rows column-wise, deduplicating consecutive repeats.
    merged_header = []
    for col in transpose(header_rows):
        texts = []
        for cell in col:
            if not texts or cell[1] != texts[-1]:
                texts.append(cell[1])
        merged_header.append(('th', '\n'.join(texts)))
    return [merged_header] + body_rows

################ Main function

def main():
    """Convert one MTurk-annotated HTML wiki table to CSV, and optionally
    emit TSV / pretty-printed / cleaned-HTML versions alongside it."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
                        help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
                        help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
                        help='also print out tsv')
    parser.add_argument('--human', action='store_true',
                        help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
                        help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
                        help='keep hidden texts as is')
    args = parser.parse_args()
    # Extra outputs derive their names from the .csv filename, so --tsv
    # requires a .csv outfile.
    # NOTE(review): assert is stripped under -O; parser.error would be safer.
    assert not args.tsv or args.outfile.endswith('.csv')

    with open(args.turk_json) as fin:
        metadata = json.load(fin)

    # Get the path to the HTML file
    # This is kind of hacky
    match = re.match(r'^(?:json|page)/(\d+)-(?:json|page)/(\d+).json$', args.turk_json)
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()
    table = Table.get_wikitable(raw, metadata['tableIndex'],
                                normalization=Table.NORM_DUPLICATE,
                                remove_hidden=(not args.keep_hidden))
    if args.html:
        # Keep an un-normalized copy of the table for the cleaned-HTML dump.
        raw_table = Table.get_wikitable(raw, metadata['tableIndex'],
                                        remove_hidden=False).table

    rows = table.rows
    # rows = list of rows; row = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)
    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        stem = re.sub('\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                print >> fout, unicode(raw_table)

if __name__ == '__main__':
    main()

0 comments on commit 0dc9ff1

Please sign in to comment.