-
-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathmain.py
127 lines (102 loc) · 4.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
"""
A simple Python wrapper for the stanford NER binary that makes it easier to use it
on UNIX/Windows systems.
Note: The script does some minimal sanity checking of the input, but don't
expect it to cover all cases. After all, it is just a wrapper.
Example:
> echo "Barack Obama was born in Hawaii." > text.txt
> python main.py -f text.txt
Should display
1.000: (Barack Obama; was; born)
1.000: (Barack Obama; was born in; Hawaii)
Authors: Philippe Remy <github: philipperemy>
Version: 2016-07-08
"""
# Copyright (c) 2016, Philippe Remy <github: philipperemy>
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import print_function
import os
import pickle
from argparse import ArgumentParser
from platform import system
from subprocess import Popen
from sys import argv
from sys import stderr
# Platform detection: the Java launcher is named differently on Windows.
IS_WINDOWS = system() == 'Windows'
JAVA_BIN_PATH = 'java'
if IS_WINDOWS:
    JAVA_BIN_PATH = 'java.exe'

# Directory (relative to the working directory) holding the Stanford NER
# distribution that the wrapper changes into before invoking Java.
STANFORD_NER_FOLDER = 'stanford-ner'
def arg_parse():
    """Build the command-line parser for the wrapper.

    Returns:
        An ``ArgumentParser`` that accepts ``-f/--filename`` (a string,
        ``None`` when omitted) and the ``-v/--verbose`` flag.
    """
    parser = ArgumentParser('Stanford NER Python Wrapper')
    # (flags, keyword settings) for every supported option.
    option_table = (
        (('-f', '--filename'), {'type': str, 'default': None}),
        (('-v', '--verbose'), {'action': 'store_true'}),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    return parser
def debug_print(log, verbose):
    """Print ``log`` to standard output, but only when ``verbose`` is truthy.

    Args:
        log: The object to print.
        verbose: When falsy, the call is a no-op.
    """
    if not verbose:
        return
    print(log)
def process_entity_relations(entity_relations_str, verbose=True):
    """Split Ollie-style relation strings into their fields.

    For each input string, the text between the first ``(`` and the first
    ``)`` is taken and split on ``;``. The pieces are returned as-is, without
    stripping surrounding whitespace.

    Args:
        entity_relations_str: Iterable of strings, each expected to contain a
            parenthesised, semicolon-separated tuple.
        verbose: Accepted for interface compatibility; not used.

    Returns:
        A list with one list of string fields per input string.
    """
    def _fields(line):
        start = line.find("(") + 1
        stop = line.find(")")
        return line[start:stop].split(';')

    return [_fields(line) for line in entity_relations_str]
def stanford_ner(filename, verbose=True, absolute_path=None):
    """Run the Stanford NER classifier on a text file and parse its output.

    A shell command is built that changes into ``STANFORD_NER_FOLDER``
    (optionally after changing into ``absolute_path`` first), runs
    ``edu.stanford.nlp.ie.NERClassifierCombiner`` with the
    ``tabbedEntities`` output format and redirects its standard output to
    ``out.txt`` one level above the NER folder. That file is then read,
    deleted and parsed.

    Args:
        filename: Path of the text file to analyse. When ``absolute_path`` is
            ``None`` it is prefixed with ``../`` because Java is started from
            inside ``STANFORD_NER_FOLDER``; otherwise it is passed unchanged.
        verbose: When truthy, the command is printed, the subprocess' stderr
            is left attached, and the raw output lines are pickled to
            ``out.pkl`` in the current working directory.
        absolute_path: Optional directory that contains
            ``STANFORD_NER_FOLDER``. A trailing path separator is optional.

    Returns:
        A list of ``[entity_name, entity_type]`` pairs (both stripped), one
        per output line that carries a non-empty name and type.

    Raises:
        AssertionError: If the shell command exits with a non-zero status.
    """
    out = 'out.txt'

    steps = []
    if absolute_path is not None:
        steps.append('cd {}'.format(absolute_path))
    else:
        # Java runs from inside STANFORD_NER_FOLDER, one level below the
        # directory the caller's relative path refers to.
        filename = '../{}'.format(filename)
    steps.append('cd {}'.format(STANFORD_NER_FOLDER))

    # The Java classpath separator is platform specific (':' on Unix,
    # ';' on Windows); os.pathsep matches it.
    classpath = os.pathsep.join(['*', 'lib/*'])
    # NOTE(review): filename and absolute_path are interpolated unquoted into
    # a shell command line; paths containing spaces or shell metacharacters
    # are not supported.
    steps.append('{} -mx1g -cp "{}" edu.stanford.nlp.ie.NERClassifierCombiner '
                 '-ner.model classifiers/english.all.3class.distsim.crf.ser.gz '
                 '-outputFormat tabbedEntities -textFile {} > ../{}'
                 .format(JAVA_BIN_PATH, classpath, filename, out))

    # '&&' is understood by both POSIX shells and cmd.exe, and it prevents
    # Java from being started in the wrong directory after a failed 'cd'.
    command = ' && '.join(steps)

    if verbose:
        debug_print('Executing command = {}'.format(command), verbose)
        java_process = Popen(command, stdout=stderr, shell=True)
        java_process.wait()
    else:
        # Silence the subprocess' stderr; the handle is closed once it exits.
        with open(os.devnull, 'w') as devnull:
            java_process = Popen(command, stdout=stderr, stderr=devnull, shell=True)
            java_process.wait()

    # Explicit raise instead of 'assert' so the check survives 'python -O';
    # the exception type is kept for callers that catch AssertionError.
    if java_process.returncode:
        raise AssertionError('ERROR: Call to stanford_ner exited with a non-zero code status.')

    if absolute_path is not None:
        # os.path.join copes with absolute_path lacking a trailing separator.
        out = os.path.join(absolute_path, out)

    with open(out, 'r') as output_file:
        results_str = output_file.readlines()
    os.remove(out)

    results = []
    for res in results_str:
        fields = res.split('\t')
        if len(fields) < 2:
            # Blank line or a line without a type column: nothing to extract.
            continue
        entity_name = fields[0].strip()
        entity_type = fields[1].strip()
        if entity_name and entity_type:
            results.append([entity_name, entity_type])

    if verbose:
        with open('out.pkl', 'wb') as pickle_file:
            pickle.dump(results_str, pickle_file)
        debug_print('wrote to out.pkl', verbose)
    return results
def main(args):
    """Command-line entry point: tag the entities of the given text file.

    Parses ``args[1:]`` with the parser from ``arg_parse``, runs
    ``stanford_ner`` on the requested file and prints one line per entity:
    the entity name left-justified to 20 characters, a tab, then its type.

    Args:
        args: The full argument vector, including the program name at
            index 0 (which is skipped).

    Returns:
        None. When no filename is supplied, a message is printed and the
        program exits with status 1.
    """
    options = arg_parse().parse_args(args[1:])
    filename, verbose = options.filename, options.verbose
    debug_print(options, verbose)

    if filename is None:
        print('please provide a text file containing your input. Program will exit.')
        exit(1)

    # debug_print itself is a no-op unless verbose is set.
    debug_print('filename = {}'.format(filename), verbose)

    rows = []
    for entity in stanford_ner(filename, verbose):
        rows.append(entity[0].ljust(20) + '\t' + entity[1])
    print('\n'.join(rows))
# Run the wrapper only when executed as a script; main's return value is
# forwarded as the process exit status.
if __name__ == '__main__':
    exit_status = main(argv)
    exit(exit_status)