detector_try.py
#!/usr/bin/env python
import os
import csv
from nltk import word_tokenize, sent_tokenize
from nltk.parse.stanford import StanfordParser
from nltk.parse.stanford import StanfordDependencyParser

# Tell NLTK where to find the Java binary (absolute path).
java_path = "/usr/bin/java"
os.environ['JAVAHOME'] = java_path

# Paths into the Stanford parser distribution bundled next to this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
stanford_parser_dir = current_dir + '/stanford_NLP/stanford-parser-full-2015-04-20'
eng_model_path = stanford_parser_dir + "/edu/stanford/nlp/models/lexparser/englishRNN.ser.gz"
my_path_to_models_jar = stanford_parser_dir + "/stanford-parser-3.5.2-models.jar"
my_path_to_jar = stanford_parser_dir + "/stanford-parser.jar"

# The constituency parser is set up here, but only the dependency parser
# is used below.
parser = StanfordParser(model_path=eng_model_path,
                        path_to_models_jar=my_path_to_models_jar,
                        path_to_jar=my_path_to_jar)
dependency_parser = StanfordDependencyParser(path_to_jar=my_path_to_jar,
                                             path_to_models_jar=my_path_to_models_jar)

def save_csv_file(this_text_data, database_location):
    """Write the collected feature rows out as a CSV file."""
    with open(database_location, 'w', newline='', encoding='utf-8') as csvFile:
        csvWriter = csv.writer(csvFile, delimiter=',')
        for row in this_text_data:
            csvWriter.writerow(row)

def parse_sentence(sentence):
    """Dependency-parse one sentence and collect passive-voice features."""
    print(sentence)
    result = dependency_parser.raw_parse(sentence)
    dep = next(result)
    parsed_result = list(dep.triples())
    print(parsed_result)
    # Collect the relation labels from the (head, relation, dependent)
    # triples. Exact label matching is used so that 'nsubj' does not also
    # fire on every 'nsubjpass'.
    relations = {rel for _, rel, _ in parsed_result}
    nsubj = 'nsubj' in relations
    nsubjpass = 'nsubjpass' in relations
    auxpass = 'auxpass' in relations
    words_in_sentence = word_tokenize(sentence)
    sentence_length_in_words = len(words_in_sentence)
    longest_word = max(words_in_sentence, key=len)
    longest_word_length = len(longest_word)
    return [sentence, nsubj, nsubjpass, auxpass,
            sentence_length_in_words, longest_word, longest_word_length]

def extract_sentences(page_content):
    """Split a text into sentences and parse each one into a feature row."""
    this_text_data = [['sentence', 'nsubj', 'nsubjpass', 'auxpass',
                       'sentence_length_in_words', 'longest_word',
                       'longest_word_length']]
    for sentence in sent_tokenize(page_content):
        this_text_data.append(parse_sentence(sentence))
    return this_text_data


def get_text_from_dir(location):
    """Read and return the full contents of a text file."""
    with open(location, 'r') as local_file:
        return local_file.read()

def run_prog(dictionary_array):
    """Parse every listed file and save its sentence features to CSV."""
    print(dictionary_array)
    for dictionary in dictionary_array:
        print('Processing ' + dictionary['location'])
        text = get_text_from_dir(dictionary['location'])
        sentences_data = extract_sentences(text)
        save_csv_file(sentences_data, 'PA_output' + dictionary['results'])
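
# The module defines run_prog but never invokes it. Below is a minimal
# driver sketch, assuming hypothetical input files sample1.txt and
# sample2.txt: 'location' names the text file to read, and 'results' is
# the suffix appended to 'PA_output' to form the CSV filename
# (e.g. 'PA_output_sample1.csv').
if __name__ == '__main__':
    texts_to_process = [
        {'location': 'sample1.txt', 'results': '_sample1.csv'},
        {'location': 'sample2.txt', 'results': '_sample2.csv'},
    ]
    run_prog(texts_to_process)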