-
Notifications
You must be signed in to change notification settings - Fork 0
/
cloud_lemmatizer.py
72 lines (63 loc) · 2.33 KB
/
cloud_lemmatizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from googleapiclient import discovery
import httplib2
from oauth2client.client import GoogleCredentials
import os.path
import sys
from textblob import TextBlob
def analyze_syntax(text):
"""Use the NL API to analyze the given text string, and returns the
response from the API. Requests an encodingType that matches
the encoding used natively by Python. Raises an
errors.HTTPError if there is a connection problem.
"""
credentials = GoogleCredentials.get_application_default()
scoped_credentials = credentials.create_scoped(
['https://www.googleapis.com/auth/cloud-platform'])
http = httplib2.Http()
scoped_credentials.authorize(http)
service = discovery.build(
'language', 'v1beta1', http=http)
body = {
'document': {
'type': 'PLAIN_TEXT',
'language': 'ES',
'content': text,
},
'features': {
'extract_syntax': True,
},
'encodingType': get_native_encoding_type(),
}
request = service.documents().annotateText(body=body)
return request.execute()
def get_native_encoding_type():
"""Returns the encoding type that matches Python's native strings."""
if sys.maxunicode == 65535:
return 'UTF16'
else:
return 'UTF32'
def paginate(text):
max_tokens = 25000
text = TextBlob(text)
sentences = text.sentences
word_per_sentence = len(text.words) / len(sentences)
sentences_per_page = int(max_tokens / word_per_sentence)
for i in range(0, len(sentences), sentences_per_page):
yield ' '.join(str(s) for s in sentences[i:i + sentences_per_page])
for filename in sys.argv[1:]:
lemma_file = filename + '.lemmas'
if os.path.isfile(lemma_file):
print('Lemma file already exists for:', filename)
continue
print('Retrieving lemmas for:', filename)
with open(filename) as infile:
text = infile.read()
lemma_triplets = []
for i, page in enumerate(paginate(text)):
for token in analyze_syntax(page)['tokens']:
lemma_triplets.append(
(token['text']['content'], token['lemma'], token['partOfSpeech']['tag']))
print('finished page', i)
with open(lemma_file, 'w') as outfile:
for word, lemma, pos in lemma_triplets:
outfile.write('{}\t{}\t{}\n'.format(word, lemma, pos))