-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword.py
executable file
·728 lines (663 loc) · 32.9 KB
/
word.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
#!/usr/bin/env python3.6
#########/
# word searches thesauruses and reverse dictionaries with human-readable queries
# Copyright (C) 2018 Seamus Johnston https://seamusjohnston.com
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#########\
# NB: this program was written, in part, to help a friend who is learning
# to code. For that reason, it has comments out the wazoo.
# HOW TO READ THIS FILE
# like many programs, it starts executing from near the bottom
# skip to line 684-ish if you want to see what code runs first
# from here down to there are a bunch of variables and functions
# get code from other libraries that we'll need
import sys, re, requests
from subprocess import call
from colorama import init as colorama_init
# re is a library for regular expressions
# sys(tem) is a library for interacting with the computer "outside" the program
# requests is an HTTP library for talking to websites on the internet
# subprocess is a library for running other programs (any programs, not just python)
# colorama is a library for displaying ANSI escape codes correctly on Windows
# notice that in some cases we import the entire library (import _libraryname_)
# and in others we import only a function or two (from _libraryname_ import _function_)
# in colorama we even rename the function (because init() is too vague, imo)
#### CONSTANTS
# Here we declare global constants that will be used throughout the program
# They are in UPPERCASE by convention (unlike some languages, you can't make
# constants immutable in python -- in C++ for example, we'd do const MY_VAR = 5
# and the const keyword would forbid MY_VAR from ever being changed later)
# Help text printed when the program is started with no arguments, -h, or --help
# putting this string in triple quotes means I can put line breaks in it
USAGE = """
Usage: word [options] [primary modifiers] <word|phrase>
[primary|secondary modifiers <word|phrase|number>]
Commandline interface to Datamuse and Owlbot APIs
-v[vv] verbose modes.
-h, --help Print this help.
Primary modifier examples:
meaning "word meaning feeling tired"
sounds like "word which sounds like tung"
"word sounding like doe but spelled differently"
rhymes with "word rhymes with culminate"
comes after "word comes after sea and that rhymes with norse"
spelled like "word spelled like 'cens?r'"
defined "word nostrum defined"
pronunciation "word pronunciation of otolaryngology"
Secondary modifiers:
max "word like beautiful max 7" (default is 20)
about "word meaning refuse about trash" (max is 5 nouns)
"word meaning refuse about negotiation contracts"
word tries to guess your intent. If it messes up, use quotes.
word meaning of life == ml=of+life
word "meaning of life" == ml=meaning+of+life
Optionally, read <<CONTRIBUTING.md>> and open an issue,
so I can fix it for the next person.
"""
# Variables for creating the API request
# API means application programming interface
# (basically, websites made for computers to read, not humans)
# Default number of results to ask for when the user gives no "max" modifier.
# It is a string (not an int) because it is sent as-is as a GET parameter value.
MAXIMUM = "20"

# Maps each Datamuse GET parameter to a plain-english phrase.
# go_fetch() glues these together after the words "You asked for words which",
# so every phrase is written in the plural to read naturally in that sentence
# (e.g. "You asked for words which usually precede havoc").
GLOSS = {
    "ml": "have a meaning like",
    "sl": "sound like",
    "sp": "are spelled like",
    # rel_jja=gradual returns nouns such as "increase"
    "rel_jja": "are nouns often described by the adjective",
    # rel_jjb=beach returns adjectives such as "sandy"
    "rel_jjb": "are adjectives often describing the noun",
    "rel_syn": "are synonyms of",
    "rel_trg": "are triggered by",
    "rel_ant": "are antonyms of",
    "rel_spc": "are a more general word for",
    "rel_gen": "are a specific kind of",
    "rel_com": "are the parts of",
    "rel_par": "describe things made with",
    "rel_bga": "usually follow",
    "rel_bgb": "usually precede",
    "rel_rhy": "are perfect rhymes of",
    "rel_nry": "are approximate rhymes of",
    "rel_hom": "are homophones of",
    "rel_cns": "have the same consonants as",
    "lc": "often follow",
    "rc": "often come before",
    "topics": "are about",
}
# in RESTful APIs, data is exchanged via http using a limited number of "verbs"
# you are familiar with GET from sites like youtube:
# https://www.youtube.com/watch?v=yO3MwSbs8&list=WIL&index=89 is a GET request
# v, list, and index are parameters, while yO3MwSbs8, WIL, and 89 are values
# GLOSS is a matching dictionary of parameters and what those parameters mean
#
#### END CONSTANTS
#### HELPER FUNCTIONS
#
# every way we know of to (mis)spell "rhymes"
# a frozenset is an unchangeable set: checking "is x in the set?" is one quick
# lookup, no matter how many spellings we add later
_RHYMES_SPELLINGS = frozenset((
    "rhymes", "rhytms", "rhytems", "ryhms", "rhyms", "rhymnes", "ryhmes",
    "rhimes", "rymes", "rhtyms", "ryhtyms", "rhyemes", "rhymmes", "rymhs",
    "rhmes", "rhyhms", "rhytams", "ryphmes",
))


def is_rhymes(word):
    """ Rhymes is hard to spell :3

    Return True if word is "rhymes" or one of its common misspellings
    (upper/lower case does not matter), otherwise return False.
    """
    # lower() converts word to lowercase, "in" checks it against every spelling
    # the result of "in" is already True or False, so we can return it directly
    return word.lower() in _RHYMES_SPELLINGS
# the point of returning true or false from a function is that
# we can use this function later inside an "if" statement
# every way we know of to (mis)spell "pronounced" and "pronunciation"
# stored in a frozenset so the check below is a single lookup instead of a loop
_PRONOUNCED_SPELLINGS = frozenset((
    "pronounced", "pronunciation", "pronounsed", "pronouced", "pronouned",
    "pronounciated", "prenounced", "prounouced", "pernounced", "purnounced",
    "pronoused", "pronuced", "pronunced", "pronnounced", "pronanced",
    "prononced", "prounounced", "prononsed", "prononuced", "pernunciation",
    "prononciation", "prounciation", "pronouciation",
    "pronounciation", "pronanciation", "prononcation", "pernounciation",
    "prononceation", "prenunciation", "prononseation", "prounouciation",
    "pronuniation", "pronunication", "prenounciation", "pronuntiation",
    "pronuncition", "pronociation", "prenunsiation", "pronounsation",
    "pronounceation", "pronounication", "pronauciation", "pronounciacion",
    "pronounsiation",
))


def is_pronounced(word):
    """ Pronounced is also hard to spell

    Return True if word is "pronounced", "pronunciation", or one of their
    common misspellings (upper/lower case does not matter), else False.
    """
    return word.lower() in _PRONOUNCED_SPELLINGS
def convert_num(num):
    """ Let's let the user enter alphabetical numbers to set the max results they want

    num -- what the user typed after "max", e.g. "five", "Five" or "15"

    Returns the number as a numeric string ("5", "15"), or None if num
    cannot be understood as a whole number.
    """
    # create a dictionary mapping alphabetical to numeric
    nums = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
            "seven": 7, "eight": 8, "nine": 9, "ten": 10}
    # ignore capitals and stray spaces so "Five" and " ten " work too
    # (the isinstance check lets a real int like 7 pass through untouched)
    key = num.strip().lower() if isinstance(num, str) else num
    # if user's number is spelled out (one through ten)
    if key in nums:
        # (i.e. if key is "five", nums[key] will be 5)
        return str(nums[key])
    # otherwise, let's assume they entered a numeric string (like "15")
    try:
        # we convert it to an int before turning it back into a str just to make sure
        # it really is a number -- if it's not, python will raise a ValueError
        # (or a TypeError if it isn't even text, e.g. None)
        return str(int(key))
    # if they entered something silly like "max elephants"
    except (TypeError, ValueError):
        # set user's max to be nothing, i.e. false
        return None
def parse(args, query):
    """ Parse the commandline args into a dictionary data structure.

    args  -- list of the words the user typed (any -v flag already removed).
             The synonym-style parsing below uses the list up, one phrase at
             a time, so it is empty when this function returns from there.
    query -- dictionary of GET parameters to add to (normally empty).

    Returns a dictionary of GET parameters, e.g. {"ml": "egg beater", "max": "35"}.
    When the user asked for a definition or a pronunciation, the global
    query_type is also set to "DEF" or "PRO".
    """
    global query_type

    # Deal first with requests for definition or pronunciation
    # 1. Make the code easier to read
    first_word = args[0] if len(args) > 0 else ""
    second_word = args[1] if len(args) > 1 else ""
    third_word = args[2] if len(args) > 2 else ""
    fourth_word = args[3] if len(args) > 3 else ""
    # we use the ternary operator (this if ____ else that) to avoid an IndexError
    # IndexError would be raised if we tried to access the second element (args[1])
    # in a list which contained only one item (eg args == ["lonely"])
    # the ternary operator (in most languages it looks like "____ ? this : that")
    # returns "this" when the if is true and "that" when the if is false
    # meaning, if len(args) is NOT greater than 1, second_word == ""
    # (the same trick on first_word means an empty list of args is safe too)

    # 2. Check for keywords in the list of arguments
    # Example: nostrum defined
    # Example: pronunciation of otolaryngology
    if first_word == "define":
        # e.g. if the first word is "define" we'll add the second word to the query
        query = {"sp": second_word, "md": "d", "max": "1", "qe": "sp", "ipa": "1"}
        # the query is a dictionary of GET parameters for the http request, eg
        # https://api.datamuse.com/words?max=1&sp=SECOND_WORD_HERE&qe=sp&md=d&ipa=1
    elif second_word == "defined" or second_word == "definition":
        query = {"sp": first_word, "md": "d", "max": "1", "qe": "sp", "ipa": "1"}
    # this one uses string interpolation (the f"" stuff)
    elif f"{second_word} {third_word}" == "means what":
        query = {"sp": first_word, "md": "d", "max": "1", "qe": "sp", "ipa": "1"}
    elif f"{second_word} {third_word} {fourth_word}" == "is said how":
        query = {"sp": first_word, "md": "r", "max": "1", "qe": "sp", "ipa": "1"}
    # i.e. if the second_word is exactly "of" or "for"
    # (a whole-word test, so that "definition forest" is not mistaken
    # for "definition for ...")
    elif first_word == "definition" and second_word in ("of", "for"):
        query = {"sp": third_word, "md": "d", "max": "1", "qe": "sp", "ipa": "1"}
    # the is_pronounced function returns true if first_word is a (mis)spelling of pronounced
    elif second_word in ("of", "for") and is_pronounced(first_word):
        query = {"sp": third_word, "md": "r", "max": "1", "qe": "sp", "ipa": "1"}
    # the ordering in the above list is not entirely random
    # since an if-elif-else statement won't keep evaluating after it finds a match
    # it makes sense to put the most computationally complex clauses at the end
    # >>> import timeit
    # >>> timeit.timeit('from word_helpers import is_pronounced; is_pronounced("pronounced")', number=10000)
    # 0.022870146989589557
    # >>> timeit.timeit('args = ["defined"]; args[0] == "defined"', number=10000)
    # 0.002359684993280098
    # it takes 2 milliseconds to compare a string in a list 10,000 times
    # -- versus 2 centiseconds to run is_pronounced 10,000 times
    # (on my Intel Core i5 2.67GHz CPU -- obviously speed depends on the processor)
    # it's also worth noting that readability counts more than speed optimization (most of the time!)

    # Quick way to check if any of the above if statements matched
    if "sp" in query:
        # if so, we are done in this function
        # (get() returns None instead of raising a KeyError if "md" is missing)
        if query.get("md") == "r": query_type = "PRO"
        if query.get("md") == "d": query_type = "DEF"
        return query

    # these will be useful later
    STOP_WORDS = ("and", "meaning", "means", "max", "about", "which", "that")
    # Datamuse allows a max of five topic words
    MAX_TOPICS = 5

    # Parse more complicated requests for synonyms, etc
    # an empty list is false in python, so this loop will run until we've removed all the args
    while args:
        # we must reset these vars each time the loop starts
        # in case we've deleted items from the args list
        first_word = args[0]
        second_word = args[1] if len(args) > 1 else ""
        third_word = args[2] if len(args) > 2 else ""

        # Disambiguate homonym requests from spelling correction requests
        # Example: sounding like tung
        # Example: sounds like doe but spelled differently
        # (fullmatch means the WHOLE phrase must match the pattern,
        # so "sounds likely" does not count as "sounds like")
        if re.fullmatch(r'sound((s)|(ing)) like', f"{first_word} {second_word}"):
            # again, use len(args) to avoid an IndexError
            if len(args) >= 6 and \
               re.fullmatch(r'((but)|(except)) spelled different(ly)?', f"{args[3]} {args[4]} {args[5]}"):
                # but instead of ternary operator,
                # use "short circuit logic" -- when python sees "if __A__ and __B__ ",
                # it knows that if A is false, the whole thing will be false
                # (you can't have "ice cream and potatoes" for dinner if you don't have ice cream)
                # and it won't waste time evaluating B, so the regex won't run and args[4]
                # won't be accessed and no IndexError will be raised, yay!
                # regex explained: ? means the prior thing matched zero or one times
                # different(ly)? matches "different" and "differently"
                query["rel_hom"] = third_word
                # now, delete 6 items from args, starting at item 0
                del args[0:6]
            else:
                query["sl"] = third_word
                del args[0:3]
        # Example: spelled like 'cens?r'
        elif re.fullmatch(r'spell((ed)|(ing)) like', f"{first_word} {second_word}"):
            # two stars (**) means "unpack" a dictionary
            # just like unpacking a suitcase, we've dumped the old contents of query
            # into a new dictionary (which we are saving with the same variable name!)
            query = {**query, "sp": third_word}
            # query["sp"] = third_word also works fine
            # just showing off how to combine two dictionaries :)
            del args[0:3]
        # Example: rhymes with culminate
        elif len(args) > 2 and second_word == "with" and is_rhymes(first_word):
            query["rel_rhy"] = third_word
            del args[0:3]
        # Example: almost rhymes with culminate
        elif len(args) > 3 and \
             f"{first_word} {third_word}" == "almost with" and \
             is_rhymes(second_word):
            query["rel_nry"] = args[3]  # fourth_word
            del args[0:4]
        # Example: comes after sea
        elif f"{first_word} {second_word}" == "comes after":
            query["lc"] = third_word
            del args[0:3]
        elif first_word == "follows":
            query["lc"] = second_word
            del args[0:2]
        elif f"{first_word} {second_word}" == "comes before":
            query["rc"] = third_word
            del args[0:3]
        elif first_word == "preceeds":
            query["rc"] = second_word
            del args[0:2]
        # Example: describes paint
        elif first_word == "describes":
            query["rel_jjb"] = second_word
            del args[0:2]
        # Example: associated with feet
        elif f"{first_word} {second_word}" == "associated with" or \
             f"{first_word} {second_word}" == "triggered by":
            query["rel_trg"] = third_word
            del args[0:3]
        # Example: meaning feeling tired
        elif first_word in ["means", "meaning", "like"]:
            # get rid of first_word
            del args[0]
            # now short circuit logic again, plus using the tuple from earlier
            # b/c if we have "meaning deer and sounds like roe" we don't want
            # query["ml"] == "deer and sounds like roe" -- it should be just "deer"
            while args and args[0] not in STOP_WORDS:
                # ternary operator prevents KeyError if "ml" not already in query dictionary
                query["ml"] = f"{query['ml']} {args[0]}" if "ml" in query else args[0]
                del args[0]
            # an example with the previous code to make things clearer
            # say args == ["means", "egg", "beater", "and", "max", "35"]
            # first_word IS in ["means","meaning","like"]
            # del first_word, args is now ["egg", "beater", "and", "max", "35"]
            # args is not empty, args[0] is NOT in STOP_WORDS
            # "ml" is NOT in query, so ternary returns args[0] ("egg")
            # args[0] is copied to query["ml"] (query is now {ml: "egg"})
            # del args[0], args is now ["beater", "and", "max", "35"]
            # return to top of while loop, args[0] is NOT in STOP_WORDS
            # "ml" IS in query, so ternary returns f"{query['ml']} {args[0]}" ("egg beater")
            # (query is now {ml: "egg beater"})
            # del args[0], args is now ["and", "max", "35"]
            # return to top of while loop,
            # args[0] IS in STOP_WORDS (args[0] == "and")
            # DO NOT enter the while loop, continue past this code block
        # Discover the topic of our query
        elif first_word == "about":
            del args[0]
            # count the topic words already stored, so we never send more than
            # five in total (even if the user types "about" twice)
            # any extra words are left in args and handled by the next trip
            # around the big loop
            while args and args[0] not in STOP_WORDS and \
                  len(query.get("topics", "").split()) < MAX_TOPICS:
                query["topics"] = f"{query['topics']} {args[0]}" if "topics" in query else args[0]
                del args[0]
        # How many results to return (between 1 and 1000)
        elif first_word in ["max", "maximum", "only"]:
            user_max = convert_num(second_word)
            # zero or a negative number of results makes no sense, so ignore those
            if user_max and 1 <= int(user_max) <= 1000:
                query["max"] = user_max
            # remove the keyword and its number even when the number was unusable
            # (otherwise we would loop over the same two words forever)
            del args[0:2]
        # Remove filler words if they weren't parsed out above
        elif first_word in ["that", "which", "and", "like", "is"]:
            del args[0]
        # Add anything not otherwise parsable to the ml parameter
        else:
            query["ml"] = f"{query['ml']} {first_word}" if "ml" in query else first_word
            del args[0]
        # this is the bottom of that massive while loop
        # if args is not empty by now, we'll start over from the top ^
    return query
# and this is the end of the "def parse(args, query)" function
# whew!
def go_fetch(query, timeout=10):
    """ Turn the query dictionary into a real http request using the requests library!

    query   -- dictionary of GET parameters built by parse()
               (a default "max" is added to it if the user did not give one)
    timeout -- how many seconds to wait for each web server before giving up

    Returns a list of response objects: the Datamuse response first, then the
    Owlbot response if a definition was asked for.
    Raises requests.exceptions.RequestException if a server cannot be reached.
    """
    # quote() makes text safe to put inside a web address (e.g. "/" becomes "%2F")
    # it is imported here because this is the only function which needs it
    from urllib.parse import quote
    # query_type and verbose are set elsewhere in the program; we only read them
    # (GLOSS and MAXIMUM are only read too, so they need no "global" statement)
    global query_type
    global verbose
    responses = []
    explained = ""
    if query_type == "PRO":
        explained = f"You asked for the pronunciation of '{query['sp']}'."
    elif query_type == "DEF":
        explained = f"You asked for the definition of '{query['sp']}'."
    else:
        # loop through the dictionary, one key at a time, and explain what each entry is for
        query_glossed = []
        for param in query:
            # if this one has no explanation (max, md (metadata), etc), skip it
            if param not in GLOSS: continue
            # it's not an accident that the keys in query are the same as the keys in GLOSS
            query_glossed.append(f"{GLOSS[param]} {query[param]}")
            # eg GLOSS has {"sp": "are spelled like"} and query has {"sp": "dear"}, then
            # query_glossed[0] == f"{GLOSS['sp']} {query['sp']}" == "are spelled like dear"
        explained = "You asked for words which " + " and ".join(query_glossed)
    # Let's set a default
    if "max" not in query: query["max"] = MAXIMUM
    # there's a TON of stuff going on in this line
    datamuse = requests.get('https://api.datamuse.com/words', params=query, timeout=timeout)
    # first, the requests library's get() function "urlencodes" the url and parameters
    # e.g. if query == {"ml": "ringing in the ears"}, it becomes "?ml=ringing+in+the+ears"
    # next, it opens an http connection to datamuse.com, something like:
    # * Trying 54.225.209.164...
    # * Connected to api.datamuse.com (54.225.209.164) port 443 (#0)
    # (the timeout stops us waiting forever if the server never answers)
    # then, it sends an http request which consists of a "header" and (optionally) a "body"
    # which looks something like this:
    #
    # GET https://api.datamuse.com/words?ml=ringing+in+the+ears
    # Connection: 'keep-alive'
    # Accept: */*
    # User-Agent: python-requests/2.18.4
    # Accept-Encoding: gzip, deflate
    #
    # and the datamuse API sends back a response which looks something like:
    # HTTP/1.1 200 OK
    # Cache-Control: no-transform, max-age=86400
    # Content-Type: application/json
    # Date: Fri, 02 Feb 2018 02:53:45 GMT
    # Vary: Accept-Encoding
    # Content-Length: 4634
    # Connection: keep-alive
    #
    # [{"word":"tinnitus","score":51691,"tags":["syn","n"]},{"word":"ring",". . .
    #
    # then the response is parsed into a python object
    # (sticks the headers in one variable, the body into another, etc)
    # and the object is returned from get() and we store it in "datamuse"
    # finally, we stick the response object into a list, like so:
    responses.append(datamuse)
    # If a definition is asked for, we'll use two APIs
    if query_type == "DEF":
        # here the word is part of the address itself (not a parameter),
        # so requests won't escape it for us -- we do it ourselves with quote()
        word = quote(query['sp'], safe='')
        owlbot = requests.get(f"https://owlbot.info/api/v2/dictionary/{word}", timeout=timeout)
        responses.append(owlbot)
    # print out helpful info if the user asked for it
    if verbose: print(explained)  # Plain english description of our query
    return responses
def fortune_cookie():
    """ Give the user something nice if the query fails :)

    Returns the text of a fortune taken from www.bsdfortune.com, or None if
    the site cannot be reached or the fortune cannot be found in the page.
    """
    # a fortune is only a bonus, so any network trouble just means "no fortune"
    try:
        r = requests.get('http://www.bsdfortune.com', timeout=10)
    except requests.exceptions.RequestException:
        return None
    if r.status_code != requests.codes.OK:
        return None
    # a regular expression in python can be "compiled"
    # which a) makes it a tiny bit faster (important if you are using the same one many times)
    # and b) gives access to some more advanced features, like re.MULTILINE
    # www.bsdfortune.com doesn't have an API, so this regex is for getting the fortune out
    # of the source code of a human-readable webpage
    rx = re.compile(r'http://www\.aasted\.org -->\n(.*)<br/> \n</p>\n<a href="./" rel="self" title="BSD Fortune">',re.MULTILINE|re.DOTALL)
    s = rx.search(r.text)
    # search() returns None when nothing matches
    # (for example if the website changes its layout one day)
    if s is None:
        return None
    # then remove the "<br/>" tags (i.e. replace them with nothing, lol)
    # s.group(1) is the text caught by the first (and only) capturing group
    # in the regex above (in this case, the text of the fortune)
    quote = s.group(1).replace('<br/>', '')
    return quote
def print_response(responses):
    """ Turn JSON formatted responses into nice printable output.

    responses -- list of response objects, as returned by go_fetch().
                 Failed and empty responses are removed from this list.

    Prints a definition, a pronunciation, or a table of words, depending on
    the global query_type. Exits the program (status 1) when not a single
    response has any results in it.
    """
    # the "global" keyword tells python that these variables are defined
    # *outside* our print_response() function
    global query_type
    global verbose
    # you should mostly avoid global variables, but they are sometimes handy
    # these two are only needed here, so we import them here
    from subprocess import CalledProcessError, check_output

    connection_error, empty_results = False, False
    # First, check if we have gotten any errors when connecting to the api
    # we collect the good responses in a NEW list, because deleting items from
    # a list while looping over that same list makes the loop skip items
    usable = []
    for response in responses:
        # an http status code is a number sent from the web server
        # everyone knows the dreaded "404" (not found)
        # there is also 200 (ok), 503 (service unavailable), 418 (i'm a teapot -- not joking!)
        # and dozens of others
        if response.status_code != requests.codes.OK:
            connection_error = True
        # we also check if the response is empty
        # (that means the api found no words matching our query)
        elif response.json() == []:
            empty_results = True
        else:
            usable.append(response)
    # now swap the contents of the original list for the good responses only
    responses[:] = usable

    # this is because Windows doesn't understand ANSI color codes >:(
    # e.g. \033[0;36m means "turn the text after me blue" -- but windows is like "??"
    # so the colorama library translates the ANSI codes
    colorama_init()
    if not responses:
        if connection_error:
            print("\033[0;36mUnable to reach API.\033[0m Check your internet connection or try again with more feeling.")
            sys.exit(1)
        # if the user has the BSD 'fortune' program installed, use it
        try:
            # check_output() hands us what the program printed, as a string
            # (call() would only give us its exit code, which is just a number)
            fortune = check_output(['fortune', '-s'], universal_newlines=True).strip()
        except (OSError, CalledProcessError):
            # otherwise (not installed, or it failed), get a fortune from the web
            fortune = fortune_cookie()
        if fortune:
            print("\033[0;36mNo results found!\033[0m Have a fortune cookie:")
            print(fortune)
        else:
            print("\033[0;36mNo results found!\033[0m Try a paper dictionary instead?")
        sys.exit(1)

    # quick note about JSON before we dive in further
    # json is a method of representing arbitrarily complex objects
    # it comes from javascript (JavaScript Object Notation)
    # like most js stuff it is excellently useful and a touch unholy
    # together with xml, yaml, and csv, it is the commonest way of
    # making text data machine-readable
    # to help you understand, here are some examples of json objects
    #
    # [ {'type': 'noun', 'definition': 'available money; cash.', 'example': None},
    # {'type': 'adjective', 'definition': 'willing or eager to do something.',
    # 'example': 'she is ready to die for her political convictions'} ]
    # a list containing two dictionaries
    # each dictionary contains keys of 'type', 'definition', and 'example'
    #
    # [ {'word': 'ready', 'score': 2147483647, 'tags': ['query'],
    # 'defs': ['n\tpoised for action', 'v\tprepare for eating by applying heat'] } ]
    # a list containing one dictionary with keys 'word','score','tags', and 'defs'
    # notice that the value of 'tags' and 'defs' are both lists!
    #
    # [ {'word': 'devil', 'score': 2147483647,
    # 'tags': ['query', 'pron:D EH1 V AH0 L ', 'ipa_pron:dˈɛvʌɫ'] } ]
    # a list containing one dictionary with keys 'word', 'score', and 'tags'
    #
    # [ {'word': 'coleslaw', 'score': 26424, 'tags': ['n']},
    # {'word': 'dressing', 'score': 26424, 'tags': ['n']},
    # {'word': 'greens', 'score': 26424, 'tags': ['n'] } ]
    # you can read this one by yourself :)
    if query_type == "DEF":
        for response in responses:
            # print out helpful info if the user asked for it
            if verbose > 1: print(response.url)  # What we asked the remote server
            if verbose > 2: print(response.text)  # The raw return JSON
            lines = []
            # check if this is the datamuse API or the owlbot API
            if re.search(r'datamuse', response.url):
                api = "datamuse"
                # the json() function turns the raw response (bytes of data)
                # into python lists, dictionaries, etc (like demonstrated above)
                # we take the first item in the list [0] because a dictionary query
                # only has one entry (the word and its definition)
                payload = response.json()[0]
                word = payload["word"]
                # get() gives us an empty list if datamuse knows the word
                # but has no definitions for it (payload['defs'] would crash)
                for entry in payload.get('defs', []):
                    # get the word type and its definition out of the string
                    found = re.match(r'([^\\]*)\t(.*)', entry)
                    if found:
                        # yes, you can have two (or more!) return values from a function in python
                        # groups() returns a tuple of all the capture groups in the regex (see below)
                        # notice that _def not def (b/c def is a keyword)
                        # and kind not type (b/c type is a built-in function)
                        kind, _def = found.groups()
                    else:
                        # no tab in the entry: treat all of it as the definition
                        kind, _def = '', entry
                    # put the kind and def back into a string :)
                    # ljust(11) is left justify by 11 spaces (neat formatted columns!)
                    lines.append(f"{kind.ljust(11)} {_def}")
                    # go back up and get another ^
                # regex explained: ([^\\]*)\t(.*)
                # () capturing group -- what we find in here, we keep, lol
                # [] character set -- match any of the characters in here
                # [^ ] negation -- do not match any of the characters in here
                # \\ *one* literal backslash -- b/c \ is special in regex \\ means \
                # * the previous thing, zero or more times
                # \t literal tab character
                # . any character at all ever -- even ones you weren't thinking about when you typed it :D
                # all together: anything which is not a \, followed by a \t, followed by anything
                # capture the first bit (kind), forget the \t, capture the second bit (_def)
            else:
                api = "owlbot"
                payload = response.json()
                found = re.search(r'dictionary/(.*)$', response.url)
                word = found.groups()[0] if found else ''
                # regex explained: $ means "end of the line"
                # it's not a character like \n or \r
                # it is an anchor (^ means "start of the line")
                for entry in payload:
                    line = f"{entry['type'].ljust(11)} {entry['definition']}"
                    # get() returns None if the entry has no 'example' key at all
                    example = entry.get('example')
                    # ' ' * 12 means insert 12 spaces
                    if example: line += f"\n{' ' * 12}Example:{example}"
                    lines.append(line)
            # now join all the lines together with a new line character (\n) between them
            definition = '\n'.join(lines) if lines else "(no definition available)"
            # lots of work, but now we print it! \o/
            print(f"\033[0;36m{api}\033[0m says word \033[0;32m{word}\033[0m means")
            print(definition)
    # elif (not a second "if") so that a definition is not ALSO printed as a word table
    elif query_type == "PRO":
        # print out helpful info if the user asked for it
        if verbose > 1: print("The answer came from: ", responses[0].url)
        if verbose > 2: print("The raw JSON response was: ", responses[0].text)
        # no for loop and only one response (responses[0])
        # (b/c we use only one API for everything except dictionary lookups)
        payload = responses[0].json()[0]
        word = payload["word"]
        # start with placeholders, in case datamuse sends no pronunciation tags
        pron = ipa = "unknown"
        for tag in payload.get('tags', []):
            if tag.startswith('pron:'):
                # keep everything after the "pron:" label
                pron = tag[len('pron:'):]
            elif tag.startswith('ipa_pron:'):
                ipa = tag[len('ipa_pron:'):]
        pronunciation = f"\033[0;32m{pron}\033[0m (\033[0;32m{ipa}\033[0m)"
        print(f"\033[0;36mdatamuse\033[0m says word \033[0;32m{word}\033[0m is pronounced like {pronunciation}")
    else:
        # print out helpful info if the user asked for it
        if verbose > 1: print("The answer came from: ", responses[0].url)
        if verbose > 2: print("The raw JSON response was: ", responses[0].text)
        payload = responses[0].json()
        # this will be fun to explain but. . .
        # 1. go through each entry. if it has tags (a list), turn the list into a string
        for entry in payload:
            entry['tags'] = ', '.join(entry['tags']) if 'tags' in entry else ''

        # 2. create a function which takes one argument (entry -- a dictionary)
        # and returns a formatted string with justification and coloring
        def fentry(entry):
            return (f"\033[0;32m{entry['word'].rjust(13)}\033[0m "
                    f"\033[0;36m{entry['tags'].rjust(13)}\033[0m ")

        # 3. for each entry in the payload list, run fentry(entry)*
        # (all the entries are now formatted as strings)
        entries = list(map(fentry, payload))
        # 4. starting at 0, go up to len(entries)-1 in steps of 3 (0,3,6,9. . .)
        # for each step *i*, take a slice of entries from i to i+3
        # join them together
        # this creates a single string containing three list entries
        # lines hands out those strings one at a time (it's a generator)
        lines = (''.join(entries[i:i+3]) for i in range(0, len(entries), 3))
        print("\033[0;36mdatamuse thinks these words may help!\033[0m".rjust(94))
        # 5. join the lines together with \n in between each
        print('\n'.join(lines))
# * extra note here about map()
# since you are interested in data stuff :3
# there's two very common data operations
# one is "for every datum, do something to it"
# another is "keep some data, get rid of others"
# the first is usually called map
# the second is called filter
# python has functions for both of them (helpfully called map() and filter(), tada!)
# both take two arguments: a function and a list (or tuple or dictionary)
# eg. filter(my_function,my_list)
# with map, the function should take one argument, transform it, and return it
# eg. def my_function(x):
# return x + 3
# (or my_function = lambda x: x + 3)
# that function adds three but you can do any kind of (very complex) transforms
# with filter, the function should take one argument, return true if it should be kept,
# or false if not
# eg. def my_function(x):
# if x > 34.99: return True
# else: return False
# (or my_function = lambda x: True if x > 34.99 else False )
#
# the tricky bit is that neither map() nor filter() returns your data (huh?)
# they return iterators
# what's an iterator, sam?
# an iterator is like a soda vending machine
# it has all the cans of pop inside,
# but you stick your quarters in and get them out one by one
# for example:
# >>> lst = [1,2,3,4,5]
# >>> map(lambda x: x + 3, lst)
# <map object at 0x7f1c78673b38> <-- this is the iterator
# . . . and here's the loop that "iterates" over it:
# >>> for item_plus_three in map(lambda x: x + 3, lst):
# ... print(item_plus_three)
# ...
# 4
# 5
# 6
# 7
# 8
#
#### END HELPER FUNCTIONS
# These two flags are read by the helper functions above (via "global").
# They are given their default values here, outside of main(), so that the
# helpers also work if this file is imported by another program.
# a flag to set if the user asks for definitions or pronunciation help
query_type = None
# verbose flag is off (set to zero/false) by default
verbose = 0


def main(argv=None):
    """ Run the program: read the commandline, ask the APIs, print the answer.

    argv -- the list of commandline words, program name first
            (if you leave it out, sys.argv is used)
    """
    global query_type
    global verbose
    # sys.argv is a list of stuff you typed to start the program
    # if you typed "word.py --help" you get sys.argv == ["word.py","--help"]
    if argv is None:
        argv = sys.argv
    # start fresh each time main() runs
    query_type = None
    verbose = 0
    # if there are no commandline arguments or if the first arg is help
    if len(argv) < 2 or argv[1] in ["-h", "--help"]:
        # print instructions
        print(USAGE)
        # and exit the program
        sys.exit()
    # copy argv starting with the second element (index 1)
    args = argv[1:]
    # Turn on verbose flag if asked (this will output helpful debugging info)
    # if first commandline arg is a dash followed by 1, 2, or 3 v's
    if re.fullmatch(r"-v{1,3}", args[0]):
        # set verbose to the number of v's (minus 1 for the dash!)
        verbose = len(args[0]) - 1
        # get rid of the flag from the list, so we don't re-read it later
        args.pop(0)
    # regex explained: -v{1,3}
    # - literal dash
    # v literal letter v
    # {1,3} the prior thing, found once, twice, or thrice (in a row)
    # fullmatch() insists that the WHOLE argument fits the pattern
    # (so we match "-v" or "-vvv" but not "-vvvabc" or "-v-x")
    # if the user typed only the flag (e.g. "word -v") there is nothing to look up
    if not args:
        print(USAGE)
        sys.exit()
    # here's the "heart" of the program <3
    # 1. turn the user input into a usable web address
    query = parse(args, {})
    # 2. go get data from that web address
    try:
        responses = go_fetch(query)
    except requests.exceptions.RequestException:
        # no internet, server down, took too long to answer, etc.
        # a friendly message is nicer than a page of python error text
        colorama_init()
        print("\033[0;36mUnable to reach API.\033[0m Check your internet connection or try again with more feeling.")
        sys.exit(1)
    # 3. print out the response we got back from the internet
    print_response(responses)


if __name__ == "__main__":
    main()