forked from moricard/hackapun
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnonpunsExtractor.py
74 lines (45 loc) · 2.25 KB
/
nonpunsExtractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 25 21:48:14 2015
@author: phc
"""
"""
NON-PUNS HOMONYMS CREATION
"""
from urllib import urlopen #To save information in a URL
import re #Import regular expression library (Regex)
""" -----------------------------Definitions------------------------------- """
def urlSave(url):
    """Open *url* and return the response object given by ``urlopen``.

    Despite the name, nothing is written to disk here: the caller receives
    the object returned by ``urlopen`` and is expected to ``read()`` it.
    """
    return urlopen(url)
""" -----------------------------Declarations------------------------------ """
# --- Remote sources ---------------------------------------------------------
# Site queried for example sentences; the looked-up word is appended directly
# to this prefix to form the request URL.
websiteSource = 'http://dictionary.reference.com/browse/'
# Alternative site, currently unused. Switching to it would require a
# different example-sentence regex.
websiteSource1 = 'http://www.thefreedictionary.com/'

# --- Local files ------------------------------------------------------------
# Input: file containing the homonyms.
homonymesFile = '/home/phc/Desktop/Dropbox/Hackapuns/hackapun/data/homophones.csv'
# Output: text file that receives the non-pun sentences, one per line.
fileNameOutput = '/home/phc/Desktop/Non_puns_List.txt'

# --- Patterns ---------------------------------------------------------------
# Captures the input one line at a time: group 1 is the line text, group 2 the
# optional trailing newline.
homoPat = re.compile(r'(.*)(\n?)')
# Captures each run of non-comma characters, i.e. one word of a
# 'homonym family'.
wordPat = re.compile(r'[^,]+')

# --- Accumulators -----------------------------------------------------------
# Words of the homonym family currently being processed.
words = []
# Every sentence that has been kept so far.
allSentences = []

# Output handle, opened in write mode (an existing file is truncated).
NONpunsFile = open(fileNameOutput, 'w')
""" ---------------------------------Script-------------------------------- """
# Read the whole homonym file up front. The context manager releases the
# input handle as soon as the text is in memory.
with open(homonymesFile) as homonymesHandle:
    homonymesList = homonymesHandle.read()
homonymes = re.findall(homoPat, homonymesList)  # One (line, newline) tuple per match

# For every word of every homonym family, fetch its page on the declared
# website and keep the example sentences that contain the word.
# The try/finally guarantees the output file is closed (and therefore
# flushed) even if a download or a regex fails part-way through.
try:
    for homo in homonymes:
        # Split the family into words. Surrounding whitespace (including a
        # stray '\r' from Windows line endings) is stripped so it cannot leak
        # into the URL or the regex; empty fragments are dropped.
        words = [piece.strip() for piece in re.findall(wordPat, homo[0])]
        words = [piece for piece in words if piece]
        for word in words:
            print(word)  # Progress trace
            url = websiteSource + word
            webPage = urlSave(url)
            try:
                pageSource = webPage.read()
            finally:
                webPage.close()  # Do not leave one open connection per word

            # Pattern of an 'example sentence' block on the page. The word is
            # escaped so that regex metacharacters in the input are matched
            # literally instead of being interpreted.
            exPatTXT = (r'(<span class="dbox-example">)(.*\s'
                        + re.escape(word)
                        + r'\s.*?\.)(<\/span>)')
            exPat = re.compile(exPatTXT, re.I)
            sentences = re.findall(exPat, pageSource)

            for sentence in sentences:
                text = sentence[1]
                # Keep sentences of at least 25 characters that carry no
                # leftover HTML markup.
                if len(text) >= 25 and '<' not in text:
                    allSentences.append(text)
                    NONpunsFile.write(text + '\n')  # One sentence per line
finally:
    NONpunsFile.close()