Skip to content

Commit

Permalink
Continue to the next article if text cannot be extracted
Browse files Browse the repository at this point in the history
  • Loading branch information
skillachie committed Sep 30, 2015
1 parent b400805 commit 3c4f7d0
Showing 1 changed file with 14 additions and 3 deletions.
17 changes: 14 additions & 3 deletions news_corpus_builder/news_corpus_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ def __init__(self,corpus_dir,datastore_type='file',db_name='corpus.db'):
db_name (Optional[str]): Name of database if 'sqlite' is selected.
'''

self.g = Goose({'browser_user_agent': 'Mozilla'})
self.g = Goose({'browser_user_agent': 'Mozilla','parser_class':'soup'})
#self.g = Goose({'browser_user_agent': 'Mozilla'})
self.corpus_dir = corpus_dir
self.datastore_type = datastore_type
self.db_name = db_name
Expand Down Expand Up @@ -77,9 +78,20 @@ def generate_corpus(self,articles):
for article in articles:
category = article[0]
link = article[1]
ex_article = self.g.extract(url=link)

ex_article = None

try:
ex_article = self.g.extract(url=link)
except Exception:
print('failed to extract article..')
continue

ex_title = ex_article.title
ex_body = ex_article.cleaned_text
#print ex_title
#print ex_body
#sys.exit(1)

if ex_body == '':
self.stats['empty_body_articles'] += 1
Expand All @@ -88,7 +100,6 @@ def generate_corpus(self,articles):
self._save_article({'title':ex_title, 'body': ex_body,
'category':category})


def _save_article(self,clean_article):

print "Saving article %s..." %(clean_article['title'])
Expand Down

0 comments on commit 3c4f7d0

Please sign in to comment.