From 3c4f7d0e386051e9370b3ecc83f9615be8a6b489 Mon Sep 17 00:00:00 2001 From: Skillachie Date: Wed, 30 Sep 2015 17:11:29 -0400 Subject: [PATCH] continue to next article if not able to extract text --- news_corpus_builder/news_corpus_generator.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/news_corpus_builder/news_corpus_generator.py b/news_corpus_builder/news_corpus_generator.py index fc8e38f..60be3ab 100644 --- a/news_corpus_builder/news_corpus_generator.py +++ b/news_corpus_builder/news_corpus_generator.py @@ -25,7 +25,8 @@ def __init__(self,corpus_dir,datastore_type='file',db_name='corpus.db'): db_name (Optional[str]): Name of database if 'sqlite' is selected. ''' - self.g = Goose({'browser_user_agent': 'Mozilla'}) + self.g = Goose({'browser_user_agent': 'Mozilla','parser_class':'soup'}) + #self.g = Goose({'browser_user_agent': 'Mozilla'}) self.corpus_dir = corpus_dir self.datastore_type = datastore_type self.db_name = db_name @@ -77,9 +78,20 @@ def generate_corpus(self,articles): for article in articles: category = article[0] link = article[1] - ex_article = self.g.extract(url=link) + + ex_article = None + + try: + ex_article = self.g.extract(url=link) + except Exception: + print('failed to extract article..') + continue + ex_title = ex_article.title ex_body = ex_article.cleaned_text + #print ex_title + #print ex_body + #sys.exit(1) if ex_body == '': self.stats['empty_body_articles'] += 1 @@ -88,7 +100,6 @@ def generate_corpus(self,articles): self._save_article({'title':ex_title, 'body': ex_body, 'category':category}) - def _save_article(self,clean_article): print "Saving article %s..." %(clean_article['title'])