From 3c4f7d0e386051e9370b3ecc83f9615be8a6b489 Mon Sep 17 00:00:00 2001
From: Skillachie <dwaynecampbell13@gmail.com>
Date: Wed, 30 Sep 2015 17:11:29 -0400
Subject: [PATCH] Skip to the next article when text extraction fails

---
 news_corpus_builder/news_corpus_generator.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/news_corpus_builder/news_corpus_generator.py b/news_corpus_builder/news_corpus_generator.py
index fc8e38f..60be3ab 100644
--- a/news_corpus_builder/news_corpus_generator.py
+++ b/news_corpus_builder/news_corpus_generator.py
@@ -25,7 +25,8 @@ def __init__(self,corpus_dir,datastore_type='file',db_name='corpus.db'):
             db_name (Optional[str]): Name of database if 'sqlite' is selected.
         '''
 
-        self.g = Goose({'browser_user_agent': 'Mozilla'})
+        self.g = Goose({'browser_user_agent': 'Mozilla','parser_class':'soup'})
+        #self.g = Goose({'browser_user_agent': 'Mozilla'})
         self.corpus_dir = corpus_dir
         self.datastore_type = datastore_type
         self.db_name = db_name
@@ -77,9 +78,20 @@ def generate_corpus(self,articles):
         for article in articles:
             category = article[0]
             link = article[1]
-            ex_article = self.g.extract(url=link)
+
+	    ex_article = None
+
+	    try:
+            	ex_article = self.g.extract(url=link)
+	    except Exception:
+		print('failed to extract article..')
+		continue
+
             ex_title = ex_article.title
             ex_body = ex_article.cleaned_text
+	    #print ex_title
+            #print ex_body
+            #sys.exit(1)
 
             if ex_body == '':
                 self.stats['empty_body_articles'] += 1
@@ -88,7 +100,6 @@ def generate_corpus(self,articles):
             self._save_article({'title':ex_title, 'body': ex_body,
                 'category':category})
 
-
     def _save_article(self,clean_article):
 
         print "Saving article %s..." %(clean_article['title'])