Re fictive-kin#105 Add spiders for Smells Like Home

mikefiresheets · Aug 14, 2013 · b529a8e · b529a8e
1 parent c47644d
commit b529a8e
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 0 deletions.
diff --git a/scrapy_proj/openrecipes/spiders/smellslikehome_feedspider.py b/scrapy_proj/openrecipes/spiders/smellslikehome_feedspider.py
@@ -0,0 +1,22 @@
+from scrapy.spider import BaseSpider
+from scrapy.http import Request
+from scrapy.selector import XmlXPathSelector
+from openrecipes.spiders.smellslikehome_spider import SmellslikehomeMixin
+
+
+class SmellslikehomefeedSpider(BaseSpider, SmellslikehomeMixin):
+    """Spider that reads the Smells Like Home feed and scrapes each linked post."""
+
+    name = "smellslikehome.feed"
+    allowed_domains = [
+        "www.smells-like-home.com",
+        "feeds.feedburner.com",
+        "feedproxy.google.com",
+    ]
+    start_urls = ["http://www.smells-like-home.com/feed/"]
+
+    def parse(self, response):
+        """Return one Request per feed <item> link, to be handled by parse_item."""
+        feed = XmlXPathSelector(response)
+        item_links = feed.select("//item/*[local-name()='link']/text()").extract()
+        return [Request(url, callback=self.parse_item) for url in item_links]
diff --git a/scrapy_proj/openrecipes/spiders/smellslikehome_spider.py b/scrapy_proj/openrecipes/spiders/smellslikehome_spider.py
@@ -0,0 +1,77 @@
+from scrapy.contrib.spiders import CrawlSpider, Rule
+from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
+from scrapy.selector import HtmlXPathSelector
+from openrecipes.items import RecipeItem, RecipeItemLoader
+
+
+class SmellslikehomeMixin(object):
+    """Recipe-page parsing shared by the Smells Like Home spiders."""
+
+    source = 'smellslikehome'
+
+    def parse_item(self, response):
+        """Return a list with one loaded RecipeItem per recipe container.
+
+        ``source`` and ``url`` are taken from the spider and the response;
+        every other field is extracted with the XPath table below.
+
+        NOTE(review): every field XPath below starts with ``//``, so it is
+        evaluated against the whole document, not relative to the container
+        it is selected from -- confirm this is intended.
+        """
+        page = HtmlXPathSelector(response)
+
+        # Older recipe posts have a different, much looser format, hence the
+        # fallback to the plain post-content div.
+        container_path = '//div[@itemtype="http://schema.org/Recipe"] | //div[@class="post-content"]'
+        containers = page.select(container_path)
+
+        # (item field, XPath) pairs fed to the loader for each container.
+        field_paths = (
+            ('name', '//div[@itemprop="name"]/text() | //p/span/strong/text()'),
+            ('image', '//img[1]/@src'),
+            ('description', '//meta[@name="description"]/@content'),
+            ('prepTime', '//span[@itemprop="prepTime"][contains(@datetime, "PT")]/text()'),
+            ('cookTime', '//span[@itemprop="cookTime"][contains(@datetime, "PT")]/text()'),
+            ('totalTime', '//span[@itemprop="totalTime"][contains(@content, "PT")]/text()'),
+            ('recipeYield', '//span[@itemprop="recipeYield"]/text()'),
+            ('ingredients', '//li[@itemprop="ingredients"]/text() | //ul/li/text()'),
+            ('datePublished', '//div[contains(concat(" ", @class, " "), " post-date ")]/text()'),
+        )
+
+        recipes = []
+        for container in containers:
+            loader = RecipeItemLoader(item=RecipeItem())
+            # Literal fields first, then everything scraped via XPath.
+            loader.add_value('source', self.source)
+            loader.add_value('url', response.url)
+            for field, path in field_paths:
+                loader.add_value(field, container.select(path).extract())
+            recipes.append(loader.load_item())
+
+        return recipes
+
+
+class SmellslikehomecrawlSpider(CrawlSpider, SmellslikehomeMixin):
+    """Crawl the smells-like-home.com archives and scrape every recipe post.
+
+    Starts at the full-archives page, follows archive links whose URL has a
+    /20xx/ year segment, and sends post pages to the mixin's ``parse_item``.
+    """
+
+    name = "smellslikehome.com"
+    allowed_domains = ["www.smells-like-home.com"]
+    start_urls = [
+        "http://www.smells-like-home.com/full-archives/",
+    ]
+
+    # Order matters: CrawlSpider hands a link to the FIRST rule that matches
+    # it.  Post URLs (/YYYY/MM/slug) also contain "/20xx/", so the post rule
+    # must precede the broader archive rule or parse_item is never called.
+    rules = (
+        # Individual posts: scrape them.
+        Rule(SgmlLinkExtractor(allow=r'/\d{4}/\d{2}/.+'),
+             callback='parse_item'),
+        # Archive pages for the years 2000-2019: follow only, nothing to scrape.
+        Rule(SgmlLinkExtractor(allow=r'/20[01][0-9]/')),
+    )