Re fictive-kin#105 Add spiders for Smells Like Home
Mike Firesheets committed Aug 14, 2013
1 parent c47644d commit b529a8e
Showing 2 changed files with 99 additions and 0 deletions.
22 changes: 22 additions & 0 deletions scrapy_proj/openrecipes/spiders/smellslikehome_feedspider.py
@@ -0,0 +1,22 @@
from scrapy.spider import BaseSpider
from scrapy.http import Request
from scrapy.selector import XmlXPathSelector
from openrecipes.spiders.smellslikehome_spider import SmellslikehomeMixin


class SmellslikehomefeedSpider(BaseSpider, SmellslikehomeMixin):
name = "smellslikehome.feed"
allowed_domains = [
"www.smells-like-home.com",
"feeds.feedburner.com",
"feedproxy.google.com",
]
start_urls = [
"http://www.smells-like-home.com/feed/",
]

    def parse(self, response):
        # Pull every <item> link out of the RSS feed and hand each recipe
        # page off to parse_item, which SmellslikehomeMixin provides.
        xxs = XmlXPathSelector(response)
        links = xxs.select("//item/*[local-name()='link']/text()").extract()

        return [Request(x, callback=self.parse_item) for x in links]
77 changes: 77 additions & 0 deletions scrapy_proj/openrecipes/spiders/smellslikehome_spider.py
@@ -0,0 +1,77 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from openrecipes.items import RecipeItem, RecipeItemLoader


class SmellslikehomeMixin(object):
source = 'smellslikehome'

    def parse_item(self, response):
        # Scrape a single recipe page into RecipeItem objects; shared by
        # the feed spider and by the archive crawl spider defined below.
        hxs = HtmlXPathSelector(response)

# Older recipe posts have a different, much looser format
base_path = '//div[@itemtype="http://schema.org/Recipe"] | //div[@class="post-content"]'

recipes_scopes = hxs.select(base_path)

        # XPath expressions for each recipe field; where older posts lack
        # schema.org microdata, a looser fallback selector is OR'd in.
        name_path = '//div[@itemprop="name"]/text() | //p/span/strong/text()'
description_path = '//meta[@name="description"]/@content'
image_path = '//img[1]/@src'
prepTime_path = '//span[@itemprop="prepTime"][contains(@datetime, "PT")]/text()'
cookTime_path = '//span[@itemprop="cookTime"][contains(@datetime, "PT")]/text()'
totalTime_path = '//span[@itemprop="totalTime"][contains(@content, "PT")]/text()'
recipeYield_path = '//span[@itemprop="recipeYield"]/text()'
ingredients_path = '//li[@itemprop="ingredients"]/text() | //ul/li/text()'
datePublished = '//div[contains(concat(" ", @class, " "), " post-date ")]/text()'

recipes = []

for r_scope in recipes_scopes:
il = RecipeItemLoader(item=RecipeItem())

il.add_value('source', self.source)

il.add_value('name', r_scope.select(name_path).extract())
il.add_value('image', r_scope.select(image_path).extract())
il.add_value('url', response.url)
il.add_value('description', r_scope.select(description_path).extract())

il.add_value('prepTime', r_scope.select(prepTime_path).extract())
il.add_value('cookTime', r_scope.select(cookTime_path).extract())
il.add_value('totalTime', r_scope.select(totalTime_path).extract())
il.add_value('recipeYield', r_scope.select(recipeYield_path).extract())

il.add_value('ingredients', r_scope.select(ingredients_path).extract())

il.add_value('datePublished', r_scope.select(datePublished).extract())

recipes.append(il.load_item())

return recipes


class SmellslikehomecrawlSpider(CrawlSpider, SmellslikehomeMixin):

name = "smellslikehome.com"

allowed_domains = ["www.smells-like-home.com"]

start_urls = [
"http://www.smells-like-home.com/full-archives/",
]

rules = (
Rule(SgmlLinkExtractor(allow=('/20[01][0-9]/'))),
#Rule(SgmlLinkExtractor(allow=('/2007/'))),
#Rule(SgmlLinkExtractor(allow=('/2008/'))),
#Rule(SgmlLinkExtractor(allow=('/2009/'))),
#Rule(SgmlLinkExtractor(allow=('/2010/'))),
#Rule(SgmlLinkExtractor(allow=('/2011/'))),
#Rule(SgmlLinkExtractor(allow=('/2012/'))),
#Rule(SgmlLinkExtractor(allow=('/2013/'))),

Rule(SgmlLinkExtractor(allow=('/\d{4}/\d{2}/.+')),
callback='parse_item'),
)
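
For local testing, the two spiders can be run by name with the Scrapy CLI. A minimal sketch, assuming the usual openrecipes setup and that the commands are issued from the directory containing the project's scrapy.cfg:

    scrapy crawl smellslikehome.feed   # recent recipes via the RSS feed
    scrapy crawl smellslikehome.com    # full crawl of the archive pages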
