Skip to content

Commit

Permalink
Added spider for Williams Sonoma - issue fictive-kin#95
Browse files Browse the repository at this point in the history
  • Loading branch information
mickaobrien committed May 3, 2013
1 parent 38c2950 commit dd71587
Showing 1 changed file with 63 additions and 0 deletions.
63 changes: 63 additions & 0 deletions scrapy_proj/openrecipes/spiders/williamssonoma_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from openrecipes.items import RecipeItem, RecipeItemLoader


class WilliamsSonomaMixin(object):
    """Parsing logic for recipe pages on williams-sonoma.com.

    The extraction code lives in a mixin so that any spider class can
    reuse ``parse_item`` by inheriting from it.
    """

    # Value written to the ``source`` field of every item produced here.
    source = 'williamssonoma'

    # Document-wide search for hRecipe root elements.  The concat/
    # normalize-space idiom matches ``hrecipe`` as a whole word inside a
    # multi-valued ``class`` attribute.
    _RECIPE_SCOPE_XPATH = (
        "//*[contains(concat(' ', normalize-space(@class), ' '), "
        "' hrecipe ')]"
    )

    # Per-recipe field lookups.  They start with ``.//`` on purpose: a
    # leading ``//`` is absolute and would search the whole document even
    # when evaluated on a recipe sub-selector, mixing the fields of every
    # recipe on the page into each item.
    _NAME_XPATH = './/*[@class="fn"]/text()'
    _DESCRIPTION_XPATH = './/*[@class="recipe-description summary"]/p/text()'
    _IMAGE_XPATH = './/img[@class="photo"]/@src'
    _YIELD_XPATH = './/*[@class="directions"]/p/text()'
    _INGREDIENTS_XPATH = './/*[@class="ingredient"]/text()'

    # The yield appears as a sentence such as "Serves 4." inside the
    # directions text.  ``\d+`` accepts multi-digit counts ("Serves 12.").
    _YIELD_PATTERN = r'Serves \d+\.'

    def parse_item(self, response):
        """Extract every hRecipe block of ``response`` into a RecipeItem.

        Args:
            response: the downloaded page handed over by the spider.

        Returns:
            A list with one loaded item per hRecipe element found; the
            list is empty when the page contains none.
        """
        hxs = HtmlXPathSelector(response)

        recipes = []
        for r_scope in hxs.select(self._RECIPE_SCOPE_XPATH):
            il = RecipeItemLoader(item=RecipeItem())

            il.add_value('source', self.source)
            il.add_value('url', response.url)

            il.add_value('name',
                         r_scope.select(self._NAME_XPATH).extract())
            il.add_value('image',
                         r_scope.select(self._IMAGE_XPATH).extract())
            il.add_value('description',
                         r_scope.select(self._DESCRIPTION_XPATH).extract())
            il.add_value('recipeYield',
                         r_scope.select(self._YIELD_XPATH).re(
                             self._YIELD_PATTERN))
            il.add_value('ingredients',
                         r_scope.select(self._INGREDIENTS_XPATH).extract())

            recipes.append(il.load_item())

        return recipes


class WilliamsSonomacrawlSpider(CrawlSpider, WilliamsSonomaMixin):
    """Crawl spider for the recipe section of www.williams-sonoma.com.

    Crawling starts at the ``/recipe/`` index page.  Links are then
    classified by the two rules below; page parsing itself is inherited
    from ``WilliamsSonomaMixin.parse_item``.
    """

    name = "www.williams-sonoma.com"

    allowed_domains = ["www.williams-sonoma.com"]

    start_urls = [
        "http://www.williams-sonoma.com/recipe/",
    ]

    # Patterns are raw strings so that ``\w``, ``\-`` and ``\.`` reach the
    # regex engine untouched instead of being treated as (invalid) Python
    # string escapes.
    rules = (
        # Individual recipe pages (``/recipe/<slug>.html``) are parsed.
        Rule(SgmlLinkExtractor(allow=r'/recipe/[\w\-]+\.html'),
             callback='parse_item'),
        # Any other link under ``/recipe/`` has no callback attached.
        # NOTE(review): per the Scrapy docs a Rule without a callback
        # follows links by default - confirm for the Scrapy version in use.
        Rule(SgmlLinkExtractor(allow=r'/recipe/.+')),
    )

0 comments on commit dd71587

Please sign in to comment.