
Standardizing all actions #26

Merged 11 commits on May 29, 2024
.github/workflows/python-test.yml (4 changes: 3 additions & 1 deletion)

@@ -8,10 +8,12 @@ jobs:
    strategy:
      matrix:
        include:
-         - python-version: "3.7.x" # Min Python version (No 3.6 version)
+         - python-version: "3.7.x" # Min Python version (No 3.6 version in GitHub repository)
          - python-version: "3.8.x"
          - python-version: "3.9.x"
          - python-version: "3.10.x"
+         - python-version: "3.11.x"
+         - python-version: "3.12.x"
          - python-version: "3.x" # Last Python version
    steps:
      - uses: actions/checkout@v3
README.md (2 changes: 1 addition & 1 deletion)

@@ -46,7 +46,7 @@ class MySpider(scrapy.Spider):
## Advanced usage

`PuppeteerRequest`'s first argument is a browser action.
-Avalable actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`.
+Available actions are defined in `scrapypuppeteer.actions` module as subclasses of `PuppeteerServiceAction`.
Passing a URL into request is a shortcut for `GoTo(url)` action.
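
As a minimal sketch of the two equivalent request forms just described (not taken from this diff; the import paths are assumptions based on the example spiders later in this PR):

    import scrapy

    from scrapypuppeteer import PuppeteerRequest  # import path assumed from the examples
    from scrapypuppeteer.actions import GoTo


    class ExampleSpider(scrapy.Spider):
        name = "example"

        def start_requests(self):
            # Shortcut form: a plain URL is wrapped into a GoTo action
            yield PuppeteerRequest("https://example.com", callback=self.parse)
            # Explicit form, equivalent to the shortcut above
            # (a real spider would use only one of these two requests)
            yield PuppeteerRequest(GoTo("https://example.com"), callback=self.parse)

        def parse(self, response, **kwargs):
            yield {"url": response.url}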

Here is the list of available actions:
examples/settings.py (10 changes: 5 additions & 5 deletions)

@@ -1,12 +1,12 @@
-BOT_NAME = 'scrapypuppeteer'
+BOT_NAME = "scrapypuppeteer"

-SPIDER_MODULES = ['examples.spiders']
-NEWSPIDER_MODULE = 'examples.spiders'
+SPIDER_MODULES = ["examples.spiders"]
+NEWSPIDER_MODULE = "examples.spiders"

CONCURRENT_REQUESTS = 1

DOWNLOADER_MIDDLEWARES = {
-    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+    "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042
}

-PUPPETEER_SERVICE_URL = 'http://localhost:3000'
+PUPPETEER_SERVICE_URL = "http://localhost:3000"
examples/spiders/auto_recaptcha.py (47 changes: 27 additions & 20 deletions)

@@ -14,37 +14,44 @@ class AutoRecaptchaSpider(scrapy.Spider):
start_urls = ["https://www.google.com/recaptcha/api2/demo"]

custom_settings = {
-        'DOWNLOADER_MIDDLEWARES': {
-            'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1041,
-            'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042
+        "DOWNLOADER_MIDDLEWARES": {
+            "scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware": 1041,
+            "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042,
        },
-        'PUPPETEER_INCLUDE_META': True,
-
-        'RECAPTCHA_ACTIVATION': True,
-        'RECAPTCHA_SOLVING': True,
-        'RECAPTCHA_SUBMIT_SELECTORS': {
-            'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit',
-        }
+        "PUPPETEER_INCLUDE_META": True,
+        "RECAPTCHA_ACTIVATION": True,
+        "RECAPTCHA_SOLVING": True,
+        "RECAPTCHA_SUBMIT_SELECTORS": {
+            "www.google.com/recaptcha/api2/demo": "#recaptcha-demo-submit",
+        },
    }

def start_requests(self):
for url in self.start_urls:
action = GoTo(url=url)
-            yield PuppeteerRequest(action=action, callback=self.parse_html, errback=self.error, close_page=False)
+            yield PuppeteerRequest(
+                action=action,
+                callback=self.parse_html,
+                errback=self.error,
+                close_page=False,
+            )

def parse_html(self, response: PuppeteerResponse, **kwargs):
with open(f"recaptcha_page.html", 'wb') as f:
with open(f"recaptcha_page.html", "wb") as f:
f.write(response.body)
-        action = Screenshot(options={
-            'full_page': True,
-        })
-        yield response.follow(action,
-                              callback=self.make_screenshot,
-                              errback=self.error,
-                              close_page=True)
+        action = Screenshot(
+            options={
+                "full_page": True,
+            }
+        )
+        yield response.follow(
+            action, callback=self.make_screenshot, errback=self.error, close_page=True
+        )

def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs):
-        data = response.screenshot  # Note that data is string containing bytes, don't forget to decode them!
+        data = (
+            response.screenshot
+        )  # Note that data is string containing bytes, don't forget to decode them!
with open("imageToSave.png", "wb") as fh:
fh.write(base64.b64decode(data))

examples/spiders/manual_recaptcha.py (44 changes: 31 additions & 13 deletions)

@@ -16,29 +16,47 @@ class ManualRecaptchaSpider(scrapy.Spider):
def start_requests(self):
for url in self.start_urls:
action = GoTo(url=url)
-            yield PuppeteerRequest(action=action, callback=self.solve_recaptcha, errback=self.error, close_page=False)
+            yield PuppeteerRequest(
+                action=action,
+                callback=self.solve_recaptcha,
+                errback=self.error,
+                close_page=False,
+            )

def solve_recaptcha(self, response: PuppeteerResponse, **kwargs):
action = RecaptchaSolver()
-        yield response.follow(action=action, callback=self.submit_recaptcha, errback=self.error, close_page=False)
+        yield response.follow(
+            action=action,
+            callback=self.submit_recaptcha,
+            errback=self.error,
+            close_page=False,
+        )

def submit_recaptcha(self, response, **kwargs):
-        action = Click('#recaptcha-demo-submit')
-        yield response.follow(action=action, callback=self.parse_html, errback=self.error, close_page=False)
+        action = Click("#recaptcha-demo-submit")
+        yield response.follow(
+            action=action,
+            callback=self.parse_html,
+            errback=self.error,
+            close_page=False,
+        )

def parse_html(self, response: PuppeteerResponse, **kwargs):
with open(f"recaptcha_page.html", 'wb') as f:
with open(f"recaptcha_page.html", "wb") as f:
f.write(response.body)
-        action = Screenshot(options={
-            'full_page': True,
-        })
-        yield response.follow(action,
-                              callback=self.make_screenshot,
-                              errback=self.error,
-                              close_page=True)
+        action = Screenshot(
+            options={
+                "full_page": True,
+            }
+        )
+        yield response.follow(
+            action, callback=self.make_screenshot, errback=self.error, close_page=True
+        )

def make_screenshot(self, response: PuppeteerScreenshotResponse, **kwargs):
-        data = response.screenshot  # Note that data is string containing bytes, don't forget to decode them!
+        data = (
+            response.screenshot
+        )  # Note that data is string containing bytes, don't forget to decode them!
with open("imageToSave.png", "wb") as fh:
fh.write(base64.b64decode(data))

examples/spiders/meduza.py (12 changes: 6 additions & 6 deletions)

@@ -4,18 +4,18 @@


class MeduzaSpider(scrapy.Spider):
-    name = 'meduza'
+    name = "meduza"

def start_requests(self):
-        yield PuppeteerRequest('https://meduza.io', callback=self.parse_main_page)
+        yield PuppeteerRequest("https://meduza.io", callback=self.parse_main_page)

def parse_main_page(self, response: PuppeteerHtmlResponse):
-        for article_url in response.css('a.Link-isInBlockTitle::attr(href)').getall():
+        for article_url in response.css("a.Link-isInBlockTitle::attr(href)").getall():
yield response.follow(article_url, callback=self.parse_article)

def parse_article(self, response: PuppeteerHtmlResponse):
yield {
-            'url': response.url,
-            'title': response.css('h1::text').get(),
-            'text': '\n'.join(response.css('p.SimpleBlock-p::text').getall())
+            "url": response.url,
+            "title": response.css("h1::text").get(),
+            "text": "\n".join(response.css("p.SimpleBlock-p::text").getall()),
}
examples/spiders/webscraperio.py (110 changes: 61 additions & 49 deletions)

@@ -8,103 +8,115 @@ class EcommerceSiteSpider(scrapy.Spider):

@staticmethod
def extract_items(list_page_response):
-        for item_selector in list_page_response.css('div.row div.thumbnail'):
+        for item_selector in list_page_response.css("div.row div.thumbnail"):
yield {
-                'link': item_selector.css('a.title::attr(href)').get(),
-                'title': item_selector.css('a.title::attr(title)').get(),
-                'price': item_selector.css('h4.price::text').get(),
-                'description': item_selector.css('p.description::text').get(),
-                'rating': len(item_selector.css('span.glyphicon-star')),
-                'reviews_count': int(item_selector
-                                     .css('.ratings p.pull-right::text')
-                                     .re_first(r'\d+'))
+                "link": item_selector.css("a.title::attr(href)").get(),
+                "title": item_selector.css("a.title::attr(title)").get(),
+                "price": item_selector.css("h4.price::text").get(),
+                "description": item_selector.css("p.description::text").get(),
+                "rating": len(item_selector.css("span.glyphicon-star")),
+                "reviews_count": int(
+                    item_selector.css(".ratings p.pull-right::text").re_first(r"\d+")
+                ),
}

@staticmethod
def extract_item(detail_page_response):
yield {
-            'link': detail_page_response.url,
-            'title': detail_page_response.css('h4.price + h4::text').get(),
-            'price': detail_page_response.css('h4.price::text').get(),
-            'description': detail_page_response.css('p.description::text').get(),
-            'rating': len(detail_page_response.css('span.glyphicon-star')),
-            'reviews_count': int(detail_page_response
-                                 .css('.ratings::text')
-                                 .re_first('\d+'))
+            "link": detail_page_response.url,
+            "title": detail_page_response.css("h4.price + h4::text").get(),
+            "price": detail_page_response.css("h4.price::text").get(),
+            "description": detail_page_response.css("p.description::text").get(),
+            "rating": len(detail_page_response.css("span.glyphicon-star")),
+            "reviews_count": int(
+                detail_page_response.css(".ratings::text").re_first("\d+")
+            ),
}


class AjaxPaginationSpider(EcommerceSiteSpider):
-    name = 'e-commerce-ajax'
+    name = "e-commerce-ajax"

def __init__(self, **kwargs):
super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/ajax/computers/laptops"
+        )
self.next_page_ix = 1

def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url), close_page=False, callback=self.process_list_page
+        )

def process_list_page(self, response):
yield from self.extract_items(response)
self.next_page_ix += 1
next_page_selector = f'button[data-id="{self.next_page_ix}"]'
if response.css(next_page_selector):
-            yield response.follow(Click(next_page_selector,
-                                        wait_options={'selectorOrTimeout': 3000}),
-                                  close_page=False,
-                                  callback=self.process_list_page)
+            yield response.follow(
+                Click(next_page_selector, wait_options={"selectorOrTimeout": 3000}),
+                close_page=False,
+                callback=self.process_list_page,
+            )


class MoreSpider(EcommerceSiteSpider):
-    name = 'e-commerce-more'
+    name = "e-commerce-more"

def __init__(self, **kwargs):
super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/more/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/more/computers/laptops"
+        )
self.seen_item_links = set()

def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url, wait_options={'selectorOrTimeout': 10000}),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url, wait_options={"selectorOrTimeout": 10000}),
+            close_page=False,
+            callback=self.process_list_page,
+        )

def process_list_page(self, response):
for item in self.extract_items(response):
-            if item['link'] not in self.seen_item_links:
-                self.seen_item_links.add(item['link'])
+            if item["link"] not in self.seen_item_links:
+                self.seen_item_links.add(item["link"])
yield item
-        more_selector = '.ecomerce-items-scroll-more'
+        more_selector = ".ecomerce-items-scroll-more"
more_button = response.css(more_selector)
-        if 'style' not in more_button.attrib:
-            yield response.follow(Click(more_selector,
-                                        wait_options={'selectorOrTimeout': 1000}),
-                                  close_page=False,
-                                  callback=self.process_list_page)
+        if "style" not in more_button.attrib:
+            yield response.follow(
+                Click(more_selector, wait_options={"selectorOrTimeout": 1000}),
+                close_page=False,
+                callback=self.process_list_page,
+            )


class ScrollSpider(EcommerceSiteSpider):
-    name = 'e-commerce-scroll'
+    name = "e-commerce-scroll"

def __init__(self, **kwargs):
super().__init__(**kwargs)
-        self.start_url = 'https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops'
+        self.start_url = (
+            "https://webscraper.io/test-sites/e-commerce/scroll/computers/laptops"
+        )
self.seen_item_links = set()

def start_requests(self):
-        yield PuppeteerRequest(GoTo(self.start_url),
-                               close_page=False,
-                               callback=self.process_list_page)
+        yield PuppeteerRequest(
+            GoTo(self.start_url), close_page=False, callback=self.process_list_page
+        )

def process_list_page(self, response):
items = self.extract_items(response)
-        new_items = [i for i in items if i['link'] not in self.seen_item_links]
+        new_items = [i for i in items if i["link"] not in self.seen_item_links]
if new_items:
for item in new_items:
-            self.seen_item_links.add(item['link'])
+            self.seen_item_links.add(item["link"])
yield item
-        yield response.follow(Scroll(wait_options={'selectorOrTimeout': 1000}),
-                              close_page=False,
-                              callback=self.process_list_page)
+        yield response.follow(
+            Scroll(wait_options={"selectorOrTimeout": 1000}),
+            close_page=False,
+            callback=self.process_list_page,
+        )