Merge pull request #6 from mazen-r/main
Add leboncoin.com scraper
Showing 8 changed files with 14,647 additions and 0 deletions.
**GitHub Actions workflow** (new file, +49 lines), running both test targets on manual dispatch and on a weekly cron:

```yaml
name: Leboncoin.com Test
on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * THU'

env:
  PROJECT_DIR: leboncoin-scraper

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        test: [test_search_scraping, test_ad_scraping]

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -

      - name: Cache Poetry virtual environment
        uses: actions/cache@v2
        id: cache
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/${{ env.PROJECT_DIR }}/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-poetry-

      - name: Install dependencies
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry install

      - name: Run test
        env:
          SCRAPFLY_KEY: ${{ secrets.SCRAPFLY_KEY }}
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry run pytest test.py -k ${{ matrix.test }}
```
**README.md** (new file, +46 lines):

# leboncoin.com Scraper

This scraper uses [scrapfly.io](https://scrapfly.io/) and Python to scrape ad data from leboncoin.com.

Full tutorial

The scraping code is located in the `leboncoin.py` file. It's fully documented and simplified for educational purposes, and the example scraper run code can be found in the `run.py` file.

This scraper scrapes:
- Leboncoin search pages, for finding ad listings
- Leboncoin ad pages, for ad data

For output examples see the `./results` directory.

## Fair Use Disclaimer

Note that this code is provided free of charge, as is, and Scrapfly does __not__ provide free web scraping support or consultation. For any bugs, see the issue tracker.

## Setup and Use

This Leboncoin scraper uses __Python 3.10__ with the [scrapfly-sdk](https://pypi.org/project/scrapfly-sdk/) package, which is used to scrape and parse Leboncoin's data.

0. Ensure you have __Python 3.10__ and the [poetry Python package manager](https://python-poetry.org/docs/#installation) on your system.
1. Retrieve your Scrapfly API key from <https://scrapfly.io/dashboard> and set the `SCRAPFLY_KEY` environment variable:
   ```shell
   $ export SCRAPFLY_KEY="YOUR SCRAPFLY KEY"
   ```
2. Clone the repository and install the Python environment:
   ```shell
   $ git clone https://github.com/scrapfly/scrapfly-scrapers.git
   $ cd scrapfly-scrapers/leboncoin-scraper
   $ poetry install
   ```
3. Run the example scrape:
   ```shell
   $ poetry run python run.py
   ```
4. Run the tests:
   ```shell
   $ poetry install --with dev
   $ poetry run pytest test.py
   # or specific scraping areas
   $ poetry run pytest test.py -k test_search_scraping
   $ poetry run pytest test.py -k test_ad_scraping
   ```
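The commit also adds `run.py`, which is not shown in this diff. As a rough illustration only, a runner in this style could drive the `scrape_search` and `scrape_ad` functions from `leboncoin.py` (shown below) like this; the search and ad URLs are placeholders, and the output file names are assumptions rather than what the actual `run.py` writes:

```python
# run.py (sketch): not part of the shown diff; URLs and file names are illustrative
import asyncio
import json
from pathlib import Path

import leboncoin

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


async def run():
    # scrape the first two pages of an example search query
    search_data = await leboncoin.scrape_search(
        url="https://www.leboncoin.fr/recherche?text=iphone",  # placeholder query
        scrape_all_pages=False,
        max_pages=2,
    )
    output.joinpath("search.json").write_text(json.dumps(search_data, indent=2, ensure_ascii=False))

    # scrape a single ad page (placeholder URL)
    ad_data = await leboncoin.scrape_ad(url="https://www.leboncoin.fr/ad/ventes_immobilieres/example")
    output.joinpath("ad.json").write_text(json.dumps(ad_data, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(run())
```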
**leboncoin.py** (new file, +81 lines):

```python
"""
This is an example web scraper for leboncoin.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from pathlib import Path
from typing import Dict, List

from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
    # enable anti scraping protection bypass
    "asp": True,
    # set the proxy location to France
    "country": "fr",
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_search(result: ScrapeApiResponse) -> List[Dict]:
    """parse search result data from the nextjs cache"""
    # select the __NEXT_DATA__ script from the HTML
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the ad listing data from the search page
    ads_data = json.loads(next_data)["props"]["pageProps"]["initialProps"]["searchData"]["ads"]
    return ads_data


def _max_search_pages(result: ScrapeApiResponse) -> int:
    """get the maximum number of pages in the search"""
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the total number of pages
    max_search_pages = json.loads(next_data)["props"]["pageProps"]["initialProps"]["searchData"]["max_pages"]
    return max_search_pages


def parse_ad(result: ScrapeApiResponse) -> Dict:
    """parse ad data from the nextjs cache"""
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the ad data from the ad page
    ad_data = json.loads(next_data)["props"]["pageProps"]["ad"]
    return ad_data


async def scrape_search(url: str, scrape_all_pages: bool, max_pages: int = 10) -> List[Dict]:
    """scrape leboncoin search pages"""
    log.info("scraping search {}", url)
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    search_data = parse_search(first_page)
    total_search_pages = _max_search_pages(first_page)
    # scrape a specific number of search pages
    if not scrape_all_pages and max_pages < total_search_pages:
        total_pages = max_pages
    # scrape all available pages if scrape_all_pages=True or max_pages exceeds the total
    else:
        total_pages = total_search_pages
    log.info("scraping search {} pagination ({} more pages)", url, total_pages - 1)
    # add the remaining pages to a scraping list
    _other_pages = [
        ScrapeConfig(f"{first_page.context['url']}&page={page}", **BASE_CONFIG)
        for page in range(2, total_pages + 1)
    ]
    # scrape the remaining pages concurrently
    async for result in SCRAPFLY.concurrent_scrape(_other_pages):
        ads_data = parse_search(result)
        search_data.extend(ads_data)
    log.info("scraped {} ads from {}", len(search_data), url)
    return search_data


async def scrape_ad(url: str) -> Dict:
    """scrape a single ad page"""
    log.info("scraping ad {}", url)
    result = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    ad_data = parse_ad(result)
    return ad_data
```
**pyproject.toml** (new file, +31 lines):

```toml
[tool.poetry]
name = "scrapfly-leboncoin"
version = "0.1.0"
description = "demo web scraper for leboncoin.com using Scrapfly"
authors = ["Mazen Ramadan <mazen@scrapfly.io>"]
license = "NPOS-3.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
nested-lookup = "^0.2.25"
loguru = "^0.7.1"

[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
pytest = "^7.3.1"
cerberus = "^1.3.4"
asyncio = "^3.4.3"
pytest-asyncio = "^0.21.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
python_files = "test.py"

[tool.black]
line-length = 120
target-version = ['py37', 'py38', 'py39', 'py310', 'py311']
```
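The commit's `test.py` is likewise not shown in this diff. Judging from the dev dependencies above (pytest, pytest-asyncio, cerberus) and the test names the workflow runs, one of the tests could be sketched roughly as follows; the cerberus schema fields and the ad URL are assumptions, not the actual values:

```python
# test.py (sketch): not part of the shown diff; schema fields and URL are assumed examples
import pytest
from cerberus import Validator

import leboncoin

# assumed minimal schema; the real test likely validates many more fields
ad_schema = {
    "list_id": {"type": "integer"},
    "subject": {"type": "string"},
    "url": {"type": "string"},
}


@pytest.mark.asyncio
async def test_ad_scraping():
    # placeholder URL for a live leboncoin.com ad page
    ad_data = await leboncoin.scrape_ad("https://www.leboncoin.fr/ad/ventes_immobilieres/example")
    validator = Validator(ad_schema, allow_unknown=True)
    assert validator.validate(ad_data), validator.errors
```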