Merge pull request #6 from mazen-r/main
Add leboncoin.com scraper
Showing 8 changed files with 14,647 additions and 0 deletions.
**GitHub Actions workflow** (new file, +49 lines), running both test targets on manual dispatch and on a weekly cron:

```yaml
name: Leboncoin.com Test
on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * THU'

env:
  PROJECT_DIR: leboncoin-scraper

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        test: [test_search_scraping, test_ad_scraping]

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -

      - name: Cache Poetry virtual environment
        uses: actions/cache@v2
        id: cache
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/${{ env.PROJECT_DIR }}/pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-poetry-

      - name: Install dependencies
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry install

      - name: Run test
        env:
          SCRAPFLY_KEY: ${{ secrets.SCRAPFLY_KEY }}
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry run pytest test.py -k ${{ matrix.test }}
```
**README.md** (new file, +46 lines):

# leboncoin.com Scraper

This scraper uses [scrapfly.io](https://scrapfly.io/) and Python to scrape ad data from leboncoin.com.

Full tutorial

The scraping code is located in the `leboncoin.py` file. It's fully documented and simplified for educational purposes, and the example scraper run code can be found in the `run.py` file.

This scraper scrapes:
- Leboncoin search pages, for finding ad listings
- Leboncoin ad pages, for ad data

For output examples see the `./results` directory.

## Fair Use Disclaimer

Note that this code is provided free of charge, as is, and Scrapfly does __not__ provide free web scraping support or consultation. For any bugs, see the issue tracker.

## Setup and Use

This Leboncoin scraper uses __Python 3.10__ with the [scrapfly-sdk](https://pypi.org/project/scrapfly-sdk/) package, which is used to scrape and parse Leboncoin's data.

0. Ensure you have __Python 3.10__ and the [poetry Python package manager](https://python-poetry.org/docs/#installation) on your system.
1. Retrieve your Scrapfly API key from <https://scrapfly.io/dashboard> and set the `SCRAPFLY_KEY` environment variable:
   ```shell
   $ export SCRAPFLY_KEY="YOUR SCRAPFLY KEY"
   ```
2. Clone the repository and install the Python environment:
   ```shell
   $ git clone https://github.com/scrapfly/scrapfly-scrapers.git
   $ cd scrapfly-scrapers/leboncoin-scraper
   $ poetry install
   ```
3. Run the example scrape:
   ```shell
   $ poetry run python run.py
   ```
4. Run the tests:
   ```shell
   $ poetry install --with dev
   $ poetry run pytest test.py
   # or specific scraping areas
   $ poetry run pytest test.py -k test_search_scraping
   $ poetry run pytest test.py -k test_ad_scraping
   ```
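The commit also adds `run.py`, which is not shown in this diff. As a rough illustration only, a runner in this style could drive the `scrape_search` and `scrape_ad` functions from `leboncoin.py` (shown below) like this; the search and ad URLs are placeholders, and the output file names are assumptions rather than what the actual `run.py` writes:

```python
# run.py (sketch): not part of the shown diff; URLs and file names are illustrative
import asyncio
import json
from pathlib import Path

import leboncoin

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


async def run():
    # scrape the first two pages of an example search query
    search_data = await leboncoin.scrape_search(
        url="https://www.leboncoin.fr/recherche?text=iphone",  # placeholder query
        scrape_all_pages=False,
        max_pages=2,
    )
    output.joinpath("search.json").write_text(json.dumps(search_data, indent=2, ensure_ascii=False))

    # scrape a single ad page (placeholder URL)
    ad_data = await leboncoin.scrape_ad(url="https://www.leboncoin.fr/ad/ventes_immobilieres/example")
    output.joinpath("ad.json").write_text(json.dumps(ad_data, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    asyncio.run(run())
```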
**leboncoin.py** (new file, +81 lines):

```python
"""
This is an example web scraper for leboncoin.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from pathlib import Path
from typing import Dict, List

from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
    # enable anti scraping protection bypass
    "asp": True,
    # set the proxy location to France
    "country": "fr",
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_search(result: ScrapeApiResponse) -> List[Dict]:
    """parse search result data from the nextjs cache"""
    # select the __NEXT_DATA__ script from the HTML
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the ad listing data from the search page
    ads_data = json.loads(next_data)["props"]["pageProps"]["initialProps"]["searchData"]["ads"]
    return ads_data


def _max_search_pages(result: ScrapeApiResponse) -> int:
    """get the maximum number of pages in the search"""
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the total number of pages
    max_search_pages = json.loads(next_data)["props"]["pageProps"]["initialProps"]["searchData"]["max_pages"]
    return max_search_pages


def parse_ad(result: ScrapeApiResponse) -> Dict:
    """parse ad data from the nextjs cache"""
    next_data = result.selector.css("script[id='__NEXT_DATA__']::text").get()
    # extract the ad data from the ad page
    ad_data = json.loads(next_data)["props"]["pageProps"]["ad"]
    return ad_data


async def scrape_search(url: str, scrape_all_pages: bool, max_pages: int = 10) -> List[Dict]:
    """scrape leboncoin search pages"""
    log.info("scraping search {}", url)
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    search_data = parse_search(first_page)
    total_search_pages = _max_search_pages(first_page)
    # scrape a specific number of search pages
    if not scrape_all_pages and max_pages < total_search_pages:
        total_pages = max_pages
    # scrape all available pages if scrape_all_pages=True or max_pages exceeds the total
    else:
        total_pages = total_search_pages
    log.info("scraping search {} pagination ({} more pages)", url, total_pages - 1)
    # add the remaining pages to a scraping list
    _other_pages = [
        ScrapeConfig(f"{first_page.context['url']}&page={page}", **BASE_CONFIG)
        for page in range(2, total_pages + 1)
    ]
    # scrape the remaining pages concurrently
    async for result in SCRAPFLY.concurrent_scrape(_other_pages):
        ads_data = parse_search(result)
        search_data.extend(ads_data)
    log.info("scraped {} ads from {}", len(search_data), url)
    return search_data


async def scrape_ad(url: str) -> Dict:
    """scrape a single ad page"""
    log.info("scraping ad {}", url)
    result = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    ad_data = parse_ad(result)
    return ad_data
```
**pyproject.toml** (new file, +31 lines):

```toml
[tool.poetry]
name = "scrapfly-leboncoin"
version = "0.1.0"
description = "demo web scraper for leboncoin.com using Scrapfly"
authors = ["Mazen Ramadan <mazen@scrapfly.io>"]
license = "NPOS-3.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
nested-lookup = "^0.2.25"
loguru = "^0.7.1"

[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
pytest = "^7.3.1"
cerberus = "^1.3.4"
asyncio = "^3.4.3"
pytest-asyncio = "^0.21.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
python_files = "test.py"

[tool.black]
line-length = 120
target-version = ['py37', 'py38', 'py39', 'py310', 'py311']
```
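The commit's `test.py` is likewise not shown in this diff. Judging from the dev dependencies above (pytest, pytest-asyncio, cerberus) and the test names the workflow runs, one of the tests could be sketched roughly as follows; the cerberus schema fields and the ad URL are assumptions, not the actual values:

```python
# test.py (sketch): not part of the shown diff; schema fields and URL are assumed examples
import pytest
from cerberus import Validator

import leboncoin

# assumed minimal schema; the real test likely validates many more fields
ad_schema = {
    "list_id": {"type": "integer"},
    "subject": {"type": "string"},
    "url": {"type": "string"},
}


@pytest.mark.asyncio
async def test_ad_scraping():
    # placeholder URL for a live leboncoin.com ad page
    ad_data = await leboncoin.scrape_ad("https://www.leboncoin.fr/ad/ventes_immobilieres/example")
    validator = Validator(ad_schema, allow_unknown=True)
    assert validator.validate(ad_data), validator.errors
```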