Merge pull request #20 from mazen-r/main
add domain.com.au scraper
Showing 8 changed files with 4,901 additions and 0 deletions.
@@ -0,0 +1,49 @@
name: Domain.com.au Test
on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * THU'

env:
  PROJECT_DIR: domaincom-scraper

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        test: [test_search_scraping, test_properties_scraping]

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -
      - name: Cache Poetry virtual environment
        uses: actions/cache@v2
        id: cache
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles(format('**/{0}/pyproject.toml', env.PROJECT_DIR)) }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Install dependencies
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry install
      - name: Run test
        env:
          SCRAPFLY_KEY: ${{ secrets.SCRAPFLY_KEY }}
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry run pytest test.py -k ${{ matrix.test }}
@@ -0,0 +1,45 @@
# Domain.com.au Scraper

This scraper uses [scrapfly.io](https://scrapfly.io/) and Python to scrape property listing data from Domain.com.au.

Full tutorial

The scraping code is located in the `domaincom.py` file. It's fully documented and simplified for educational purposes, and the example scraper run code can be found in the `run.py` file.

This scraper scrapes:
- Domain.com.au property search for finding property listings
- Domain.com.au property pages for property listing data

For output examples see the `./results` directory.
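For orientation, a run script can be as small as the sketch below. It only illustrates how `scrape_search` and `scrape_properties` (defined in `domaincom.py`) fit together; the URLs and output file names are placeholders rather than values from this repository, and the actual `run.py` may differ.

```python
# illustrative run script sketch; URLs and output file names are placeholders
import asyncio
import json

import domaincom  # requires SCRAPFLY_KEY in the environment; creates ./results on import


async def main():
    # scrape one page of search results for an example location
    search = await domaincom.scrape_search(
        "https://www.domain.com.au/sale/melbourne-vic-3000/", max_scrape_pages=1
    )
    with open("results/search.json", "w", encoding="utf-8") as f:
        json.dump(search, f, indent=2, ensure_ascii=False)

    # scrape full listing data for a property page (placeholder URL)
    properties = await domaincom.scrape_properties(
        ["https://www.domain.com.au/some-listing-address-example-2000012345"]
    )
    with open("results/properties.json", "w", encoding="utf-8") as f:
        json.dump(properties, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())
```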
## Fair Use Disclaimer

Note that this code is provided free of charge as is, and Scrapfly does __not__ provide free web scraping support or consultation. For any bugs, see the issue tracker.

## Setup and Use

This Domain.com.au scraper uses __Python 3.10__ with the [scrapfly-sdk](https://pypi.org/project/scrapfly-sdk/) package, which is used to scrape and parse Domain.com.au's data.

0. Ensure you have __Python 3.10__ and the [poetry Python package manager](https://python-poetry.org/docs/#installation) on your system.
1. Retrieve your Scrapfly API key from <https://scrapfly.io/dashboard> and set the `SCRAPFLY_KEY` environment variable:
    ```shell
    $ export SCRAPFLY_KEY="YOUR SCRAPFLY KEY"
    ```
2. Clone and install the Python environment:
    ```shell
    $ git clone https://github.com/scrapfly/scrapfly-scrapers.git
    $ cd scrapfly-scrapers/domaincom-scraper
    $ poetry install
    ```
3. Run example scrape:
    ```shell
    $ poetry run python run.py
    ```
4. Run tests:
    ```shell
    $ poetry install --with dev
    $ poetry run pytest test.py
    # or specific scraping areas
    $ poetry run pytest test.py -k test_search_scraping
    $ poetry run pytest test.py -k test_properties_scraping
    ```
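The CI workflow runs `test.py` with the `test_search_scraping` and `test_properties_scraping` names from its matrix, but `test.py` itself is not part of the excerpt shown here. A search test could look roughly like the sketch below (a hypothetical reconstruction using the `pytest-asyncio` dev dependency; the URL and assertions are illustrative):

```python
# hypothetical sketch of a test.py entry; the real test file is not shown in this diff
import pytest

import domaincom


@pytest.mark.asyncio
async def test_search_scraping():
    # scrape a single search page to keep the test cheap (URL is illustrative)
    results = await domaincom.scrape_search(
        "https://www.domain.com.au/sale/melbourne-vic-3000/", max_scrape_pages=1
    )
    assert len(results) > 0
    # each refined search item keeps the keys selected by parse_search_page
    for item in results:
        assert "id" in item
        assert "listingModel" in item
```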
@@ -0,0 +1,148 @@
"""
This is an example web scraper for domain.com.au
To run this scraper set the env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
import jmespath
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from typing import Dict, List
from pathlib import Path
from loguru import logger as log


SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])


BASE_CONFIG = {
    # bypass domain.com.au scraping blocking
    "asp": True,
    # set the proxy country to australia
    "country": "AU",
}


output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_hidden_data(response: ScrapeApiResponse):
    """parse json data from script tags"""
    selector = response.selector
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    data = json.loads(script)
    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
    return data["props"]["pageProps"]["componentProps"]


def parse_property_page(data: Dict) -> Dict:
    """refine property pages data"""
    if not data:
        return
    result = jmespath.search(
        """{
        listingId: listingId,
        listingUrl: listingUrl,
        unitNumber: unitNumber,
        streetNumber: streetNumber,
        street: street,
        suburb: suburb,
        postcode: postcode,
        createdOn: createdOn,
        propertyType: propertyType,
        beds: beds,
        phone: phone,
        agencyName: agencyName,
        propertyDeveloperName: propertyDeveloperName,
        agencyProfileUrl: agencyProfileUrl,
        propertyDeveloperUrl: propertyDeveloperUrl,
        description: description,
        loanfinder: loanfinder,
        schools: schoolCatchment.schools,
        suburbInsights: suburbInsights,
        gallery: gallery,
        listingSummary: listingSummary,
        agents: agents,
        features: features,
        structuredFeatures: structuredFeatures,
        faqs: faqs
        }""",
        data,
    )
    return result


def parse_search_page(data):
    """refine search pages data"""
    if not data:
        return
    data = data["listingsMap"]
    result = []
    # iterate over card items in the search data
    for key in data.keys():
        item = data[key]
        parsed_data = jmespath.search(
            """{
            id: id,
            listingType: listingType,
            listingModel: listingModel
            }""",
            item,
        )
        # exclude the skeletonImages key from the data
        parsed_data["listingModel"].pop("skeletonImages")
        result.append(parsed_data)
    return result


async def scrape_properties(urls: List[str]) -> List[Dict]:
    """scrape listing data from property pages"""
    # add the property page URLs to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    properties = []
    # scrape all the property pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        # parse the data from script tag
        data = parse_hidden_data(response)
        # append the data to the list after refining
        properties.append(parse_property_page(data))
    log.success(f"scraped {len(properties)} property listings")
    return properties


async def scrape_search(url: str, max_scrape_pages: int = None):
    """scrape property listings from search pages"""
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    log.info("scraping search page {}", url)
    data = parse_hidden_data(first_page)
    search_data = parse_search_page(data)
    # get the number of maximum search pages
    max_search_pages = data["totalPages"]
    # scrape all available pages if max_scrape_pages isn't set or exceeds the total page count
    if not max_scrape_pages or max_scrape_pages > max_search_pages:
        max_scrape_pages = max_search_pages
    log.info(
        f"scraping search pagination, remaining ({max_scrape_pages - 1} more pages)"
    )
    # add the remaining search pages to a scraping list
    other_pages = [
        ScrapeConfig(
            # paginate the search pages by adding a "?page" parameter at the end of the URL
            str(first_page.context["url"]) + f"?page={page}",
            **BASE_CONFIG,
        )
        for page in range(2, max_scrape_pages + 1)
    ]
    # scrape the remaining search pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(other_pages):
        # parse the data from script tag
        data = parse_hidden_data(response)
        # append the data to the list after refining
        search_data.extend(parse_search_page(data))
    log.success(f"scraped ({len(search_data)}) from {url}")
    return search_data
@@ -0,0 +1,30 @@
[tool.poetry]
name = "scrapfly-domaincom"
version = "0.1.0"
description = "demo web scraper for domain.com.au using Scrapfly"
authors = ["Mazen Ramadan <mazen@scrapfly.io>"]
license = "NPOS-3.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
loguru = "^0.7.1"

[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
pytest = "^7.3.1"
cerberus = "^1.3.4"
asyncio = "^3.4.3"
pytest-asyncio = "^0.21.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
python_files = "test.py"

[tool.black]
line-length = 120
target-version = ['py37', 'py38', 'py39', 'py310', 'py311']