Merge pull request #20 from mazen-r/main
add domain.com.au scraper
mazen-r authored Nov 22, 2023
2 parents 272053c + 6ba27f8 commit 5e25df2
Showing 8 changed files with 4,901 additions and 0 deletions.
49 changes: 49 additions & 0 deletions .github/workflows/domaincom.yaml
@@ -0,0 +1,49 @@
name: Domain.com.au Test
on:
workflow_dispatch:
schedule:
- cron: '0 2 * * THU'

env:
PROJECT_DIR: domaincom-scraper

jobs:
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
test: [test_search_scraping, test_properties_scraping]

steps:
- uses: actions/checkout@v2

- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.10"

- name: Install Poetry
run: |
curl -sSL https://install.python-poetry.org | python3 -
- name: Cache Poetry virtual environment
uses: actions/cache@v2
id: cache
with:
path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles(format('**/{0}/pyproject.toml', env.PROJECT_DIR)) }}
restore-keys: |
${{ runner.os }}-poetry-
- name: Install dependencies
run: |
cd ${{ env.PROJECT_DIR }}
poetry install
- name: Run test
env:
SCRAPFLY_KEY: ${{ secrets.SCRAPFLY_KEY }}
run: |
cd ${{ env.PROJECT_DIR }}
poetry run pytest test.py -k ${{ matrix.test }}
45 changes: 45 additions & 0 deletions domaincom-scraper/README.md
@@ -0,0 +1,45 @@
# Domain.com.au Scraper

This scraper uses [scrapfly.io](https://scrapfly.io/) and Python to scrape property listing data from Domain.com.au.

Full tutorial

The scraping code is located in the `domaincom.py` file. It is fully documented and simplified for educational purposes, and the example scraper run code can be found in the `run.py` file.

This scraper scrapes:
- Domain.com.au property search for finding property listings
- Domain.com.au property pages for property listing data

For output examples see the `./results` directory.

## Fair Use Disclaimer

Note that this code is provided free of charge, as-is, and Scrapfly does __not__ provide free web scraping support or consultation. For any bugs, see the issue tracker.

## Setup and Use

This Domain.com.au scraper uses __Python 3.10__ with the [scrapfly-sdk](https://pypi.org/project/scrapfly-sdk/) package, which is used to scrape and parse Domain.com.au's data.

0. Ensure you have __Python 3.10__ and the [Poetry Python package manager](https://python-poetry.org/docs/#installation) installed on your system.
1. Retrieve your Scrapfly API key from <https://scrapfly.io/dashboard> and set the `SCRAPFLY_KEY` environment variable:
```shell
$ export SCRAPFLY_KEY="YOUR SCRAPFLY KEY"
```
2. Clone and install Python environment:
```shell
$ git clone https://github.com/scrapfly/scrapfly-scrapers.git
$ cd scrapfly-scrapers/domaincom-scraper
$ poetry install
```
3. Run example scrape:
```shell
$ poetry run python run.py
```
4. Run tests:
```shell
$ poetry install --with dev
$ poetry run pytest test.py
# or specific scraping areas
$ poetry run pytest test.py -k test_search_scraping
$ poetry run pytest test.py -k test_properties_scraping
```
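
The example run code lives in `run.py`, which is part of this commit but not rendered above. As a rough, hypothetical sketch of how the two scraper coroutines defined in `domaincom.py` could be driven (the search and listing URLs below are made up for illustration, not known-good inputs):

```python
# minimal sketch assuming the scrape_search/scrape_properties coroutines
# from domaincom.py; the repository's actual run.py may differ
import asyncio
import json

import domaincom


async def main():
    # scrape the first two search result pages (hypothetical search URL)
    search_data = await domaincom.scrape_search(
        "https://www.domain.com.au/sale/melbourne-vic-3000/", max_scrape_pages=2
    )
    print(f"scraped {len(search_data)} search listings")

    # scrape individual property pages (hypothetical listing URL)
    properties = await domaincom.scrape_properties(
        ["https://www.domain.com.au/some-listing-123456789"]
    )
    print(json.dumps(properties, indent=2, ensure_ascii=False))


asyncio.run(main())
```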
148 changes: 148 additions & 0 deletions domaincom-scraper/domaincom.py
@@ -0,0 +1,148 @@
"""
This is an example web scraper for domain.com.au
To run this scraper, set the $SCRAPFLY_KEY env variable with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
import jmespath
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from typing import Dict, List
from pathlib import Path
from loguru import logger as log


SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])


BASE_CONFIG = {
# bypass domain.com.au scraping blocking
"asp": True,
# set the proxy country to australia
"country": "AU",
}


output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_hidden_data(response: ScrapeApiResponse):
"""parse json data from script tags"""
selector = response.selector
script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
data = json.loads(script)
with open("data.json", "w", encoding="utf-8") as file:
json.dump(data, file, indent=2, ensure_ascii=False)
return data["props"]["pageProps"]["componentProps"]


def parse_property_page(data: Dict) -> Dict:
"""refine property pages data"""
if not data:
return
result = jmespath.search(
"""{
listingId: listingId,
listingUrl: listingUrl,
unitNumber: unitNumber,
streetNumber: streetNumber,
street: street,
suburb: suburb,
postcode: postcode,
createdOn: createdOn,
propertyType: propertyType,
beds: beds,
phone: phone,
agencyName: agencyName,
propertyDeveloperName: propertyDeveloperName,
agencyProfileUrl: agencyProfileUrl,
propertyDeveloperUrl: propertyDeveloperUrl,
description: description,
loanfinder: loanfinder,
schools: schoolCatchment.schools,
suburbInsights: suburbInsights,
gallery: gallery,
listingSummary: listingSummary,
agents: agents,
features: features,
structuredFeatures: structuredFeatures,
faqs: faqs
}""",
data,
)
return result


def parse_search_page(data):
"""refine search pages data"""
if not data:
return
data = data["listingsMap"]
result = []
# iterate over card items in the search data
for key in data.keys():
item = data[key]
parsed_data = jmespath.search(
"""{
id: id,
listingType: listingType,
listingModel: listingModel
}""",
item,
)
        # exclude the skeletonImages key from the data
parsed_data["listingModel"].pop("skeletonImages")
result.append(parsed_data)
return result


async def scrape_properties(urls: List[str]) -> List[Dict]:
"""scrape listing data from property pages"""
# add the property page URLs to a scraping list
to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
properties = []
    # scrape all the property pages concurrently
async for response in SCRAPFLY.concurrent_scrape(to_scrape):
# parse the data from script tag
data = parse_hidden_data(response)
        # append the refined data to the list
properties.append(parse_property_page(data))
log.success(f"scraped {len(properties)} property listings")
return properties


async def scrape_search(url: str, max_scrape_pages: int = None):
"""scrape property listings from search pages"""
first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
log.info("scraping search page {}", url)
data = parse_hidden_data(first_page)
search_data = parse_search_page(data)
    # get the total number of available search pages
    max_search_pages = data["totalPages"]
    # scrape all pages when max_scrape_pages isn't set or exceeds what's available
    if max_scrape_pages is None or max_scrape_pages > max_search_pages:
        max_scrape_pages = max_search_pages
    log.info(f"scraping search pagination ({max_scrape_pages - 1} more pages)")
# add the remaining search pages to a scraping list
other_pages = [
ScrapeConfig(
# paginate the search pages by adding a "?page" parameter at the end of the URL
str(first_page.context["url"]) + f"?page={page}",
**BASE_CONFIG,
)
for page in range(2, max_scrape_pages + 1)
]
# scrape the remaining search pages concurrently
async for response in SCRAPFLY.concurrent_scrape(other_pages):
# parse the data from script tag
data = parse_hidden_data(response)
        # append the refined search data to the list
search_data.extend(parse_search_page(data))
log.success(f"scraped ({len(search_data)}) from {url}")
return search_data
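
Both parse functions follow the same pattern: load the hidden `__NEXT_DATA__` JSON from the page's script tag, then reduce it with a JMESPath multiselect-hash projection that keeps only the wanted keys and can reach into nested paths. A tiny standalone illustration of that reduction step, using an invented input dict rather than real Domain.com.au data:

```python
import jmespath

# invented sample resembling a small slice of the hidden page data
sample = {
    "listingId": 12345,
    "street": "Example St",
    "schoolCatchment": {"schools": [{"name": "Example Primary"}]},
    "ignoredKey": "dropped by the projection",
}

# the projection keeps only the named keys and flattens the nested schools path
refined = jmespath.search(
    "{listingId: listingId, street: street, schools: schoolCatchment.schools}",
    sample,
)
print(refined)
# {'listingId': 12345, 'street': 'Example St', 'schools': [{'name': 'Example Primary'}]}
```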
30 changes: 30 additions & 0 deletions domaincom-scraper/pyproject.toml
@@ -0,0 +1,30 @@
[tool.poetry]
name = "scrapfly-domaincom"
version = "0.1.0"
description = "demo web scraper for domain.com.au using Scrapfly"
authors = ["Mazen Ramadan <mazen@scrapfly.io>"]
license = "NPOS-3.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
loguru = "^0.7.1"

[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
pytest = "^7.3.1"
cerberus = "^1.3.4"
asyncio = "^3.4.3"
pytest-asyncio = "^0.21.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
python_files = "test.py"

[tool.black]
line-length = 120
target-version = ['py37', 'py38', 'py39', 'py310', 'py311']
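
The workflow above runs `test.py` with `-k test_search_scraping` and `-k test_properties_scraping`, and the dev dependencies pull in `pytest-asyncio` and `cerberus`, which points at async tests that validate scraped output against a schema. `test.py` is among the 8 changed files but is not rendered in this view; a hypothetical sketch of what one such test could look like (the schema fields and listing URL are illustrative assumptions, not the repository's actual test code):

```python
# hypothetical sketch in the spirit of test.py, not the actual file
import pytest
from cerberus import Validator

import domaincom

# minimal illustrative schema; the real tests presumably check many more fields
property_schema = {
    "listingId": {"type": "integer"},
    "listingUrl": {"type": "string"},
    "suburb": {"type": "string"},
}


@pytest.mark.asyncio
async def test_properties_scraping():
    # illustrative URL only; a real test would use known-good listing URLs
    urls = ["https://www.domain.com.au/some-listing-123456789"]
    results = await domaincom.scrape_properties(urls)
    assert len(results) == len(urls)
    validator = Validator(property_schema, allow_unknown=True)
    for item in results:
        assert validator.validate(item), validator.errors
```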