Merge pull request #20 from mazen-r/main
add domain.com.au scraper
Showing 8 changed files with 4,901 additions and 0 deletions.
@@ -0,0 +1,49 @@
name: Domain.com.au Test
on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * THU'

env:
  PROJECT_DIR: domaincom-scraper

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        test: [test_search_scraping, test_properties_scraping]

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: "3.10"

      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -
      - name: Cache Poetry virtual environment
        uses: actions/cache@v2
        id: cache
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles(format('**/{0}/pyproject.toml', env.PROJECT_DIR)) }}
          restore-keys: |
            ${{ runner.os }}-poetry-
      - name: Install dependencies
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry install
      - name: Run test
        env:
          SCRAPFLY_KEY: ${{ secrets.SCRAPFLY_KEY }}
        run: |
          cd ${{ env.PROJECT_DIR }}
          poetry run pytest test.py -k ${{ matrix.test }}
@@ -0,0 +1,45 @@
# Domain.com.au Scraper

This scraper uses [scrapfly.io](https://scrapfly.io/) and Python to scrape property listing data from Domain.com.au.

Full tutorial

The scraping code is located in the `domaincom.py` file. It's fully documented and simplified for educational purposes, and the example scraper run code can be found in the `run.py` file.

This scraper scrapes:
- Domain.com.au property search for finding property listings
- Domain.com.au property pages for property listing data

For output examples see the `./results` directory.
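For orientation, a run script can be as small as the sketch below. It only illustrates how `scrape_search` and `scrape_properties` (defined in `domaincom.py`) fit together; the URLs and output file names are placeholders rather than values from this repository, and the actual `run.py` may differ.

```python
# illustrative run script sketch; URLs and output file names are placeholders
import asyncio
import json

import domaincom  # requires SCRAPFLY_KEY in the environment; creates ./results on import


async def main():
    # scrape one page of search results for an example location
    search = await domaincom.scrape_search(
        "https://www.domain.com.au/sale/melbourne-vic-3000/", max_scrape_pages=1
    )
    with open("results/search.json", "w", encoding="utf-8") as f:
        json.dump(search, f, indent=2, ensure_ascii=False)

    # scrape full listing data for a property page (placeholder URL)
    properties = await domaincom.scrape_properties(
        ["https://www.domain.com.au/some-listing-address-example-2000012345"]
    )
    with open("results/properties.json", "w", encoding="utf-8") as f:
        json.dump(properties, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    asyncio.run(main())
```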
## Fair Use Disclaimer

Note that this code is provided free of charge as is, and Scrapfly does __not__ provide free web scraping support or consultation. For any bugs, see the issue tracker.

## Setup and Use

This Domain.com.au scraper uses __Python 3.10__ with the [scrapfly-sdk](https://pypi.org/project/scrapfly-sdk/) package, which is used to scrape and parse Domain.com.au's data.

0. Ensure you have __Python 3.10__ and the [poetry Python package manager](https://python-poetry.org/docs/#installation) on your system.
1. Retrieve your Scrapfly API key from <https://scrapfly.io/dashboard> and set the `SCRAPFLY_KEY` environment variable:
    ```shell
    $ export SCRAPFLY_KEY="YOUR SCRAPFLY KEY"
    ```
2. Clone and install the Python environment:
    ```shell
    $ git clone https://github.com/scrapfly/scrapfly-scrapers.git
    $ cd scrapfly-scrapers/domaincom-scraper
    $ poetry install
    ```
3. Run example scrape:
    ```shell
    $ poetry run python run.py
    ```
4. Run tests:
    ```shell
    $ poetry install --with dev
    $ poetry run pytest test.py
    # or specific scraping areas
    $ poetry run pytest test.py -k test_search_scraping
    $ poetry run pytest test.py -k test_properties_scraping
    ```
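The CI workflow runs `test.py` with the `test_search_scraping` and `test_properties_scraping` names from its matrix, but `test.py` itself is not part of the excerpt shown here. A search test could look roughly like the sketch below (a hypothetical reconstruction using the `pytest-asyncio` dev dependency; the URL and assertions are illustrative):

```python
# hypothetical sketch of a test.py entry; the real test file is not shown in this diff
import pytest

import domaincom


@pytest.mark.asyncio
async def test_search_scraping():
    # scrape a single search page to keep the test cheap (URL is illustrative)
    results = await domaincom.scrape_search(
        "https://www.domain.com.au/sale/melbourne-vic-3000/", max_scrape_pages=1
    )
    assert len(results) > 0
    # each refined search item keeps the keys selected by parse_search_page
    for item in results:
        assert "id" in item
        assert "listingModel" in item
```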
@@ -0,0 +1,148 @@
"""
This is an example web scraper for domain.com.au
To run this scraper set the env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
import jmespath
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
from typing import Dict, List
from pathlib import Path
from loguru import logger as log


SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])


BASE_CONFIG = {
    # bypass domain.com.au scraping blocking
    "asp": True,
    # set the proxy country to australia
    "country": "AU",
}


output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_hidden_data(response: ScrapeApiResponse):
    """parse json data from script tags"""
    selector = response.selector
    script = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    data = json.loads(script)
    with open("data.json", "w", encoding="utf-8") as file:
        json.dump(data, file, indent=2, ensure_ascii=False)
    return data["props"]["pageProps"]["componentProps"]


def parse_property_page(data: Dict) -> Dict:
    """refine property pages data"""
    if not data:
        return
    result = jmespath.search(
        """{
        listingId: listingId,
        listingUrl: listingUrl,
        unitNumber: unitNumber,
        streetNumber: streetNumber,
        street: street,
        suburb: suburb,
        postcode: postcode,
        createdOn: createdOn,
        propertyType: propertyType,
        beds: beds,
        phone: phone,
        agencyName: agencyName,
        propertyDeveloperName: propertyDeveloperName,
        agencyProfileUrl: agencyProfileUrl,
        propertyDeveloperUrl: propertyDeveloperUrl,
        description: description,
        loanfinder: loanfinder,
        schools: schoolCatchment.schools,
        suburbInsights: suburbInsights,
        gallery: gallery,
        listingSummary: listingSummary,
        agents: agents,
        features: features,
        structuredFeatures: structuredFeatures,
        faqs: faqs
        }""",
        data,
    )
    return result


def parse_search_page(data):
    """refine search pages data"""
    if not data:
        return
    data = data["listingsMap"]
    result = []
    # iterate over card items in the search data
    for key in data.keys():
        item = data[key]
        parsed_data = jmespath.search(
            """{
            id: id,
            listingType: listingType,
            listingModel: listingModel
            }""",
            item,
        )
        # exclude the skeletonImages key from the data
        parsed_data["listingModel"].pop("skeletonImages")
        result.append(parsed_data)
    return result


async def scrape_properties(urls: List[str]) -> List[Dict]:
    """scrape listing data from property pages"""
    # add the property page URLs to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    properties = []
    # scrape all the property pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        # parse the data from script tag
        data = parse_hidden_data(response)
        # append the data to the list after refining
        properties.append(parse_property_page(data))
    log.success(f"scraped {len(properties)} property listings")
    return properties


async def scrape_search(url: str, max_scrape_pages: int = None):
    """scrape property listings from search pages"""
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    log.info("scraping search page {}", url)
    data = parse_hidden_data(first_page)
    search_data = parse_search_page(data)
    # get the number of maximum search pages
    max_search_pages = data["totalPages"]
    # scrape all available pages if max_scrape_pages isn't set or exceeds the total page count
    if not max_scrape_pages or max_scrape_pages > max_search_pages:
        max_scrape_pages = max_search_pages
    log.info(
        f"scraping search pagination, remaining ({max_scrape_pages - 1} more pages)"
    )
    # add the remaining search pages to a scraping list
    other_pages = [
        ScrapeConfig(
            # paginate the search pages by adding a "?page" parameter at the end of the URL
            str(first_page.context["url"]) + f"?page={page}",
            **BASE_CONFIG,
        )
        for page in range(2, max_scrape_pages + 1)
    ]
    # scrape the remaining search pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(other_pages):
        # parse the data from script tag
        data = parse_hidden_data(response)
        # append the data to the list after refining
        search_data.extend(parse_search_page(data))
    log.success(f"scraped ({len(search_data)}) from {url}")
    return search_data
@@ -0,0 +1,30 @@
[tool.poetry]
name = "scrapfly-domaincom"
version = "0.1.0"
description = "demo web scraper for domain.com.au using Scrapfly"
authors = ["Mazen Ramadan <mazen@scrapfly.io>"]
license = "NPOS-3.0"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
loguru = "^0.7.1"

[tool.poetry.group.dev.dependencies]
black = "^23.7.0"
pytest = "^7.3.1"
cerberus = "^1.3.4"
asyncio = "^3.4.3"
pytest-asyncio = "^0.21.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
python_files = "test.py"

[tool.black]
line-length = 120
target-version = ['py37', 'py38', 'py39', 'py310', 'py311']