-
Notifications
You must be signed in to change notification settings - Fork 0
Games ~ Scraping Categories and Mechanics from BGG
Owen Fahey edited this page Nov 15, 2023
·
4 revisions
Below is code to scrape the BGG site for information on categories and mechanics and to create corresponding fixtures.
The output of running this script will be a file called mechanics_categories_fixtures.json. This file should be placed in /src/chigame/games/fixtures.
Relevant Links
- Corresponding issue
- Corresponding PR (forthcoming)
# For Jupyter Notebook use
# !pip install requests
# !pip install beautifulsoup4
import json
import requests
from bs4 import BeautifulSoup # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# ============ FUNCTIONS ============
def fetch_and_parse(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # A browser-like User-Agent header is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    # Without an explicit timeout a stalled connection can block forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were real content.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")
def extract_categories_or_mechanics(url):
    """Collect the category/mechanic links found on the page at *url*.

    Returns:
        A dict mapping each matching link's stripped text to its ``href``.
        A link matches when its ``href`` contains ``/boardgamecategory/`` or
        ``/boardgamemechanic/``. If several links share the same text, the
        ``href`` of the last one is kept.
    """
    page = fetch_and_parse(url)
    wanted_fragments = ("/boardgamecategory/", "/boardgamemechanic/")
    # Key: category or mechanic name, value: the link it points to.
    return {
        anchor.text.strip(): anchor["href"]
        for anchor in page.find_all("a", href=True)
        if any(fragment in anchor["href"] for fragment in wanted_fragments)
    }
def extract_info(url):
    """Return ``(image_source, description)`` for a mechanic or category page.

    The image source is read from the page's ``og:image`` meta tag and the
    description from its ``description`` meta tag. When a tag is missing, the
    image falls back to ``/static/images/no_picture_available.png`` and the
    description to an empty string.
    """
    page = fetch_and_parse(url)

    # Image: taken from the Open Graph meta tag, with a placeholder fallback.
    image_tag = page.find("meta", property="og:image")
    if image_tag:
        image_source = image_tag["content"]
    else:
        image_source = "/static/images/no_picture_available.png"

    # Description: taken from the standard description meta tag.
    description_tag = page.find("meta", {"name": "description"})
    if description_tag:
        description_text = description_tag["content"].strip()
    else:
        description_text = ""

    return image_source, description_text
def create_fixtures(base_url, path, model_name):
    """Build fixture records for every category or mechanic on an index page.

    Args:
        base_url: Site root, e.g. ``https://boardgamegeek.com`` (no trailing
            slash); it is joined to ``path`` with a single ``/``.
        path: Path of the index page, relative to ``base_url``.
        model_name: Value stored under the ``"model"`` key of each record.

    Returns:
        A list of dicts with ``"model"``, ``"pk"`` and ``"fields"`` keys.
        Primary keys are numbered from 1 in the order the items were found;
        ``"fields"`` holds the item's name, description and image source.
    """
    index_url = f"{base_url}/{path}"
    items = extract_categories_or_mechanics(index_url)

    fixtures = []
    for pk, (name, relative_url) in enumerate(items.items(), start=1):
        # An href that is already absolute must not be prefixed again,
        # otherwise the resulting URL is malformed.
        if relative_url.startswith(("http://", "https://")):
            full_url = relative_url
        else:
            full_url = f"{base_url}{relative_url}"

        img_src, description = extract_info(full_url)
        fixtures.append(
            {
                "model": model_name,
                "pk": pk,
                "fields": {
                    "name": name,
                    "description": description,
                    "image": img_src,
                },
            }
        )
    return fixtures
# ============ EXECUTION ============
base_url = "https://boardgamegeek.com"

# Extract categories and mechanics (name -> link) from the two index pages
categories = extract_categories_or_mechanics(base_url + "/browse/boardgamecategory")
mechanics = extract_categories_or_mechanics(base_url + "/browse/boardgamemechanic")

# Print categories and mechanics so the scraped names can be checked by eye
print("Categories:", categories)
print("Mechanics:", mechanics)

# Create fixtures for Mechanics
mechanics_fixtures = create_fixtures(
    base_url, "browse/boardgamemechanic", "games.mechanic"
)

# Create fixtures for Categories
categories_fixtures = create_fixtures(
    base_url, "browse/boardgamecategory", "games.category"
)

# Combine mechanics and categories fixtures (mechanics first)
combined_fixtures = mechanics_fixtures + categories_fixtures

# Save to JSON file; the encoding is explicit so the output does not depend
# on the platform's default locale encoding.
with open("mechanics_categories_fixtures.json", "w", encoding="utf-8") as f:
    json.dump(combined_fixtures, f, indent=4)
-
Tournaments
-
Games
-
Matches
-
API
-
Forums
-
Users & Friending