Skip to content

Games ~ Scraping Categories and Mechanics from BGG

Owen Fahey edited this page Nov 15, 2023 · 4 revisions

Below is a script that scrapes the BoardGameGeek (BGG) site for information on categories and mechanics and creates the corresponding fixtures. Running the script produces a file called mechanics_categories_fixtures.json, which should be placed in /src/chigame/games/fixtures.

Relevant Links

  1. Corresponding issue
  2. Corresponding PR (forthcoming)
# For Jupyter Notebook use
# !pip install requests
# !pip install beautifulsoup4

import json
import requests
from bs4 import BeautifulSoup  # https://www.crummy.com/software/BeautifulSoup/bs4/doc/


# ============ FUNCTIONS ============


def fetch_and_parse(url, timeout=30):
    """Download *url* and return its HTML parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response body using the
        built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # A browser-like User-Agent is sent with every request.
    # NOTE(review): presumably the site rejects the default ``requests``
    # User-Agent -- confirm before removing.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page, which
    # would otherwise yield empty descriptions and placeholder images.
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def extract_categories_or_mechanics(url):
    """Collect the category/mechanic links found on the page at *url*.

    Every ``<a>`` element whose ``href`` contains ``/boardgamecategory/`` or
    ``/boardgamemechanic/`` is kept.

    Returns:
        A dict mapping the stripped link text to the link's ``href``. When
        several links share the same text, the last one on the page wins.
    """
    markers = ("/boardgamecategory/", "/boardgamemechanic/")
    page = fetch_and_parse(url)
    return {
        anchor.text.strip(): anchor["href"]
        for anchor in page.find_all("a", href=True)
        if any(marker in anchor["href"] for marker in markers)
    }


def extract_info(url):
    """Get the image URL and description for one mechanic or category page.

    Returns:
        A ``(img_src, description)`` tuple. ``img_src`` is the ``content`` of
        the page's ``og:image`` meta tag, or a placeholder image path when
        that tag is absent. ``description`` is the stripped ``content`` of
        the ``description`` meta tag, or an empty string when that tag is
        absent.
    """
    page = fetch_and_parse(url)

    # Image: read from <meta property="og:image">, else use the placeholder.
    og_image = page.find("meta", property="og:image")
    if og_image:
        img_src = og_image["content"]
    else:
        img_src = "/static/images/no_picture_available.png"

    # Description: read from <meta name="description">, else leave it empty.
    meta_description = page.find("meta", {"name": "description"})
    if meta_description:
        description = meta_description["content"].strip()
    else:
        description = ""

    return img_src, description


def create_fixtures(base_url, path, model_name):
    """Build fixture entries for every item listed on an index page.

    Args:
        base_url: Site root without a trailing slash; it is prefixed to both
            *path* and each item's link.
        path: Index page path relative to *base_url*, without a leading slash.
        model_name: Value stored under each fixture's ``"model"`` key.

    Returns:
        A list of dicts with ``"model"``, ``"pk"`` and ``"fields"`` keys.
        Primary keys start at 1 and follow the order in which items were
        found on the index page.
    """
    listing = extract_categories_or_mechanics(f"{base_url}/{path}")

    fixtures = []
    for pk, (name, relative_url) in enumerate(listing.items(), start=1):
        # One extra request per item to pick up its image and description.
        img_src, description = extract_info(f"{base_url}{relative_url}")
        fixtures.append(
            {
                "model": model_name,
                "pk": pk,
                "fields": {
                    "name": name,
                    "description": description,
                    "image": img_src,
                },
            }
        )
    return fixtures


# ============ EXECUTION ============


base_url = "https://boardgamegeek.com"

# Scrape both index pages and show what was found.
categories = extract_categories_or_mechanics(f"{base_url}/browse/boardgamecategory")
mechanics = extract_categories_or_mechanics(f"{base_url}/browse/boardgamemechanic")

print("Categories:", categories)
print("Mechanics:", mechanics)

# Build the fixtures, mechanics first and then categories. create_fixtures
# fetches each index page again itself before visiting every item page.
mechanics_fixtures = create_fixtures(
    base_url,
    "browse/boardgamemechanic",
    "games.mechanic",
)
categories_fixtures = create_fixtures(
    base_url,
    "browse/boardgamecategory",
    "games.category",
)

# Merge both fixture lists into a single list (mechanics, then categories).
combined_fixtures = [*mechanics_fixtures, *categories_fixtures]

# Write the combined fixtures to the JSON file.
with open("mechanics_categories_fixtures.json", "w") as f:
    json.dump(combined_fixtures, f, indent=4)
Clone this wiki locally