Skip to content

Commit

Permalink
Refactor SMK script to use the ProviderDataIngester class (#742)
Browse files Browse the repository at this point in the history
Co-authored-by: Madison Swain-Bowden <bowdenm@spu.edu>
  • Loading branch information
krysal and AetherUnbound authored Sep 29, 2022
1 parent 3b58e60 commit 323d07b
Show file tree
Hide file tree
Showing 7 changed files with 400 additions and 621 deletions.
366 changes: 160 additions & 206 deletions openverse_catalog/dags/providers/provider_api_scripts/smk.py
Original file line number Diff line number Diff line change
@@ -1,223 +1,177 @@
import logging

from common import constants
from common.licenses import get_license_info
from common.loader import provider_details as prov
from common.requester import DelayedRequester
from common.storage.image import ImageStore
from requests.exceptions import JSONDecodeError
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester


# Configure logging for the script and create the module-level logger.
logging.basicConfig(
format="%(asctime)s - %(name)s - %(levelname)s: %(message)s", level=logging.INFO
)
logger = logging.getLogger(__name__)

# Rows requested per API call; `main` also advances `offset` by this amount.
LIMIT = 2000
# Passed as `delay` to DelayedRequester below.
# NOTE(review): presumably seconds between requests — confirm.
DELAY = 5
# Default number of attempts `_get_batch_items` makes for a single batch.
RETRIES = 3
PROVIDER = prov.SMK_DEFAULT_PROVIDER
ENDPOINT = "https://api.smk.dk/api/v1/art/search/"
# Prefix to which an item's `object_number` is appended to build its landing URL.
LANDING_PAGE_BASE_URL = "https://open.smk.dk/en/artwork/image/"
# Default `image_size` used by `_get_image_url` when building IIIF URLs.
IMAGE_SIZE = 2048

# Shared requester and image store used by the module-level functions.
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)
# Ingester for the SMK art search API, built on ProviderDataIngester.
class SmkDataIngester(ProviderDataIngester):
# URL of the search endpoint.
endpoint = "https://api.smk.dk/api/v1/art/search/"
# NOTE(review): presumably the pause between requests, consumed by the base
# class — confirm units.
delay = 5
# Page size: sent as `rows` and used as the `offset` step in
# `get_next_query_params`.
batch_limit = 2000
headers = {"Accept": "application/json"}
# NOTE(review): presumably maps each media type to the provider name its
# records are stored under — confirm against the base class.
providers = {"image": prov.SMK_DEFAULT_PROVIDER}

# Base query parameters for the search endpoint. The filters restrict results
# to items flagged `has_image` and `public_domain`; `_get_query_param` copies
# this dict and overrides `offset` for each batch.
DEFAULT_QUERY_PARAMS = {
"keys": "*",
"filters": "[has_image:true],[public_domain:true]",
"offset": 0,
"rows": LIMIT,
}
def get_media_type(self, record: dict) -> str:
    """Return the media type of ``record``.

    The record is not inspected: this ingester always reports
    ``constants.IMAGE``.
    """
    media_type = constants.IMAGE
    return media_type

# Default request headers; `_get_batch_items` copies them when none are given.
HEADERS = {"Accept": "application/json"}
def get_next_query_params(self, prev_query_params: dict | None, **kwargs) -> dict:
if not prev_query_params:
return {
"keys": "*",
"filters": "[has_image:true],[public_domain:true]",
"offset": 0,
"rows": self.batch_limit,
}
return {
**prev_query_params,
"offset": prev_query_params["offset"] + self.batch_limit,
}

def get_batch_data(self, response_json) -> list:
return response_json.get("items")

@staticmethod
def _get_foreign_landing_url(item) -> str | None:
"""Use the English site instead of the original link."""
object_num = item.get("object_number")
if not object_num:
logger.info(
f"Image with (foreign) id {item.get('id')} does not have "
"`object_number`! Therefore we cannot build the "
"foreign_landing_url."
)
return
return f"https://open.smk.dk/en/artwork/image/{object_num}"

@staticmethod
def _get_image_url(image_iiif_id: str, image_size=2048):
# For high quality IIIF-enabled images, restrict the image size to prevent
# loading very large files.
# TODO: consider just using the full "image_native" when adding the
# "image_thumbnail".
image_url = f"{image_iiif_id}/full/!{image_size},/0/default.jpg"
return image_url

@staticmethod
def _get_title(item: dict) -> str | None:
titles = item.get("titles")
if not titles or not isinstance(titles, list):
logger.info(f"No title for image with (foreign) id {item.get('id')}.")
return
return titles[0].get("title")

@staticmethod
def _get_creator(item: dict) -> str | None:
# TODO: review this field, there could be more than one creator or artist.
# Keeping it as it was for the class refactor.
data = item.get("production", [])
if not data or not isinstance(data, list):
return
return data[0].get("creator")

@staticmethod
def _get_images(item: dict) -> list:
images = []

# Legacy images do not have an iiif_id; fall back to the ID from the
# collection DB.
iiif_id = item.get("image_iiif_id")
image_id = iiif_id or item.get("id")

if image_id is not None:
if iiif_id is None:
# Legacy images do not have IIIF links.
image_url = item.get("image_native")
else:
image_url = SmkDataIngester._get_image_url(iiif_id)

height = item.get("image_height")
width = item.get("image_width")
filesize = item.get("image_size") or item.get("size")
images.append(
{
"id": image_id,
"image_url": image_url,
"height": height,
"width": width,
"filesize": filesize,
}
)

alternative_images = item.get("alternative_images")
if type(alternative_images) == list:
for alt_img in alternative_images:
if type(alt_img) == dict:
iiif_id = alt_img.get("iiif_id")
if iiif_id is None:
# The API for alternative images does not include the
# 'id', so we must skip if `iiif_id` is not present.
continue
image_url = SmkDataIngester._get_image_url(iiif_id)
height = alt_img.get("height")
width = alt_img.get("width")
filesize = alt_img.get("image_size") or alt_img.get("size")

images.append(
{
"id": iiif_id,
"image_url": image_url,
"height": height,
"width": width,
"filesize": filesize,
}
)
return images

@staticmethod
def _get_metadata(item: dict) -> dict:
meta_data = {}
if created_date := item.get("created"):
meta_data["created_date"] = created_date
collection = item.get("collection")
if type(collection) == list:
meta_data["collection"] = ",".join(collection)
techniques = item.get("techniques")
if type(techniques) == list:
meta_data["techniques"] = ",".join(techniques)
colors = item.get("colors")
if type(colors) == list:
meta_data["colors"] = ",".join(colors)
return meta_data

def get_record_data(self, data: dict) -> dict | list[dict] | None:
    """Turn one API record into a list of image records.

    :param data: a single record from the API response.
    :return: ``None`` when no license can be determined from ``rights``;
        otherwise one dict per image found by ``_get_images`` (possibly an
        empty list).
    """
    license_info = get_license_info(license_url=data.get("rights"))
    if license_info is None:
        return None

    # These values are identical for every image of the record, so compute
    # them once; this also avoids repeating the same log line per image.
    foreign_landing_url = self._get_foreign_landing_url(data)
    title = self._get_title(data)
    creator = self._get_creator(data)

    return [
        {
            "foreign_identifier": img.get("id"),
            "foreign_landing_url": foreign_landing_url,
            "image_url": img.get("image_url"),
            "license_info": license_info,
            "title": title,
            "creator": creator,
            "height": img.get("height"),
            "width": img.get("width"),
            "filesize": img.get("filesize"),
            # Built per image so each record keeps its own dict.
            "meta_data": self._get_metadata(data),
        }
        for img in self._get_images(data)
    ]


def main():
    """Page through the API, store every batch, then commit the image store.

    Paging stops at the first batch that is not a non-empty list.
    """
    offset = 0
    while True:
        query_params = _get_query_param(offset=offset)
        items = _get_batch_items(query_params=query_params)
        # A failed request (non-list) or an empty page both end the run.
        if not isinstance(items, list) or not items:
            break
        _handle_items_data(items)
        offset += LIMIT
    image_count = image_store.commit()
    logger.info(f"total images collected {image_count}")


def _get_query_param(offset=0, default_query_param=None):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_params = default_query_param.copy()
query_params.update(offset=offset)
return query_params


def _get_batch_items(
    endpoint=ENDPOINT, query_params=None, headers=None, retries=RETRIES
):
    """Request one batch and return its ``"items"``, retrying on failure.

    :param endpoint: URL to request.
    :param query_params: query parameters for the request.
    :param headers: request headers; a copy of ``HEADERS`` when ``None``.
    :param retries: maximum number of attempts.
    :return: the value under ``"items"`` from the first response containing
        that key, or ``None`` when every attempt fails.
    """
    if headers is None:
        headers = HEADERS.copy()
    for attempt in range(1, retries + 1):
        response = delay_request.get(endpoint, query_params, headers=headers)
        # NOTE(review): the original relied on catching AttributeError here,
        # so the requester presumably returns None on failure — confirm.
        if response is None:
            logger.error(f"no response received (attempt {attempt} of {retries})")
            continue
        try:
            response_json = response.json()
        except (AttributeError, JSONDecodeError, ValueError, TypeError) as e:
            logger.error(f"errored due to {e}")
            continue
        if isinstance(response_json, dict) and "items" in response_json:
            return response_json["items"]
    return None


def _handle_items_data(
    items,
    landing_page_base=LANDING_PAGE_BASE_URL,
):
    """Add every usable image of every item to the image store.

    Items without images, without a recognised license, or without an
    ``object_number`` are skipped.

    :param items: list of records from the API response.
    :param landing_page_base: prefix to which ``object_number`` is appended.
    :return: the value returned by the last ``image_store.add_item`` call,
        or ``0`` when nothing was added.
    """
    image_count = 0
    for item in items:
        images = _get_images(item)
        if not images:
            continue
        license_, version = _get_license_info(item.get("rights"))
        if license_ is None and version is None:
            continue
        object_id = item.get("object_number")
        if object_id is None:
            continue
        foreign_landing_url = landing_page_base + object_id
        creator = _get_creator(item.get("production"))
        title = _get_title(item.get("titles"))
        meta_data = _get_metadata(item)
        # The license is the same for every image of the item: resolve it
        # once instead of once per image.
        license_info = get_license_info(license_=license_, license_version=version)
        for img in images:
            image_count = image_store.add_item(
                foreign_identifier=img.get("id"),
                foreign_landing_url=foreign_landing_url,
                image_url=img.get("image_url"),
                height=img.get("height"),
                width=img.get("width"),
                filesize=img.get("filesize"),
                license_info=license_info,
                creator=creator,
                title=title,
                meta_data=meta_data,
            )
    return image_count


def _get_images(item):
images = []

# Legacy images do not have an iiif_id; fall back to the ID from the
# collection DB.
iiif_id = item.get("image_iiif_id")
id = iiif_id or item.get("id")

if id is not None:
if iiif_id is None:
# Legacy images do not have IIIF links.
image_url = item.get("image_native")
else:
image_url = _get_image_url(iiif_id)

height = item.get("image_height")
width = item.get("image_width")
filesize = item.get("image_size") or item.get("size")
images.append(
{
"id": id,
"image_url": image_url,
"height": height,
"width": width,
"filesize": filesize,
}
)

alternative_images = item.get("alternative_images")
if type(alternative_images) == list:
for alt_img in alternative_images:
if type(alt_img) == dict:
iiif_id = alt_img.get("iiif_id")
if iiif_id is None:
# The API for alternative images does not include the
# 'id', so we must skip if `iiif_id` is not present.
continue
image_url = _get_image_url(iiif_id)
height = alt_img.get("height")
width = alt_img.get("width")
filesize = alt_img.get("image_size") or alt_img.get("size")

images.append(
{
"id": iiif_id,
"image_url": image_url,
"height": height,
"width": width,
"filesize": filesize,
}
)
return images


def _get_image_url(image_iiif_id, image_size=IMAGE_SIZE):
    """Return the JPEG URL for an IIIF image, limited to ``image_size``.

    :param image_iiif_id: base IIIF identifier/URL of the image.
    :param image_size: value placed in the ``!{image_size},`` size segment.
    """
    # High quality IIIF-enabled images can be very large, so a size-limited
    # rendition is requested instead of the full file.
    iiif_suffix = f"/full/!{image_size},/0/default.jpg"
    return image_iiif_id + iiif_suffix


def _get_license_info(rights):
license_, version = None, None
if type(rights) == str:
if "creativecommons" in rights:
license_, version = "cc0", "1.0"

return license_, version


def _get_creator(production):
creator = None
if type(production) == list:
if type(production[0]) == dict:
creator = production[0].get("creator")
return creator


def _get_title(titles):
title = None
if type(titles) == list:
if type(titles[0]) == dict:
title = titles[0].get("title")
return title


def _get_metadata(item):
meta_data = {}
created_date = item.get("created")
if created_date:
meta_data["created_date"] = created_date
collection = item.get("collection")
if type(collection) == list:
meta_data["collection"] = ",".join(collection)
techniques = item.get("techniques")
if type(techniques) == list:
meta_data["techniques"] = ",".join(techniques)
colors = item.get("colors")
if type(colors) == list:
meta_data["colors"] = ",".join(colors)
return meta_data
logger.info("Begin: SMK provider script")
ingester = SmkDataIngester()
ingester.ingest_records()


if __name__ == "__main__":
Expand Down
2 changes: 2 additions & 0 deletions openverse_catalog/dags/providers/provider_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from providers.provider_api_scripts.museum_victoria import VictoriaDataIngester
from providers.provider_api_scripts.provider_data_ingester import ProviderDataIngester
from providers.provider_api_scripts.science_museum import ScienceMuseumDataIngester
from providers.provider_api_scripts.smk import SmkDataIngester
from providers.provider_api_scripts.stocksnap import StockSnapDataIngester
from providers.provider_api_scripts.wikimedia_commons import (
WikimediaCommonsDataIngester,
Expand Down Expand Up @@ -195,6 +196,7 @@ def __post_init__(self):
),
ProviderWorkflow(
provider_script="smk",
ingestion_callable=SmkDataIngester,
start_date=datetime(2020, 1, 1),
),
ProviderWorkflow(
Expand Down
Loading

0 comments on commit 323d07b

Please sign in to comment.