diff --git a/catalog/justfile b/catalog/justfile index 8e261fde859..bb4f9446e36 100644 --- a/catalog/justfile +++ b/catalog/justfile @@ -142,6 +142,27 @@ generate-dag-docs fail_on_diff="false": fi fi + +# Generate the media properties documentation +generate-media-props fail_on_diff="true": + #!/bin/bash + set -e + python utilities/media_props_gen/generate_media_properties.py \&\& chmod 666 utilities/media_props_gen/media_properties.md + # Move the file to the documentation folder + mv utilities/media_props_gen/media_properties.md ../documentation/meta/media_properties.md + echo -n "Running linting..." + # Linting step afterwards is necessary since the generated output differs greatly from what prettier expects + just ../lint prettier ../documentation/meta/media_properties.md &>/dev/null || true + echo "Done!" + if {{ fail_on_diff }}; then + set +e + git diff --exit-code ../documentation/meta/media_properties.md + if [ $? -ne 0 ]; then + printf "\n\n\e[31m!! Changes found in Media properties documentation, please run 'just generate-media-props' locally and commit difference !!\n\n" + exit 1 + fi + fi + # Generate files for a new provider add-provider provider_name endpoint +media_types="image": python3 templates/create_provider_ingester.py "{{ provider_name }}" "{{ endpoint }}" -m {{ media_types }} diff --git a/catalog/utilities/media_props_gen/__init__.py b/catalog/utilities/media_props_gen/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog/utilities/media_props_gen/column_parser.py b/catalog/utilities/media_props_gen/column_parser.py new file mode 100644 index 00000000000..788c7a563bf --- /dev/null +++ b/catalog/utilities/media_props_gen/column_parser.py @@ -0,0 +1,117 @@ +import ast +from pathlib import Path + + +STORAGE_PATH = Path(__file__).parents[2] / "dags" / "common" / "storage" +COLUMNS_PATH = STORAGE_PATH / "columns.py" + +COLUMNS_URL = "https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py" # noqa: 
E501 + + +def format_python_column( + column_db_name: str, + python_column: dict[str, any], + python_column_lines: dict[str, tuple[int, int]], +) -> str: + col_type = python_column.pop("python_type") + start, end = python_column_lines[col_type] + python_column_string = f"[{col_type}]({COLUMNS_URL}#L{start}-L{end}) (" + + col_name = python_column.pop("name") + if col_name != column_db_name: + python_column_string += f"name='{col_name}', " + + custom_props_string = "" + if custom_props := python_column.pop("custom_column_props", None): + props_string = ", ".join([f"{k}={v}" for k, v in custom_props.items()]) + custom_props_string = f", {col_type}Props({props_string})" + + python_column_string += ", ".join([f"{k}={v}" for k, v in python_column.items()]) + python_column_string += f"{custom_props_string})" + + return python_column_string + + +def file_to_ast() -> ast.Module: + with open(COLUMNS_PATH) as f: + file_contents = f.read() + return ast.parse(file_contents) + + +def parse_python_columns() -> dict[str, any]: + """Get the Python column definitions from the columns.py file.""" + columns = {} + python_column_lines = get_python_column_types() + + code = file_to_ast() + for item in ast.iter_child_nodes(code): + if isinstance(item, ast.Assign): + if not (column := parse_column_definition(item)): + continue + db_name = column.pop("db_name") + columns[db_name] = format_python_column( + db_name, column, python_column_lines + ) + + return columns + + +def get_python_column_types() -> dict[str, tuple[int, int]]: + """ + Parse the columns.py file to get the Python column names + and their line numbers for hyperlinks. 
+ Sample output: `StringColumn: (3, 5)` + """ + code = file_to_ast() + return { + item.name: (item.lineno, item.end_lineno) + for item in ast.iter_child_nodes(code) + if isinstance(item, ast.ClassDef) and item.name.endswith("Column") + } + + +def parse_column_definition(item: ast.Assign) -> dict[str, any] | None: + column = { + "python_type": None, + "name": None, + "db_name": None, + "nullable": None, + "required": False, + "upsert_strategy": "newest_non_null", + "custom_column_props": {}, + } + if hasattr(item.value, "func") and hasattr(item.value.func, "id"): + column["python_type"] = item.value.func.id + + if hasattr(item.value, "keywords"): + for kw in item.value.keywords: + if hasattr(kw.value, "value"): + if kw.arg not in column.keys(): + column["custom_column_props"][kw.arg] = kw.value.value + else: + # upsert_strategy is a special case + if hasattr(kw.value, "attr"): + column[kw.arg] = kw.value.attr + else: + column[kw.arg] = kw.value.value + else: + if not hasattr(kw.value, "keywords"): + continue + # An Array column that has a base_column + column_params = ", ".join( + [f"{kw2.arg}={kw2.value.value}" for kw2 in kw.value.keywords] + ) + column["custom_column_props"][kw.arg] = ( + f"{kw.value.func.id}({column_params})" + ) + if not column.get("name"): + return None + column["db_name"] = column.get("db_name") or column["name"] + if column["custom_column_props"] == {}: + del column["custom_column_props"] + if column["nullable"] is None: + column["nullable"] = ( + not column["required"] if column["required"] is not None else True + ) + return column + return None diff --git a/catalog/utilities/media_props_gen/generate_media_properties.py b/catalog/utilities/media_props_gen/generate_media_properties.py new file mode 100644 index 00000000000..e9a9d23cda2 --- /dev/null +++ b/catalog/utilities/media_props_gen/generate_media_properties.py @@ -0,0 +1,246 @@ +"""Automatic media properties generation.""" + +import logging +import re +from dataclasses import 
dataclass +from pathlib import Path +from typing import Any, Literal + +from column_parser import parse_python_columns + + +log = logging.getLogger(__name__) +# Silence noisy modules +logging.getLogger("common.storage.media").setLevel(logging.WARNING) + +# Constants +DOC_MD_PATH = Path(__file__).parent / "media_properties.md" +SOURCE_MD_PATH = Path(__file__).parent / "media_props.md" +LOCAL_POSTGRES_FOLDER = Path(__file__).parents[3] / "docker" / "upstream_db" + +SQL_PATH = { + "image": LOCAL_POSTGRES_FOLDER / "0003_openledger_image_schema.sql", + "audio": LOCAL_POSTGRES_FOLDER / "0006_openledger_audio_schema.sql", +} +sql_types = [ + "integer", + "boolean", + "uuid", + "double precision", + "jsonb", + "timestamp with time zone", + "character varying", +] +sql_type_regex = re.compile(f"({'|'.join(sql_types)})") + +MediaType = Literal["audio", "image"] +MEDIA_TYPES: list[MediaType] = ["audio", "image"] + + +@dataclass +class FieldInfo: + name: str + nullable: bool + datatype: str + constraint: str + python_column: str = "" + + +@dataclass +class FieldSqlInfo: + nullable: bool + datatype: str + constraint: str + + +def create_db_props_dict( + media_type: MediaType, +) -> dict[Any, Any] | dict[Any, dict[str, FieldSqlInfo]]: + """ + Parse the DDL for a media type and return a dictionary that maps each + field name to its SQL definition. 
+ """ + + create_table_regex = re.compile(r"CREATE\s+TABLE\s+\w+\.(\w+)\s+\(([\s\S]*?)\);") + sql_path = SQL_PATH[media_type] + + with open(sql_path) as f: + contents = f.read() + table_description_matches = create_table_regex.search(contents) + if not table_description_matches: + print(f"Could not find table description for {media_type} in {sql_path}") + return {} + table_name = table_description_matches.group(1) + if table_name != media_type: + print(f"Table name {table_name} does not match media type {media_type}") + return {} + field_descriptions = [ + field.strip() + for field in table_description_matches.group(2).split("\n") + if field.strip() + ] + fields = {} + for field in field_descriptions: + field_name = field.split(" ")[0] + # False if "not null" in field.lower() else True + field_constraint = "" + try: + field_type = sql_type_regex.search(field).group(1) + if field_type == "character varying": + char_limit = field.split("(")[1].split(")")[0] + field_constraint = f"({char_limit})" + + if "[]" in field: + field_type = f"array of {field_type}" + except AttributeError: + raise ValueError(f"Could not find type for field {field_name} in {field}") + + fields[field_name] = { + "sql": FieldSqlInfo( + nullable="NOT NULL" not in field, + datatype=field_type, + constraint=field_constraint, + ) + } + return fields + + +def add_column_props(media_props, python_columns): + """Add the python column properties to the media properties dictionary.""" + for prop in media_props.keys(): + if not (python_prop := python_columns.get(prop)): + print(f"Column {prop} not found in table") + python_prop = "" + media_props[prop]["python_column"] = python_prop + return media_props + + +def parse_markdown() -> dict[str, dict[str, str]]: + """ + Parse the markdown documentation file and return a dictionary with the + field name as key and the description as value. 
+ """ + with open(SOURCE_MD_PATH) as f: + contents = [line for line in f.readlines() if line.strip()] + current_field = "" + properties = {} + prop = "" + value = {} + for i, line in enumerate(contents): + if line.startswith("# "): + if current_field and value: + properties[current_field] = value + current_field = line.replace("# ", "").strip() + value = {} + continue + elif line.startswith("## "): + prop = line.replace("## ", "").strip() + value[prop] = "" + continue + else: + value[prop] += line + + return properties + + +def generate_media_props() -> dict: + """ + Generate a dictionary with the media properties from the database, + python code and markdown documentation files. + """ + media_props = {} + python_columns = parse_python_columns() + + for media_type in MEDIA_TYPES: + media_props[media_type] = create_db_props_dict(media_type) + media_props[media_type] = add_column_props( + media_props[media_type], python_columns + ) + return media_props + + +def generate_media_props_table(media_properties) -> str: + """Generate the table with media properties.""" + + # Convert the list of FieldInfo objects to a md table + table = "| DB Field | Python Column | \n" + table += "| --- | --- | \n" + for field_name, field in media_properties.items(): + field_sql = field["sql"] + field_db_type = ( + field_sql.datatype + if not field_sql.constraint + else f"{field_sql.datatype} {field_sql.constraint}" + ) + db_properties = ( + f"{field_db_type}, {'nullable' if field_sql.nullable else 'non-nullable'}" + ) + table += ( + f"| [`{field_name}`](#{field_name}) ({db_properties}) | " + f"{field.get('python_column', '')} | \n" + ) + return table + + +def description_dict_to_markdown(description: dict[str, str]) -> str: + """Convert a dictionary with field descriptions to a markdown string.""" + description_md = "" + for name, value in description.items(): + description_md += f"#### {name}\n\n" + description_md += f"{value}\n\n" + return description_md + + +def 
generate_media_props_doc( + markdown_descriptions: dict, media_properties: dict +) -> str: + """Generate the long-form documentation for each media property.""" + media_docs = "" + for prop, description in markdown_descriptions.items(): + prop_heading = f"### {prop}\n\n" + media_types = [ + f"`{media_type}`" + for media_type, value in media_properties.items() + if prop in value.keys() + ] + + prop_heading += f"Media Types: {', '.join(media_types)}\n\n" + if prop_doc := description_dict_to_markdown(description): + media_docs += prop_heading + prop_doc + + return media_docs + + +def generate_markdown_doc( + media_properties: dict[str, dict], markdown_descriptions: dict[str, dict] +) -> str: + """ + Generate the tables with media properties database column and + Python objects characteristics. + """ + with open(Path(__file__).parent / "preamble.md") as f: + preamble = f.read() + media_props_doc = f"""{preamble} +## Image Properties\n +{generate_media_props_table(media_properties["image"])} +""" # noqa 501 + media_props_doc += f"""## Audio Properties\n +{generate_media_props_table(media_properties["audio"])} +""" + media_props_doc += f"""## Media Property Descriptions\n +{generate_media_props_doc(markdown_descriptions, media_properties)} + """ + return media_props_doc + + +def write_media_props_doc(path: Path = DOC_MD_PATH) -> None: + """Generate the media properties documentation and write it to a file.""" + media_properties = generate_media_props() + markdown_descriptions = parse_markdown() + doc_text = generate_markdown_doc(media_properties, markdown_descriptions) + log.info(f"Writing DAG doc to {path}") + path.write_text(doc_text) + + +if __name__ == "__main__": + write_media_props_doc() diff --git a/catalog/utilities/media_props_gen/media_props.md b/catalog/utilities/media_props_gen/media_props.md new file mode 100644 index 00000000000..9553720a4c3 --- /dev/null +++ b/catalog/utilities/media_props_gen/media_props.md @@ -0,0 +1,425 @@ +# identifier + +## Description + +The 
unique UUID identifier for the media item. + +## Object Shape + +UUID + +## Selection Criteria + +Created when the item is inserted into the main table. + +## Normalization and Validation + +## Existing Data Inconsistencies + +# created_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# updated_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# ingestion_type + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# provider + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_identifier + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_landing_url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# thumbnail + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# width + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# height + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filesize + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data 
Inconsistencies + +# license + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# license_version + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator_url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# title + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# meta_data + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# tags + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# watermarked + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# last_synced_with_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# removed_from_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filetype + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# category + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# standardized_popularity + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# duration + +## 
Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# bit_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# sample_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# genres + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# alt_files + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# audio_set + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# audio_set_foreign_identifier + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# set_position + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# alt_files + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies diff --git a/catalog/utilities/media_props_gen/postamble.md b/catalog/utilities/media_props_gen/postamble.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/catalog/utilities/media_props_gen/preamble.md b/catalog/utilities/media_props_gen/preamble.md new file mode 100644 index 00000000000..f1c2ffa3093 --- /dev/null +++ b/catalog/utilities/media_props_gen/preamble.md @@ -0,0 +1,9 @@ +# Media Properties + +_This document is auto-generated from the source code in +[/catalog/utilities/media_props_gen/generate_media_properties.py](https://github.com/WordPress/openverse/blob/main/catalog/utilities/media_props_gen/generate_media_properties.py)._ + 
+This is a list of the media properties, with the descriptions of corresponding +database columns and Python objects that are used to store and retrieve media +data. The order of the properties corresponds to their order in the image_view +materialized view. diff --git a/documentation/meta/index.md b/documentation/meta/index.md index 23d71cf4484..d3a55e6968f 100644 --- a/documentation/meta/index.md +++ b/documentation/meta/index.md @@ -11,4 +11,5 @@ decision_making/index documentation/index monitoring/index maintenance/index +media_properties ``` diff --git a/documentation/meta/media_properties.md b/documentation/meta/media_properties.md new file mode 100644 index 00000000000..7a8b8ca0bb9 --- /dev/null +++ b/documentation/meta/media_properties.md @@ -0,0 +1,561 @@ +# Media Properties + +_This document is auto-generated from the source code in +[/catalog/utilities/media_props_gen/generate_media_properties.py](https://github.com/WordPress/openverse/blob/main/catalog/utilities/media_props_gen/generate_media_properties.py)._ + +This is a list of the media properties, with the descriptions of corresponding +database columns and Python objects that are used to store and retrieve media +data. The order of the properties corresponds to their order in the image_view +materialized view. 
+ +## Image Properties + +| DB Field | Python Column | +| ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [`identifier`](#identifier) (uuid, nullable) | [UUIDColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L500-L517) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`created_on`](#created_on) (timestamp with time zone, non-nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=False, required=True, upsert_strategy=no_change) | +| [`updated_on`](#updated_on) (timestamp with time zone, non-nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=False, required=True, upsert_strategy=newest_non_null) | +| [`ingestion_type`](#ingestion_type) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`provider`](#provider) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`source`](#source) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, 
truncate=False)) | +| [`foreign_identifier`](#foreign_identifier) (character varying (3000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=3000, truncate=False)) | +| [`foreign_landing_url`](#foreign_landing_url) (character varying (1000), nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=True, required=True, upsert_strategy=newest_non_null, URLColumnProps(size=1000)) | +| [`url`](#url) (character varying (3000), non-nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=False, required=True, upsert_strategy=newest_non_null, URLColumnProps(size=3000)) | +| [`thumbnail`](#thumbnail) (character varying (3000), nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (name='thumbnail_url', nullable=True, required=False, upsert_strategy=newest_non_null, URLColumnProps(size=3000)) | +| [`width`](#width) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`height`](#height) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`filesize`](#filesize) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`license`](#license) (character varying (50), non-nullable) | 
[StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (name='license\_', nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=50, truncate=False)) | +| [`license_version`](#license_version) (character varying (25), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=25, truncate=False)) | +| [`creator`](#creator) (character varying (2000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=2000, truncate=True)) | +| [`creator_url`](#creator_url) (character varying (2000), nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=True, required=False, upsert_strategy=newest_non_null, URLColumnProps(size=2000)) | +| [`title`](#title) (character varying (5000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=5000, truncate=True)) | +| [`meta_data`](#meta_data) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`tags`](#tags) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=merge_jsonb_arrays) | +| [`watermarked`](#watermarked) (boolean, nullable) | 
[BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`last_synced_with_source`](#last_synced_with_source) (timestamp with time zone, nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`removed_from_source`](#removed_from_source) (boolean, non-nullable) | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (nullable=False, required=True, upsert_strategy=false) | +| [`filetype`](#filetype) (character varying (5), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(truncate=False, size=5)) | +| [`category`](#category) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`standardized_popularity`](#standardized_popularity) (double precision, nullable) | [CalculatedColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L259-L337) (nullable=True, required=False, upsert_strategy=newest_non_null) | + +## Audio Properties + +| DB Field | Python Column | +| ---------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 
[`identifier`](#identifier) (uuid, nullable) | [UUIDColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L500-L517) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`created_on`](#created_on) (timestamp with time zone, non-nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=False, required=True, upsert_strategy=no_change) | +| [`updated_on`](#updated_on) (timestamp with time zone, non-nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=False, required=True, upsert_strategy=newest_non_null) | +| [`ingestion_type`](#ingestion_type) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`provider`](#provider) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`source`](#source) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`foreign_identifier`](#foreign_identifier) (character varying (3000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=3000, truncate=False)) | +| [`foreign_landing_url`](#foreign_landing_url) (character varying (1000), 
nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=True, required=True, upsert_strategy=newest_non_null, URLColumnProps(size=1000)) | +| [`url`](#url) (character varying (3000), non-nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=False, required=True, upsert_strategy=newest_non_null, URLColumnProps(size=3000)) | +| [`thumbnail`](#thumbnail) (character varying (3000), nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (name='thumbnail_url', nullable=True, required=False, upsert_strategy=newest_non_null, URLColumnProps(size=3000)) | +| [`filetype`](#filetype) (character varying (5), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(truncate=False, size=5)) | +| [`duration`](#duration) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`bit_rate`](#bit_rate) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`sample_rate`](#sample_rate) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`category`](#category) (character varying (80), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, 
upsert_strategy=newest_non_null, StringColumnProps(size=80, truncate=False)) | +| [`genres`](#genres) (array of character varying (80), nullable) | [ArrayColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L599-L651) (nullable=True, required=False, upsert_strategy=newest_non_null, ArrayColumnProps(base_column=StringColumn(name=genre, required=False, size=80, truncate=False))) | +| [`audio_set`](#audio_set) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`set_position`](#set_position) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`alt_files`](#alt_files) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=merge_jsonb_arrays) | +| [`filesize`](#filesize) (integer, nullable) | [IntegerColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L216-L256) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`license`](#license) (character varying (50), non-nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (name='license\_', nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=50, truncate=False)) | +| [`license_version`](#license_version) (character varying (25), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=False, required=True, upsert_strategy=newest_non_null, StringColumnProps(size=25, truncate=False)) | +| [`creator`](#creator) 
(character varying (2000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=2000, truncate=True)) | +| [`creator_url`](#creator_url) (character varying (2000), nullable) | [URLColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L550-L596) (nullable=True, required=False, upsert_strategy=newest_non_null, URLColumnProps(size=2000)) | +| [`title`](#title) (character varying (5000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=5000, truncate=True)) | +| [`meta_data`](#meta_data) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`tags`](#tags) (jsonb, nullable) | [JSONColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L388-L454) (nullable=True, required=False, upsert_strategy=merge_jsonb_arrays) | +| [`watermarked`](#watermarked) (boolean, nullable) | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`last_synced_with_source`](#last_synced_with_source) (timestamp with time zone, nullable) | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`removed_from_source`](#removed_from_source) (boolean, non-nullable) | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) 
(nullable=False, required=True, upsert_strategy=false) | +| [`standardized_popularity`](#standardized_popularity) (double precision, nullable) | [CalculatedColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L259-L337) (nullable=True, required=False, upsert_strategy=newest_non_null) | +| [`audio_set_foreign_identifier`](#audio_set_foreign_identifier) (character varying (1000), nullable) | [StringColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L457-L497) (nullable=True, required=False, upsert_strategy=newest_non_null, StringColumnProps(size=1000, truncate=False)) | + +## Media Property Descriptions + +### identifier + +Media Types: `audio`, `image` + +#### Description + +The unique UUID identifier for the media item. + +#### Object Shape + +UUID + +#### Selection Criteria + +Created when the item is inserted into the main table. + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### created_on + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### updated_on + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### ingestion_type + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### provider + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### source + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### 
foreign_identifier + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### foreign_landing_url + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### url + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### thumbnail + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### width + +Media Types: `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### height + +Media Types: `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### filesize + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### license + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### license_version + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### creator + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### creator_url + +Media Types: `audio`, `image` + +#### Description + +#### Object 
Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### title + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### meta_data + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### tags + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### watermarked + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### last_synced_with_source + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### removed_from_source + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### filetype + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### category + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### standardized_popularity + +Media Types: `audio`, `image` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### duration + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### 
Normalization and Validation + +#### Existing Data Inconsistencies + +### bit_rate + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### sample_rate + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### genres + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### alt_files + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### audio_set + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### audio_set_foreign_identifier + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies + +### set_position + +Media Types: `audio` + +#### Description + +#### Object Shape + +#### Selection Criteria + +#### Normalization and Validation + +#### Existing Data Inconsistencies