PROD-1898 Provide API support for (optional) pagination of the `/system` endpoint and /datasets (#5071)
ethyca · Jul 16, 2024 · e63a74f · e63a74f
1 parent 75b7dda
commit e63a74f
Show file tree

Hide file tree

Showing 12 changed files with 859 additions and 20 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -28,6 +28,7 @@ The types of changes are:
 ### Changed
 - Updated the sample dataset for the Amplitude integration [#5063](https://github.com/ethyca/fides/pull/5063)
 - Messaging page now shows a notice if you have properties without any templates [#5077](https://github.com/ethyca/fides/pull/5077)
+- Endpoints for listing systems (GET /system) and datasets (GET /dataset) now support optional pagination [#5071](https://github.com/ethyca/fides/pull/5071)
 
 ### Developer Experience
 - Upgrade to React 18 and Chakra 2, including other dependencies [#5036](https://github.com/ethyca/fides/pull/5036)

diff --git a/src/fides/api/api/v1/endpoints/generic_overrides.py b/src/fides/api/api/v1/endpoints/generic_overrides.py
@@ -0,0 +1,64 @@
+from typing import List, Optional, Union
+
+from fastapi import APIRouter, Depends, Query, Security
+from fastapi_pagination import Page, Params
+from fastapi_pagination.ext.async_sqlalchemy import paginate as async_paginate
+from fideslang.models import Dataset
+from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.sql.expression import select
+
+from fides.api.db.crud import list_resource
+from fides.api.db.ctl_session import get_async_db
+from fides.api.oauth.utils import verify_oauth_client
+from fides.api.schemas.filter_params import FilterParams
+from fides.api.util.filter_utils import apply_filters_to_query
+from fides.common.api.scope_registry import DATASET_READ
+from fides.common.api.v1.urn_registry import V1_URL_PREFIX
+
+from fides.api.models.sql_models import (  # type: ignore[attr-defined] # isort: skip
+    Dataset as CtlDataset,
+)
+
+# The routers in this module override individual routes of the generic routers
+# (generic.py) when only some of a router's methods need a custom implementation.
+# A route defined here replaces the generic route with the same HTTP methods and path.
+
+dataset_router = APIRouter(tags=["Dataset"], prefix=V1_URL_PREFIX)
+
+
+@dataset_router.get(
+    "/dataset",
+    dependencies=[Security(verify_oauth_client, scopes=[DATASET_READ])],
+    response_model=Union[Page[Dataset], List[Dataset]],
+    name="List datasets (optionally paginated)",
+)
+async def list_dataset_paginated(
+    db: AsyncSession = Depends(get_async_db),
+    size: Optional[int] = Query(None, ge=1, le=100),
+    page: Optional[int] = Query(None, ge=1),
+    search: Optional[str] = Query(None),
+    data_categories: Optional[List[str]] = Query(None),
+) -> Union[Page[Dataset], List[Dataset]]:
+    """
+    Get a list of all of the Datasets.
+
+    If any pagination parameters (size or page) are provided, then the response will be
+    paginated & provided filters (search, data_categories) will be applied. A missing
+    `page` defaults to 1 and a missing `size` defaults to 50.
+    Otherwise all Datasets will be returned and the filters are ignored (this may be a
+    slow operation if there are many datasets, so using the pagination parameters
+    is recommended).
+    """
+    # No pagination requested: keep the original "return everything" behavior
+    if not (page or size):
+        return await list_resource(CtlDataset, db)
+
+    filter_params = FilterParams(search=search, data_categories=data_categories)
+    filtered_query = apply_filters_to_query(
+        query=select(CtlDataset),
+        search_model=CtlDataset,
+        taxonomy_model=CtlDataset,
+        filter_params=filter_params,
+    )
+    # LIMIT/OFFSET pagination is only predictable with an explicit ORDER BY: without
+    # it the database may return rows in a different order on every request, so
+    # consecutive pages could repeat or skip datasets.
+    ordered_query = filtered_query.order_by(CtlDataset.id)
+    pagination_params = Params(page=page or 1, size=size or 50)
+    return await async_paginate(db, ordered_query, pagination_params)
+
+
+# Top-level router that bundles the overriding routers defined in this module
+GENERIC_OVERRIDES_ROUTER = APIRouter()
+GENERIC_OVERRIDES_ROUTER.include_router(dataset_router)
diff --git a/src/fides/api/api/v1/endpoints/system.py b/src/fides/api/api/v1/endpoints/system.py
@@ -1,14 +1,16 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union
 
-from fastapi import Depends, HTTPException, Response, Security
+from fastapi import Depends, HTTPException, Query, Response, Security
 from fastapi_pagination import Page, Params
 from fastapi_pagination.bases import AbstractPage
+from fastapi_pagination.ext.async_sqlalchemy import paginate as async_paginate
 from fastapi_pagination.ext.sqlalchemy import paginate
 from fideslang.models import System as SystemSchema
 from fideslang.validation import FidesKey
 from loguru import logger
 from pydantic.types import conlist
 from sqlalchemy.ext.asyncio import AsyncSession
+from sqlalchemy.future import select
 from sqlalchemy.orm import Session
 from starlette import status
 from starlette.status import HTTP_200_OK, HTTP_204_NO_CONTENT, HTTP_404_NOT_FOUND
@@ -30,7 +32,10 @@
 )
 from fides.api.models.connectionconfig import ConnectionConfig, ConnectionType
 from fides.api.models.fides_user import FidesUser
-from fides.api.models.sql_models import System  # type:ignore[attr-defined]
+from fides.api.models.sql_models import (  # type:ignore[attr-defined]
+    PrivacyDeclaration,
+    System,
+)
 from fides.api.oauth.system_manager_oauth_util import (
     verify_oauth_client_for_system_from_fides_key,
     verify_oauth_client_for_system_from_request_body_cli,
@@ -49,6 +54,7 @@
 from fides.api.schemas.connection_configuration.saas_config_template_values import (
     SaasConnectionTemplateValues,
 )
+from fides.api.schemas.filter_params import FilterParams
 from fides.api.schemas.system import BasicSystemResponse, SystemResponse
 from fides.api.util.api_router import APIRouter
 from fides.api.util.connection_util import (
@@ -58,6 +64,7 @@
     patch_connection_configs,
     validate_secrets,
 )
+from fides.api.util.filter_utils import apply_filters_to_query
 from fides.common.api.scope_registry import (
     CONNECTION_CREATE_OR_UPDATE,
     CONNECTION_DELETE,
@@ -366,13 +373,47 @@ async def create(
             scopes=[SYSTEM_READ],
         )
     ],
-    response_model=List[BasicSystemResponse],
-    name="List",
+    response_model=Union[List[BasicSystemResponse], Page[BasicSystemResponse]],
+    name="List systems (optionally paginated)",
 )
 async def ls(  # pylint: disable=invalid-name
     db: AsyncSession = Depends(get_async_db),
+    size: Optional[int] = Query(None, ge=1, le=100),
+    page: Optional[int] = Query(None, ge=1),
+    search: Optional[str] = None,
+    data_uses: Optional[List[FidesKey]] = Query(None),
+    data_categories: Optional[List[FidesKey]] = Query(None),
+    data_subjects: Optional[List[FidesKey]] = Query(None),
 ) -> List:
-    """Get a list of all of the resources of this type."""
+    """Get a list of all of the Systems.
+
+    If any pagination parameters (size or page) are provided, then the response will be
+    paginated & provided filters (search, taxonomy fields) will be applied. A missing
+    `page` defaults to 1 and a missing `size` defaults to 50.
+    Otherwise all Systems will be returned and the filters are ignored (this may be a
+    slow operation if there are many systems, so using the pagination parameters
+    is recommended).
+    """
+    if size or page:
+        pagination_params = Params(page=page or 1, size=size or 50)
+        # Need to join with PrivacyDeclaration in order to be able to filter
+        # by data use, data category, and data subject
+        query = select(System).outerjoin(
+            PrivacyDeclaration, System.id == PrivacyDeclaration.system_id
+        )
+        filter_params = FilterParams(
+            search=search,
+            data_uses=data_uses,
+            data_categories=data_categories,
+            data_subjects=data_subjects,
+        )
+        filtered_query = apply_filters_to_query(
+            query=query,
+            filter_params=filter_params,
+            search_model=System,
+            taxonomy_model=PrivacyDeclaration,
+        )
+        # Add a distinct so we only get one row per system (the join yields one row
+        # per privacy declaration). The explicit ORDER BY on the same expression gives
+        # LIMIT/OFFSET pagination a stable row order, so consecutive pages cannot
+        # repeat or skip systems. NOTE(review): on PostgreSQL distinct(expr) renders
+        # DISTINCT ON, whose ORDER BY must start with that expression - confirm dialect.
+        duplicates_removed = filtered_query.distinct(System.id).order_by(System.id)
+        return await async_paginate(db, duplicates_removed, pagination_params)
+
     return await list_resource(System, db)
 
 

diff --git a/src/fides/api/app_setup.py b/src/fides/api/app_setup.py
@@ -7,6 +7,7 @@
 from typing import List
 
 from fastapi import APIRouter, FastAPI
+from fastapi.routing import APIRoute
 from loguru import logger
 from redis.exceptions import RedisError, ResponseError
 from slowapi.errors import RateLimitExceeded  # type: ignore
@@ -18,6 +19,7 @@
 from fides.api.api.v1 import CTL_ROUTER
 from fides.api.api.v1.api import api_router
 from fides.api.api.v1.endpoints.admin import ADMIN_ROUTER
+from fides.api.api.v1.endpoints.generic_overrides import GENERIC_OVERRIDES_ROUTER
 from fides.api.api.v1.endpoints.health import HEALTH_ROUTER
 from fides.api.api.v1.exception_handlers import ExceptionHandlers
 from fides.api.common_exceptions import FunctionalityNotConfigured, RedisConnectionError
@@ -57,6 +59,7 @@
 
 
 ROUTERS = [CTL_ROUTER, api_router, DB_ROUTER]
+# Routers whose routes replace already-registered routes that have the same
+# HTTP methods and path (see override_generic_routers)
+OVERRIDING_ROUTERS = [GENERIC_OVERRIDES_ROUTER]
 
 
 def create_fides_app(
@@ -80,6 +83,8 @@ def create_fides_app(
     for router in routers:
         fastapi_app.include_router(router)
 
+    override_generic_routers(OVERRIDING_ROUTERS, fastapi_app)
+
     if security_env == "dev":
         # This removes auth requirements for specific endpoints
         fastapi_app.dependency_overrides[verify_oauth_client_prod] = get_root_client
@@ -96,6 +101,33 @@ def create_fides_app(
     return fastapi_app
 
 
+def override_generic_routers(
+    overriding_routers: List[APIRouter], base_router: FastAPI
+) -> None:
+    """
+    Remove generic routes in favor of their more specific implementations, if available.
+
+    Every APIRoute already registered on base_router whose HTTP methods and path both
+    equal those of an APIRoute in one of the overriding_routers is removed; afterwards
+    all overriding_routers are included in base_router.
+    """
+    # (methods, path) pairs identifying the routes that must be replaced
+    overriding_signatures = [
+        (new_route.methods, new_route.path)
+        for new_router in overriding_routers
+        for new_route in new_router.routes
+        if isinstance(new_route, APIRoute)
+    ]
+
+    # Walk a snapshot of the routes from the end, so that deleting index i
+    # never shifts the indices that are still to be visited
+    for i, existing_route in reversed(list(enumerate(base_router.routes))):
+        if not isinstance(existing_route, APIRoute):
+            continue
+        # Delete at most once per existing route: a second `del` at the same index
+        # (e.g. when two overriding routes share methods and path) would remove an
+        # unrelated route that has shifted into position i, or raise IndexError
+        if (existing_route.methods, existing_route.path) in overriding_signatures:
+            logger.debug(
+                "Removing generic route: {} {}",
+                existing_route.methods,
+                existing_route.path,
+            )
+            del base_router.routes[i]
+
+    for router in overriding_routers:
+        base_router.include_router(router)
+
+
 def log_startup() -> None:
     """Log application startup and other information."""
     logger.info(f"Starting Fides - v{VERSION}")

diff --git a/src/fides/api/schemas/filter_params.py b/src/fides/api/schemas/filter_params.py
@@ -0,0 +1,14 @@
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class FilterParams(BaseModel):
+    """
+    Generic parameters for filtering queries.
+
+    Every field is optional; a field left as None (or empty) adds no filter.
+    """
+
+    # Free-text term; apply_filters_to_query matches it against the searched
+    # model's name, fides_key and id
+    search: Optional[str] = None
+    # Taxonomy filters; apply_filters_to_query matches each list against the
+    # correspondingly named single or collection field of the taxonomy model
+    data_uses: Optional[List[str]] = None
+    data_categories: Optional[List[str]] = None
+    data_subjects: Optional[List[str]] = None
diff --git a/src/fides/api/util/filter_utils.py b/src/fides/api/util/filter_utils.py
@@ -0,0 +1,123 @@
+from typing import List, Optional, Type
+
+from sqlalchemy import and_, func, or_
+from sqlalchemy.sql.elements import BooleanClauseList
+from sqlalchemy.sql.selectable import Select
+
+from fides.api.models.sql_models import FidesBase  # type: ignore[attr-defined]
+from fides.api.schemas.filter_params import FilterParams
+
+
+class MissingTaxonomyField(ValueError):
+    """Raised when a taxonomy filter is given for a model lacking the matching field."""
+
+
+# FIXME: this code is basically the same as the one in filter_datamap_query
+# in the fidesplus repo, but slightly more generic. Ideally we want to replace that with using this
+# so we don't duplicate this logic in two different places
+def apply_filters_to_query(
+    query: Select,
+    filter_params: FilterParams,
+    search_model: Type[FidesBase],  # Model to search on
+    taxonomy_model: Optional[
+        Type[FidesBase]
+    ],  # Model that has the taxonomy fields to filter on
+) -> Select:
+    """
+    Filter the given query by the given filter params and return the filtered query.
+
+    The search term is applied to the search_model: a row matches when its name
+    contains the term (case-insensitively) or its fides_key or id equals the term.
+    Taxonomy filters (data_categories, data_subjects, data_uses) are applied to the
+    taxonomy_model if one is provided. For each filter, a row matches when any of the
+    given values is a prefix of the model's single-valued field or is contained in its
+    collection field; rows must match every filter that was provided.
+    The search_model and taxonomy_model may be the same model, e.g if the lookup is on
+    one table, or may be different, e.g if the query is performing a join between
+    two tables.
+
+    User-supplied values are matched literally: the LIKE wildcards "%" and "_"
+    occurring in them are escaped instead of being interpreted as wildcards.
+
+    Raises MissingTaxonomyField if a taxonomy filter is provided but the taxonomy_model
+    has neither the single nor the collection field for it.
+    """
+
+    # Perform a text search on the search_model's name, fides_key and id
+    if filter_params.search:
+        query = query.where(
+            or_(
+                # autoescape makes "%" and "_" in the term match themselves only
+                func.lower(search_model.name).contains(
+                    filter_params.search.lower(), autoescape=True
+                ),
+                search_model.fides_key == filter_params.search,
+                search_model.id == filter_params.search,
+            )
+        )
+
+    if not taxonomy_model:
+        return query
+
+    # We match the name of the field in FilterParams to the name of the field in the taxonomy_model,
+    # which can be represented by either a single element field or a collection field
+    taxonomy_field_information = {
+        "data_categories": {
+            "single": "data_category",
+            "collection": "data_categories",
+        },
+        "data_subjects": {
+            "single": "data_subject",
+            "collection": "data_subjects",
+        },
+        "data_uses": {
+            "single": "data_use",
+            "collection": "data_uses",
+        },
+    }
+
+    taxonomy_filter_conditions: List[BooleanClauseList] = []
+
+    for field, field_info in taxonomy_field_information.items():
+        # Only use the fields that have been provided in the filter params
+        requested_elements = getattr(filter_params, field)
+        if not requested_elements:
+            continue
+
+        single_field_name = field_info["single"]
+        collection_field_name = field_info["collection"]
+        has_single_field = hasattr(taxonomy_model, single_field_name)
+        has_collection_field = hasattr(taxonomy_model, collection_field_name)
+
+        # If the taxonomy_model doesn't have either a single or collection field matching this field
+        # we raise an error since it makes no sense to pass in the field as part of the filter params
+        if not has_single_field and not has_collection_field:
+            raise MissingTaxonomyField(
+                f"Model {taxonomy_model.__name__} does not have a {single_field_name} or {collection_field_name} field, but filter_params.{field} is not empty"
+            )
+
+        single_field_conditions = []
+        collection_field_conditions = []
+
+        # For single fields, the model field is a single element (e.g a single data
+        # category represented as a string), so we match rows whose value starts with
+        # any of the provided elements; wildcards in the elements are escaped
+        if has_single_field:
+            single_column = getattr(taxonomy_model, single_field_name)
+            single_field_conditions = [
+                single_column.startswith(element, autoescape=True)
+                for element in requested_elements
+            ]
+
+        # For collection fields, we match each element provided in the filter params field
+        # against the field in the taxonomy model using contains, since model field is
+        # a collection of elements, e.g a list of data categories
+        if has_collection_field:
+            collection_column = getattr(taxonomy_model, collection_field_name)
+            collection_field_conditions = [
+                collection_column.contains([element])
+                for element in requested_elements
+            ]
+
+        # We join all conditions with an OR, so that we retrieve rows that match
+        # either in their single or collection fields
+        taxonomy_filter_conditions.append(
+            or_(*single_field_conditions, *collection_field_conditions)
+        )
+
+    # Finally, we filter the query for taxonomy_model instances that match all the conditions
+    if taxonomy_filter_conditions:
+        query = query.where(and_(*taxonomy_filter_conditions))
+
+    return query