Skip to content

Commit

Permalink
fix check error due to new requirement rules + add docstring
Browse files Browse the repository at this point in the history
add docstrings
  • Loading branch information
jacquesfize committed Jul 25, 2024
1 parent edcc56d commit 82db59e
Showing 1 changed file with 139 additions and 11 deletions.
150 changes: 139 additions & 11 deletions backend/geonature/core/imports/checks/dataframe/core.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict
from typing import Dict, Optional, Set

import numpy as np
import pandas as pd
Expand All @@ -7,7 +7,7 @@
from geonature.utils.env import db
from geonature.core.gn_meta.models import TDatasets

from geonature.core.imports.models import BibFields
from geonature.core.imports.models import BibFields, TImports

from .utils import dfcheck

Expand All @@ -17,18 +17,59 @@

@dfcheck
def check_required_values(df, fields: Dict[str, BibFields]):
"""
Check if required values are present in the dataframe.
Parameters
----------
df : pandas.DataFrame
The dataframe to check.
fields : Dict[str, BibFields]
Dictionary of fields to check.
Yields
------
dict
Dictionary containing the error code, the column name and the invalid rows.
Notes
-----
If a field is not mandatory and it has mandatory conditions, it will not raise an error
if any of the mandatory conditions are mapped.
If a field is mandatory and it has optional conditions, it will not raise an error
if any of the optional conditions is mapped.
If a field is mandatory and it is not mapped, it will raise an error for all the rows.
"""

for field_name, field in fields.items():
if not field.mandatory:
if field.mandatory_conditions:
are_required_field_mapped = [
fields[field_req].source_column in df
for field_req in field.mandatory_conditions
]
if not any(are_required_field_mapped):
continue
continue

if field.mandatory and field.optional_conditions:
# If a required field is optional thanks to other columns mapped
are_optional_field_mapped = [
fields[field_opt].source_column in df for field_opt in field.optional_conditions
]
if any(are_optional_field_mapped):
continue
if field.source_column not in df:
continue
# XXX lever une erreur pour toutes les lignes si le champs n’est pas mappé
# XXX raise errors for missing mandatory field from mapping?
yield {
"error_code": "MISSING_VALUE",
"column": field_name,
"invalid_rows": df,
}
# XXX lever une erreur pour toutes les lignes si le champs n’est pas mappé
# XXX raise errors for missing mandatory field from mapping?
yield {
"error_code": "MISSING_VALUE",
"column": field_name,
"invalid_rows": df,
}
invalid_rows = df[df[field.source_column].isna()]
if len(invalid_rows):
yield {
Expand All @@ -39,6 +80,25 @@ def check_required_values(df, fields: Dict[str, BibFields]):


def _check_ordering(df, min_field, max_field):
"""
Check if the values in the `min_field` are lower or equal to the values
in the `max_field` for all the rows of the dataframe `df`.
Parameters
----------
df : pandas.DataFrame
The dataframe to check.
min_field : str
The name of the column containing the minimum values.
max_field : str
The name of the column containing the maximum values.
Yields
------
dict
Dictionary containing the invalid rows.
"""
ordered = df[min_field] <= df[max_field]
ordered = ordered.fillna(False)
invalid_rows = df[~ordered & df[min_field].notna() & df[max_field].notna()]
Expand All @@ -48,7 +108,39 @@ def _check_ordering(df, min_field, max_field):


@dfcheck
def check_counts(df, count_min_field, count_max_field, default_count=None):
def check_counts(
df: pd.DataFrame, count_min_field: str, count_max_field: str, default_count: int = None
):
"""
Check if the value in the `count_min_field` is lower or equal to the value in the `count_max_field`
| count_min_field | count_max_field |
| --------------- | --------------- |
| 0 | 2 | --> ok
| 2 | 0 | --> raises an error
Parameters
----------
df : pandas.DataFrame
The dataframe to check.
count_min_field : BibField
The field containing the minimum count.
count_max_field : BibField
The field containing the maximum count.
default_count : object, optional
The default count to use if a count is missing, by default None.
Yields
------
dict
Dictionary containing the error code, the column name and the invalid rows.
Returns
------
set
Set of columns updated.
"""
count_min_col = count_min_field.dest_field
count_max_col = count_max_field.dest_field
updated_cols = {count_max_col}
Expand Down Expand Up @@ -89,7 +181,43 @@ def check_counts(df, count_min_field, count_max_field, default_count=None):


@dfcheck
def check_datasets(imprt, df, uuid_field, id_field, module_code, object_code=None):
def check_datasets(
imprt: TImports,
df: pd.DataFrame,
uuid_field: BibFields,
id_field: BibFields,
module_code: str,
object_code: Optional[str] = None,
) -> Set[str]:
"""
Check if datasets exist and are authorized for the user and import.
Parameters
----------
imprt : TImports
Import to check datasets for.
df : pd.DataFrame
Dataframe to check.
uuid_field : BibFields
Field containing dataset UUIDs.
id_field : BibFields
Field to fill with dataset IDs.
module_code : str
Module code to check datasets for.
object_code : Optional[str], optional
Object code to check datasets for, by default None.
Yields
------
dict
Dictionary containing error code, column name and invalid rows.
Returns
------
Set[str]
Set of columns updated.
"""
updated_cols = set()
uuid_col = uuid_field.dest_field
id_col = id_field.dest_field
Expand Down

0 comments on commit 82db59e

Please sign in to comment.