Skip to content

Commit

Permalink
WIP commit, break apart
Browse files Browse the repository at this point in the history
  • Loading branch information
benjwadams committed Sep 16, 2024
1 parent e3486ad commit d9ee493
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 84 deletions.
5 changes: 2 additions & 3 deletions compliance_checker/cf/cf_1_6.py
Original file line number Diff line number Diff line change
Expand Up @@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds):
# IS THIS EVEN NEEDED ANYMORE?
# ***************
def check_grid_coordinates(self, ds):
# def _check_grid_coordinates(self, ds):
"""
5.6 When the coordinate variables for a horizontal grid are not
longitude and latitude, it is required that the true latitude and
Expand All @@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds):
latitudes = cfutil.get_true_latitude_variables(ds)
longitudes = cfutil.get_true_longitude_variables(ds)

check_featues = [
check_features = [
"2d-regular-grid",
"2d-static-grid",
"3d-regular-grid",
Expand All @@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds):
# dimensions
dimensions = set(ds.variables[variable].dimensions)
# If it's not a grid, skip it
if cfutil.guess_feature_type(ds, variable) not in check_featues:
if cfutil.guess_feature_type(ds, variable) not in check_features:
continue
has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])

Expand Down
6 changes: 3 additions & 3 deletions compliance_checker/cf/cf_1_8.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset):
results.append(geom_valid.to_result())
continue

node_count = reference_attr_variables(
node_count, node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "node_count", None),
)
# multipart lines and polygons only
part_node_count = reference_attr_variables(
part_node_count, part_node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "part_node_count", None),
)
# polygons with interior geometry only
interior_ring = reference_attr_variables(
interior_ring, interior_ring_errors = reference_attr_variables(
ds,
getattr(geometry_var, "interior_ring", None),
)
Expand Down
104 changes: 69 additions & 35 deletions compliance_checker/cf/cf_1_9.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,59 +112,93 @@ def check_domain_variables(self, ds: Dataset):
var
for var in ds.get_variables_by_attributes(
coordinates=lambda c: c is not None,
)
# IMPLICIT CONFORMANCE REQUIRED 1/4
)):
# all variables have dimensions attribute, but should be scalar
if var.dimensions == ()
):
if not ds.dimensions == ():
continue

# IMPLICIT CONFORMANCE REQUIRED 1/4
# Has a dimensions *NetCDF* attribute
try:
dim_nc_attr = var.getncattr(dimensions)
# most variables are unlikely to be domain variables, so don't treat this
# as a failure
except AttributeError:
continue
# IMPLICIT CONFORMANCE REQUIRED 2/4
# Aforementioned dimensions attribute is comprised of space separated
# dimension names which must exist in the file
domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"])
domain_valid.out_of += 1
domain_coord_vars = reference_attr_variables(
domain_valid.out_of += 2
_, dim_errors = reference_attr_variables(
ds,
domain_var.coordinates,
domain_var.getncattr("dimensions"),
" ",
"dimensions"
)
errors = [
maybe_error.name
for maybe_error in domain_coord_vars
if isinstance(maybe_error, VariableReferenceError)
]
if errors:
errors_str = ", ".join(errors)
if dim_errors:
errors_str = ", ".join(dim_errors)
domain_valid.messages.append(
"Could not find the following "
"dimensions referenced in "
"dimensions attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1
domain_coord_vars, domain_coord_var_errors = reference_attr_variables(
ds, domain_var.coordinates, " "
)
if domain_coord_var_errors:
errors_str = ", ".join(var_errors)
domain_valid.messages.append(
"Could not find the following "
"variables referenced in "
"coordinates attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1

coord_var_dim_failures = []
is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", ""))
if is_ragged_array_repr:
for var in domain_coord_vars:
pass
else:
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)
for var in domain_coord_vars:
pass

# no errors occurred
domain_valid.score += 1
# not in conformance docs, but mentioned as recommended anyways
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)

# no errors occurred
domain_valid.score += 1


# IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4
# variables named by domain variable's cell_measures attributes must themselves be a subset
# of dimensions named by domain variable's dimensions NetCDF attribute
if hasattr(domain_var, "cell_measures"):
cell_measures_var_names = regex.findall(r"\b(?:area|volume):\s+(\w+)", domain_var.cell_measures)
# check exist
Expand Down
88 changes: 76 additions & 12 deletions compliance_checker/cf/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@
import sys
from pkgutil import get_data

from functools import lru_cache
import requests
from cf_units import Unit
from importlib_resources import files
from lxml import etree
from netCDF4 import Dataset
from netCDF4 import Variable, Dimension, Dataset, Group

import posixpath

from typing import Tuple, Union

# copied from paegan
# paegan may depend on these later
Expand Down Expand Up @@ -444,25 +449,84 @@ def string_from_var_type(variable):
)


def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...]:
    """
    Return dimensions if non-char variable, or return variable dimensions
    without trailing dimension if char variable, treating it as a label variable.

    :param variable: netCDF variable whose dimensions are inspected
    :return: tuple of the variable's dimensions, minus the trailing one for
        char-typed (label) variables
    """
    # NOTE(review): this reads ``variable.kind`` directly; netCDF4 Variables
    # usually expose the numpy type code via ``variable.dtype.kind`` -- confirm
    # this attribute exists on the objects passed in, and that "C" is the
    # expected code for character/label variables.
    if variable.kind != "C" or len(variable.dimensions) == 0:
        return variable.dimensions

    # Char variable: drop the trailing string-length dimension so only the
    # dimensions that index the labels remain.
    return variable.dimensions[:-1]

# NOTE(review): lru_cache keys on (group, name, reference_type); netCDF4
# Group/Dataset objects must be hashable for this to work, and cached entries
# keep the dataset alive for the cache's lifetime -- confirm this is intended.
@lru_cache()
def maybe_lateral_reference_variable_or_dimension(group: Union[Group, Dataset],
                                                  name: str,
                                                  reference_type: Union[Variable, Dimension]):
    """
    Work-in-progress resolver for a variable (or, eventually, dimension)
    reference starting from ``group``.

    Returns the resolved netCDF Variable on success, a VariableReferenceError
    when a proximity search up the group hierarchy finds nothing, and
    implicitly None on the remaining, unfinished code paths.
    """

    # True for a bare name: no leading "." and no path separators.
    # NOTE(review): currently unused -- presumably intended for the lateral
    # search described in the trailing comment below.
    def can_lateral_search(name):
        return (not name.startswith(".") and posixpath.split(name)[0] == "")

    # NOTE(review): ``reference_type`` is annotated as Union[Variable,
    # Dimension] above but compared against the string "variable" here --
    # reconcile the annotation with the actual usage.
    if reference_type == "variable":
        # first try to fetch any
        # can't set to None with .get
        try:
            maybe_var = group[name]
        except IndexError:
            maybe_var = None
        else:
            if isinstance(maybe_var, Variable):
                return maybe_var

        # alphanumeric string by itself, not a relative or absolute
        # search by proximity
        if (posixpath.split(name)[0] == "" and
                not (name.startswith(".") or name.startswith("/"))):
            group_traverse = group
            while group_traverse.parent:
                group_traverse = group_traverse.parent
                # NOTE(review): check_target is computed but never used --
                # likely meant to be looked up instead of bare ``name`` below.
                check_target = posixpath.join(group_traverse.path, name)
                try:
                    maybe_var = group_traverse[name]
                except IndexError:
                    maybe_var = None
                else:
                    if isinstance(maybe_var, Variable):
                        return maybe_var
            # NOTE(review): indentation was reconstructed from a mangled
            # source; this is read as the while-loop's ``else`` (runs when all
            # ancestor groups are exhausted without a break) -- confirm
            # against the original file.
            else:
                return VariableReferenceError(name)

    # can't find path relative to current group or absolute path
    # perform lateral search if we aren't in the root group




def reference_attr_variables(
    dataset: "Dataset",
    attributes_string: str,
    split_by: str = None,
    reference_type: str = "variable",
    group: "Union[Group, Dataset]" = None,
):
    """
    Resolve references named in an attribute string to objects in the dataset.

    :param dataset: netCDF Dataset searched for the referenced variables
    :param attributes_string: raw attribute value naming the reference(s);
        may be None when the attribute is absent
    :param split_by: separator to split ``attributes_string`` on; when None
        the whole string is treated as a single reference
    :param reference_type: kind of object referenced; only "variable" is
        implemented so far
    :param group: reserved for group-aware lookups (currently unused)
    :return: 2-tuple ``(references, errors)``.  With ``split_by`` set,
        ``references`` is a list of resolved variables; otherwise it is the
        single resolved variable, or None.  ``errors`` lists a
        VariableReferenceError for each name that could not be resolved.
    """
    references, errors = [], []
    # Absent attribute: nothing to resolve, but still return a 2-tuple so
    # callers that unpack ``refs, errs = reference_attr_variables(...)``
    # don't fail on a bare None.
    if attributes_string is None:
        return None, errors
    if reference_type == "variable":
        if split_by is None:
            maybe_var = dataset.variables.get(attributes_string)
            if maybe_var is None:
                errors.append(VariableReferenceError(attributes_string))
            return maybe_var, errors
        for var_name in attributes_string.split(split_by):
            if var_name in dataset.variables:
                references.append(dataset.variables[var_name])
            else:
                errors.append(VariableReferenceError(var_name))
        return references, errors
    # TODO(review): non-"variable" reference types (callers already pass
    # "dimensions") are not implemented yet; return an empty result rather
    # than implicitly returning None, which would break two-value unpacking.
    return references, errors
41 changes: 15 additions & 26 deletions compliance_checker/cfutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def is_geophysical(nc, variable):
return True


@lru_cache()
def get_coordinate_variables(nc):
"""
Returns a list of variable names that identify as coordinate variables.
Expand All @@ -260,6 +261,7 @@ def get_coordinate_variables(nc):
coord_vars = []
for dimension in nc.dimensions:
if dimension in nc.variables:
# TODO: Handle string coordinate variables
if nc.variables[dimension].dimensions == (dimension,):
coord_vars.append(dimension)
return coord_vars
Expand Down Expand Up @@ -491,11 +493,6 @@ def get_latitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="latitude"):
latitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="Y"):
if variable.name not in latitude_variables:
latitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LAT_UNITS,
Expand Down Expand Up @@ -557,11 +554,6 @@ def get_longitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="longitude"):
longitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="X"):
if variable.name not in longitude_variables:
longitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LON_UNITS,
Expand Down Expand Up @@ -905,29 +897,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
array structure. See inline comments.
"""

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
is_compound = False
if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}:
is_compound = True
ftype = feature_type.lower().split("profile")[0]
if len(cf_role_vars) > 2:
return False
else:
ftype = feature_type.lower()
if len(cf_role_vars) > 1:
return False

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
if (
not cf_role_vars
or (len(cf_role_vars) > 1 and not is_compound)
or (len(cf_role_vars) > 2 and is_compound)
):
return False
cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0]
if (
cf_role_var.cf_role.split("_id")[0].lower() != ftype
): # if cf_role_var returns None, this should raise an error?
# if cf_role_var returns None, this should raise an error?
if cf_role_var.cf_role.split("_id")[0].lower() != ftype:
return False

# now we'll check dimensions for singular feature types and/or
Expand All @@ -936,7 +925,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
if len(instance_dim) != 1:
return False

# Wow we check for the presence of an index variable or count variable;
# Now we check for the presence of an index variable or count variable;
# NOTE that if no index or count variables exist, we can't determine with
# certainty that this is invalid, because single-instance data sets
# are valid representations of the ragged array structures. Instead,
Expand Down
Loading

0 comments on commit d9ee493

Please sign in to comment.