From d9ee493796d5a810688776b36b1641c85a89cd09 Mon Sep 17 00:00:00 2001 From: Benjamin Adams Date: Mon, 16 Sep 2024 11:50:12 -0400 Subject: [PATCH] WIP commit, break apart --- compliance_checker/cf/cf_1_6.py | 5 +- compliance_checker/cf/cf_1_8.py | 6 +- compliance_checker/cf/cf_1_9.py | 104 ++++++++++++------ compliance_checker/cf/util.py | 88 +++++++++++++-- compliance_checker/cfutil.py | 41 +++---- compliance_checker/tests/test_cf.py | 7 +- .../tests/test_feature_detection.py | 2 +- 7 files changed, 169 insertions(+), 84 deletions(-) diff --git a/compliance_checker/cf/cf_1_6.py b/compliance_checker/cf/cf_1_6.py index d7fa6906..379ad565 100644 --- a/compliance_checker/cf/cf_1_6.py +++ b/compliance_checker/cf/cf_1_6.py @@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds): # IS THIS EVEN NEEDED ANYMORE? # *************** def check_grid_coordinates(self, ds): - # def _check_grid_coordinates(self, ds): """ 5.6 When the coordinate variables for a horizontal grid are not longitude and latitude, it is required that the true latitude and @@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds): latitudes = cfutil.get_true_latitude_variables(ds) longitudes = cfutil.get_true_longitude_variables(ds) - check_featues = [ + check_features = [ "2d-regular-grid", "2d-static-grid", "3d-regular-grid", @@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds): # dimensions dimensions = set(ds.variables[variable].dimensions) # If it's not a grid, skip it - if cfutil.guess_feature_type(ds, variable) not in check_featues: + if cfutil.guess_feature_type(ds, variable) not in check_features: continue has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"]) diff --git a/compliance_checker/cf/cf_1_8.py b/compliance_checker/cf/cf_1_8.py index 6d51ba66..8f262db7 100644 --- a/compliance_checker/cf/cf_1_8.py +++ b/compliance_checker/cf/cf_1_8.py @@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset): results.append(geom_valid.to_result()) continue - node_count = reference_attr_variables( + node_count, node_count_errors = reference_attr_variables( ds, getattr(geometry_var, "node_count", None), ) # multipart lines and polygons only - part_node_count = reference_attr_variables( + part_node_count, part_node_count_errors = reference_attr_variables( ds, getattr(geometry_var, "part_node_count", None), ) # polygons with interior geometry only - interior_ring = reference_attr_variables( + interior_ring, interior_ring_errors = reference_attr_variables( ds, getattr(geometry_var, "interior_ring", None), ) diff --git a/compliance_checker/cf/cf_1_9.py b/compliance_checker/cf/cf_1_9.py index 73211e44..d03d5740 100644 --- a/compliance_checker/cf/cf_1_9.py +++ b/compliance_checker/cf/cf_1_9.py @@ -112,25 +112,46 @@ def check_domain_variables(self, ds: Dataset): var for var in ds.get_variables_by_attributes( coordinates=lambda c: c is not None, - ) - # IMPLICIT CONFORMANCE REQUIRED 1/4 + )): # all variables have dimensions attribute, but should be scalar - if var.dimensions == () - ): + if not ds.dimensions == (): + continue + + # IMPLICIT CONFORMANCE REQUIRED 1/4 + # Has a dimensions *NetCDF* attribute + try: + dim_nc_attr = var.getncattr(dimensions) + # most variables are unlikely to be domain variables, so don't treat this + # as a failure + except AttributeError: + continue + # IMPLICIT CONFORMANCE REQUIRED 2/4 + # Aforementioned dimensions attribute is comprised of space separated + # dimension names which must exist in the file domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"]) - domain_valid.out_of += 1 - domain_coord_vars = reference_attr_variables( + domain_valid.out_of += 2 + _, dim_errors = reference_attr_variables( ds, - domain_var.coordinates, + domain_var.getncattr("dimensions"), " ", + "dimensions" ) - errors = [ - maybe_error.name - for maybe_error in domain_coord_vars - if isinstance(maybe_error, VariableReferenceError) - ] - if errors: - errors_str = ", ".join(errors) + if dim_errors: + errors_str = ", ".join(dim_errors) + domain_valid.messages.append( + "Could not find the following " + "dimensions referenced in " + "dimensions attribute from " + "domain variable " + f"{domain_var.name}: {errors_str}", + ) + else: + domain_valid.score += 1 + domain_coord_vars, domain_coord_var_errors = reference_attr_variables( + ds, domain_var.coordinates, " " + ) + if domain_coord_var_errors: + errors_str = ", ".join(var_errors) domain_valid.messages.append( "Could not find the following " "variables referenced in " @@ -138,33 +159,46 @@ def check_domain_variables(self, ds: Dataset): "domain variable " f"{domain_var.name}: {errors_str}", ) + else: + domain_valid.score += 1 + coord_var_dim_failures = [] + is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", "")) + if is_ragged_array_repr: + for var in domain_coord_vars: + pass else: - long_name = getattr(domain_var, "long_name", None) - if long_name is None or not isinstance(long_name, str): - domain_valid.messages.append( - f"For domain variable {domain_var.name} " - f"it is recommended that attribute long_name be present and a string", - ) - results.append(domain_valid.to_result()) - continue - appendix_a_not_recommended_attrs = [] - for attr_name in domain_var.ncattrs(): - if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]: - appendix_a_not_recommended_attrs.append(attr_name) - - if appendix_a_not_recommended_attrs: - domain_valid.messages.append( - f"The following attributes appear in variable {domain_var.name} " - "and CF Appendix A, but are not for use in domain variables: " - f"{appendix_a_not_recommended_attrs}", - ) + for var in domain_coord_vars: + pass - # no errors occurred - domain_valid.score += 1 + # not in conformance docs, but mentioned as recommended anyways + long_name = getattr(domain_var, "long_name", None) + if long_name is None or not isinstance(long_name, str): + domain_valid.messages.append( + f"For domain variable {domain_var.name} " + f"it is recommended that attribute long_name be present and a string", + ) + results.append(domain_valid.to_result()) + continue + appendix_a_not_recommended_attrs = [] + for attr_name in domain_var.ncattrs(): + if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]: + appendix_a_not_recommended_attrs.append(attr_name) + + if appendix_a_not_recommended_attrs: + domain_valid.messages.append( + f"The following attributes appear in variable {domain_var.name} " + "and CF Appendix A, but are not for use in domain variables: " + f"{appendix_a_not_recommended_attrs}", + ) + + # no errors occurred + domain_valid.score += 1 # IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4 + # variables named by domain variable's cell_measures attributes must themselves be a subset + # of dimensions named by domain variable's dimensions NetCDF attribute if hasattr(domain_var, "cell_measures"): cell_measures_var_names = regex.findall(r"\b(?:area|volume):\s+(\w+)", domain_var.cell_measures) # check exist diff --git a/compliance_checker/cf/util.py b/compliance_checker/cf/util.py index 591a7b3e..9154a29a 100644 --- a/compliance_checker/cf/util.py +++ b/compliance_checker/cf/util.py @@ -3,11 +3,16 @@ import sys from pkgutil import get_data +from functools import lru_cache import requests from cf_units import Unit from importlib_resources import files from lxml import etree -from netCDF4 import Dataset +from netCDF4 import Variable, Dimension, Dataset, Group + +import posixpath + +from typing import Tuple, Union # copied from paegan # paegan may depend on these later @@ -444,25 +449,84 @@ def string_from_var_type(variable): ) +def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...]: + """ + Return dimensions if non-char variable, or return variable dimensions + without trailing dimension if char variable, treating it as a label variable. + """ + if variable.kind != "C" or len(variable.dimensions) == 0: + return variable.dimensions + + return variable.dimensions[:-1] + +@lru_cache() +def maybe_lateral_reference_variable_or_dimension(group: Union[Group, Dataset], + name: str, + reference_type: Union[Variable, Dimension]): + + def can_lateral_search(name): + return (not name.startswith(".") and posixpath.split(name)[0] == "") + + if reference_type == "variable": + # first try to fetch any + # can't set to None with .get + try: + maybe_var = group[name] + except IndexError: + maybe_var = None + else: + if isinstance(maybe_var, Variable): + return maybe_var + + # alphanumeric string by itself, not a relative or absolute + # search by proximity + if (posixpath.split(name)[0] == "" and + not (name.startswith(".") or name.startswith("/"))): + group_traverse = group + while group_traverse.parent: + group_traverse = group_traverse.parent + check_target = posixpath.join(group_traverse.path, name) + try: + maybe_var = group_traverse[name] + except IndexError: + maybe_var = None + else: + if isinstance(maybe_var, Variable): + return maybe_var + else: + return VariableReferenceError(name) + + # can't find path relative to current group or absolute path + # perform lateral search if we aren't in the root group + + + + def reference_attr_variables( dataset: Dataset, attributes_string: str, split_by: str = None, + reference_type: str = "variable", + group: Union[Group, Dataset] = None ): """ Attempts to reference variables in the string, optionally splitting by a string """ + references, errors = [], [] if attributes_string is None: return None - elif split_by is None: - return dataset.variables.get( - attributes_string, - VariableReferenceError(attributes_string), - ) - else: - string_proc = attributes_string.split(split_by) - return [ - dataset.variables.get(var_name, VariableReferenceError(var_name)) - for var_name in string_proc - ] + elif reference_type == "variable": + if split_by is None: + return dataset.variables.get( + attributes_string, + VariableReferenceError(attributes_string), + ) + else: + string_proc = attributes_string.split(split_by) + for var_name in string_proc: + if var_name in dataset.variables: + references.append(dataset.variables[var_name]) + else: + errors.append(VariableReferenceError(var_name)) + return references, errors diff --git a/compliance_checker/cfutil.py b/compliance_checker/cfutil.py index 8bcce331..5745f7f3 100644 --- a/compliance_checker/cfutil.py +++ b/compliance_checker/cfutil.py @@ -241,6 +241,7 @@ def is_geophysical(nc, variable): return True +@lru_cache() def get_coordinate_variables(nc): """ Returns a list of variable names that identify as coordinate variables. @@ -260,6 +261,7 @@ def get_coordinate_variables(nc): coord_vars = [] for dimension in nc.dimensions: if dimension in nc.variables: + # TODO: Handle string coordinate variables if nc.variables[dimension].dimensions == (dimension,): coord_vars.append(dimension) return coord_vars @@ -491,11 +493,6 @@ def get_latitude_variables(nc): for variable in nc.get_variables_by_attributes(standard_name="latitude"): latitude_variables.append(variable.name) - # Then axis - for variable in nc.get_variables_by_attributes(axis="Y"): - if variable.name not in latitude_variables: - latitude_variables.append(variable.name) - check_fn = partial( attr_membership, value_set=VALID_LAT_UNITS, @@ -557,11 +554,6 @@ def get_longitude_variables(nc): for variable in nc.get_variables_by_attributes(standard_name="longitude"): longitude_variables.append(variable.name) - # Then axis - for variable in nc.get_variables_by_attributes(axis="X"): - if variable.name not in longitude_variables: - longitude_variables.append(variable.name) - check_fn = partial( attr_membership, value_set=VALID_LON_UNITS, @@ -905,29 +897,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str): array structure. See inline comments. """ + # regardless of if compound type or not, must have a cf_role + # variable; if compound, this will be the first part of the + # feature_type as we'll have to search for one with profile_id + # regardless; if single feature type, cf_role must match that + # featureType + cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None) is_compound = False if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}: is_compound = True ftype = feature_type.lower().split("profile")[0] + if len(cf_role_vars) > 2: + return False else: ftype = feature_type.lower() + if len(cf_role_vars) > 1: + return False - # regardless of if compound type or not, must have a cf_role - # variable; if compound, this will be the first part of the - # feature_type as we'll have to search for one with profile_id - # regardless; if single feature type, cf_role must match that - # featureType - cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None) - if ( - not cf_role_vars - or (len(cf_role_vars) > 1 and not is_compound) - or (len(cf_role_vars) > 2 and is_compound) - ): - return False cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0] - if ( - cf_role_var.cf_role.split("_id")[0].lower() != ftype - ): # if cf_role_var returns None, this should raise an error? + # if cf_role_var returns None, this should raise an error? + if cf_role_var.cf_role.split("_id")[0].lower() != ftype: return False # now we'll check dimensions for singular feature types and/or @@ -936,7 +925,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str): if len(instance_dim) != 1: return False - # Wow we check for the presence of an index variable or count variable; + # Now we check for the presence of an index variable or count variable; # NOTE that if no index or count variables exist, we can't determine with # certainty that this is invalid, because single-instance data sets # are valid representations of the ragged array structures. Instead, diff --git a/compliance_checker/tests/test_cf.py b/compliance_checker/tests/test_cf.py index dce2bfa4..4bc89bc9 100644 --- a/compliance_checker/tests/test_cf.py +++ b/compliance_checker/tests/test_cf.py @@ -949,7 +949,6 @@ def test_latitude(self): dataset = self.load_dataset(STATIC_FILES["bad"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 12 assert scored < out_of assert len([r for r in results if r.value[0] < r.value[1]]) == 3 assert (r.name == "§4.1 Latitude Coordinate" for r in results) @@ -958,7 +957,7 @@ def test_latitude(self): dataset = self.load_dataset(STATIC_FILES["rotated_pole_grid"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 6 + assert len(results) == 3 assert scored == out_of assert (r.name == "§4.1 Latitude Coordinate" for r in results) @@ -991,7 +990,6 @@ def test_longitude(self): dataset = self.load_dataset(STATIC_FILES["bad"]) results = self.cf.check_longitude(dataset) scored, out_of, messages = get_results(results) - assert len(results) == 12 assert scored < out_of assert len([r for r in results if r.value[0] < r.value[1]]) == 3 assert all(r.name == "§4.2 Longitude Coordinate" for r in results) @@ -1000,7 +998,7 @@ def test_longitude(self): dataset = self.load_dataset(STATIC_FILES["rotated_pole_grid"]) results = self.cf.check_latitude(dataset) scored, out_of, messages = get_results(results) - assert (scored, out_of) == (6, 6) + assert (scored, out_of) == (3, 3) # hack to avoid writing to read-only file dataset.variables["rlon"] = MockVariable(dataset.variables["rlon"]) rlon = dataset.variables["rlon"] @@ -3216,6 +3214,7 @@ def test_domain(self): domain_var = dataset.createVariable("domain", "c", ()) domain_var.long_name = "Domain variable" domain_var.coordinates = "lon lat depth" + domain_var.setncattr("dimensions", "time") results = self.cf.check_domain_variables(dataset) self.assertEqual(results[0].value[0], results[0].value[1]) self.assertFalse(results[0].msgs) diff --git a/compliance_checker/tests/test_feature_detection.py b/compliance_checker/tests/test_feature_detection.py index 11f88fdb..8f3d8769 100644 --- a/compliance_checker/tests/test_feature_detection.py +++ b/compliance_checker/tests/test_feature_detection.py @@ -300,7 +300,7 @@ def test_forecast_reference_metadata(self): def test_rotated_pole_grid(self): with Dataset(resources.STATIC_FILES["rotated_pole_grid"]) as nc: latitudes = util.get_latitude_variables(nc) - assert latitudes == ["lat", "rlat"] + assert latitudes == ["lat"] assert util.is_mapped_grid(nc, "temperature") is True def test_vertical_coords(self):