Skip to content

Commit

Permalink
WIP commit, break apart
Browse files Browse the repository at this point in the history
  • Loading branch information
benjwadams committed Sep 16, 2024
1 parent e3486ad commit d9ee493
Show file tree
Hide file tree
Showing 7 changed files with 169 additions and 84 deletions.
5 changes: 2 additions & 3 deletions compliance_checker/cf/cf_1_6.py
Original file line number Diff line number Diff line change
Expand Up @@ -2252,7 +2252,6 @@ def check_multi_dimensional_coords(self, ds):
# IS THIS EVEN NEEDED ANYMORE?
# ***************
def check_grid_coordinates(self, ds):
# def _check_grid_coordinates(self, ds):
"""
5.6 When the coordinate variables for a horizontal grid are not
longitude and latitude, it is required that the true latitude and
Expand All @@ -2266,7 +2265,7 @@ def check_grid_coordinates(self, ds):
latitudes = cfutil.get_true_latitude_variables(ds)
longitudes = cfutil.get_true_longitude_variables(ds)

check_featues = [
check_features = [
"2d-regular-grid",
"2d-static-grid",
"3d-regular-grid",
Expand All @@ -2287,7 +2286,7 @@ def check_grid_coordinates(self, ds):
# dimensions
dimensions = set(ds.variables[variable].dimensions)
# If it's not a grid, skip it
if cfutil.guess_feature_type(ds, variable) not in check_featues:
if cfutil.guess_feature_type(ds, variable) not in check_features:
continue
has_coords = TestCtx(BaseCheck.HIGH, self.section_titles["5.6"])

Expand Down
6 changes: 3 additions & 3 deletions compliance_checker/cf/cf_1_8.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,17 +169,17 @@ def check_geometry(self, ds: Dataset):
results.append(geom_valid.to_result())
continue

node_count = reference_attr_variables(
node_count, node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "node_count", None),
)
# multipart lines and polygons only
part_node_count = reference_attr_variables(
part_node_count, part_node_count_errors = reference_attr_variables(
ds,
getattr(geometry_var, "part_node_count", None),
)
# polygons with interior geometry only
interior_ring = reference_attr_variables(
interior_ring, interior_ring_errors = reference_attr_variables(
ds,
getattr(geometry_var, "interior_ring", None),
)
Expand Down
104 changes: 69 additions & 35 deletions compliance_checker/cf/cf_1_9.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,59 +112,93 @@ def check_domain_variables(self, ds: Dataset):
var
for var in ds.get_variables_by_attributes(
coordinates=lambda c: c is not None,
)
# IMPLICIT CONFORMANCE REQUIRED 1/4
)):
# all variables have dimensions attribute, but should be scalar
if var.dimensions == ()
):
if not ds.dimensions == ():
continue

# IMPLICIT CONFORMANCE REQUIRED 1/4
# Has a dimensions *NetCDF* attribute
try:
dim_nc_attr = var.getncattr(dimensions)
# most variables are unlikely to be domain variables, so don't treat this
# as a failure
except AttributeError:
continue
# IMPLICIT CONFORMANCE REQUIRED 2/4
# Aforementioned dimensions attribute is comprised of space separated
# dimension names which must exist in the file
domain_valid = TestCtx(BaseCheck.MEDIUM, self.section_titles["5.8"])
domain_valid.out_of += 1
domain_coord_vars = reference_attr_variables(
domain_valid.out_of += 2
_, dim_errors = reference_attr_variables(
ds,
domain_var.coordinates,
domain_var.getncattr("dimensions"),
" ",
"dimensions"
)
errors = [
maybe_error.name
for maybe_error in domain_coord_vars
if isinstance(maybe_error, VariableReferenceError)
]
if errors:
errors_str = ", ".join(errors)
if dim_errors:
errors_str = ", ".join(dim_errors)
domain_valid.messages.append(
"Could not find the following "
"dimensions referenced in "
"dimensions attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1
domain_coord_vars, domain_coord_var_errors = reference_attr_variables(
ds, domain_var.coordinates, " "
)
if domain_coord_var_errors:
errors_str = ", ".join(var_errors)
domain_valid.messages.append(
"Could not find the following "
"variables referenced in "
"coordinates attribute from "
"domain variable "
f"{domain_var.name}: {errors_str}",
)
else:
domain_valid.score += 1

coord_var_dim_failures = []
is_ragged_array_repr = is_dataset_valid_ragged_array_repr_featureType(ds, getattr(ds, "featureType", ""))
if is_ragged_array_repr:
for var in domain_coord_vars:
pass
else:
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)
for var in domain_coord_vars:
pass

# no errors occurred
domain_valid.score += 1
# not in conformance docs, but mentioned as recommended anyways
long_name = getattr(domain_var, "long_name", None)
if long_name is None or not isinstance(long_name, str):
domain_valid.messages.append(
f"For domain variable {domain_var.name} "
f"it is recommended that attribute long_name be present and a string",
)
results.append(domain_valid.to_result())
continue
appendix_a_not_recommended_attrs = []
for attr_name in domain_var.ncattrs():
if attr_name in self.appendix_a and "D" not in self.appendix_a[attr_name]["attr_loc"]:
appendix_a_not_recommended_attrs.append(attr_name)

if appendix_a_not_recommended_attrs:
domain_valid.messages.append(
f"The following attributes appear in variable {domain_var.name} "
"and CF Appendix A, but are not for use in domain variables: "
f"{appendix_a_not_recommended_attrs}",
)

# no errors occurred
domain_valid.score += 1


# IMPLEMENTATION CONFORMANCE 5.8 REQUIRED 4/4
# variables named by domain variable's cell_measures attributes must themselves be a subset
# of dimensions named by domain variable's dimensions NetCDF attribute
if hasattr(domain_var, "cell_measures"):
cell_measures_var_names = regex.findall(r"\b(?:area|volume):\s+(\w+)", domain_var.cell_measures)
# check exist
Expand Down
88 changes: 76 additions & 12 deletions compliance_checker/cf/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@
import sys
from pkgutil import get_data

from functools import lru_cache
import requests
from cf_units import Unit
from importlib_resources import files
from lxml import etree
from netCDF4 import Dataset
from netCDF4 import Variable, Dimension, Dataset, Group

import posixpath

from typing import Tuple, Union

# copied from paegan
# paegan may depend on these later
Expand Down Expand Up @@ -444,25 +449,84 @@ def string_from_var_type(variable):
)


def get_possible_label_variable_dimensions(variable: Variable) -> Tuple[int, ...]:
    """
    Return dimensions if non-char variable, or return variable dimensions
    without trailing dimension if char variable, treating it as a label variable.

    :param variable: netCDF variable whose dimensions are inspected
    :return: tuple of the variable's dimensions, minus the trailing one for
        char-typed (label) variables
    """
    # NOTE(review): this reads ``variable.kind`` directly; netCDF4 Variables
    # usually expose the numpy type code via ``variable.dtype.kind`` -- confirm
    # this attribute exists on the objects passed in, and that "C" is the
    # expected code for character/label variables.
    if variable.kind != "C" or len(variable.dimensions) == 0:
        return variable.dimensions

    # Char variable: drop the trailing string-length dimension so only the
    # dimensions that index the labels remain.
    return variable.dimensions[:-1]

# NOTE(review): lru_cache keys on (group, name, reference_type); netCDF4
# Group/Dataset objects must be hashable for this to work, and cached entries
# keep the dataset alive for the cache's lifetime -- confirm this is intended.
@lru_cache()
def maybe_lateral_reference_variable_or_dimension(group: Union[Group, Dataset],
                                                  name: str,
                                                  reference_type: Union[Variable, Dimension]):
    """
    Work-in-progress resolver for a variable (or, eventually, dimension)
    reference starting from ``group``.

    Returns the resolved netCDF Variable on success, a VariableReferenceError
    when a proximity search up the group hierarchy finds nothing, and
    implicitly None on the remaining, unfinished code paths.
    """

    # True for a bare name: no leading "." and no path separators.
    # NOTE(review): currently unused -- presumably intended for the lateral
    # search described in the trailing comment below.
    def can_lateral_search(name):
        return (not name.startswith(".") and posixpath.split(name)[0] == "")

    # NOTE(review): ``reference_type`` is annotated as Union[Variable,
    # Dimension] above but compared against the string "variable" here --
    # reconcile the annotation with the actual usage.
    if reference_type == "variable":
        # first try to fetch any
        # can't set to None with .get
        try:
            maybe_var = group[name]
        except IndexError:
            maybe_var = None
        else:
            if isinstance(maybe_var, Variable):
                return maybe_var

        # alphanumeric string by itself, not a relative or absolute
        # search by proximity
        if (posixpath.split(name)[0] == "" and
                not (name.startswith(".") or name.startswith("/"))):
            group_traverse = group
            while group_traverse.parent:
                group_traverse = group_traverse.parent
                # NOTE(review): check_target is computed but never used --
                # likely meant to be looked up instead of bare ``name`` below.
                check_target = posixpath.join(group_traverse.path, name)
                try:
                    maybe_var = group_traverse[name]
                except IndexError:
                    maybe_var = None
                else:
                    if isinstance(maybe_var, Variable):
                        return maybe_var
            # NOTE(review): indentation was reconstructed from a mangled
            # source; this is read as the while-loop's ``else`` (runs when all
            # ancestor groups are exhausted without a break) -- confirm
            # against the original file.
            else:
                return VariableReferenceError(name)

    # can't find path relative to current group or absolute path
    # perform lateral search if we aren't in the root group




def reference_attr_variables(
    dataset: "Dataset",
    attributes_string: str,
    split_by: str = None,
    reference_type: str = "variable",
    group: "Union[Group, Dataset]" = None,
):
    """
    Resolve references named in an attribute string to objects in the dataset.

    :param dataset: netCDF Dataset searched for the referenced variables
    :param attributes_string: raw attribute value naming the reference(s);
        may be None when the attribute is absent
    :param split_by: separator to split ``attributes_string`` on; when None
        the whole string is treated as a single reference
    :param reference_type: kind of object referenced; only "variable" is
        implemented so far
    :param group: reserved for group-aware lookups (currently unused)
    :return: 2-tuple ``(references, errors)``.  With ``split_by`` set,
        ``references`` is a list of resolved variables; otherwise it is the
        single resolved variable, or None.  ``errors`` lists a
        VariableReferenceError for each name that could not be resolved.
    """
    references, errors = [], []
    # Absent attribute: nothing to resolve, but still return a 2-tuple so
    # callers that unpack ``refs, errs = reference_attr_variables(...)``
    # don't fail on a bare None.
    if attributes_string is None:
        return None, errors
    if reference_type == "variable":
        if split_by is None:
            maybe_var = dataset.variables.get(attributes_string)
            if maybe_var is None:
                errors.append(VariableReferenceError(attributes_string))
            return maybe_var, errors
        for var_name in attributes_string.split(split_by):
            if var_name in dataset.variables:
                references.append(dataset.variables[var_name])
            else:
                errors.append(VariableReferenceError(var_name))
        return references, errors
    # TODO(review): non-"variable" reference types (callers already pass
    # "dimensions") are not implemented yet; return an empty result rather
    # than implicitly returning None, which would break two-value unpacking.
    return references, errors
41 changes: 15 additions & 26 deletions compliance_checker/cfutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def is_geophysical(nc, variable):
return True


@lru_cache()
def get_coordinate_variables(nc):
"""
Returns a list of variable names that identify as coordinate variables.
Expand All @@ -260,6 +261,7 @@ def get_coordinate_variables(nc):
coord_vars = []
for dimension in nc.dimensions:
if dimension in nc.variables:
# TODO: Handle string coordinate variables
if nc.variables[dimension].dimensions == (dimension,):
coord_vars.append(dimension)
return coord_vars
Expand Down Expand Up @@ -491,11 +493,6 @@ def get_latitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="latitude"):
latitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="Y"):
if variable.name not in latitude_variables:
latitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LAT_UNITS,
Expand Down Expand Up @@ -557,11 +554,6 @@ def get_longitude_variables(nc):
for variable in nc.get_variables_by_attributes(standard_name="longitude"):
longitude_variables.append(variable.name)

# Then axis
for variable in nc.get_variables_by_attributes(axis="X"):
if variable.name not in longitude_variables:
longitude_variables.append(variable.name)

check_fn = partial(
attr_membership,
value_set=VALID_LON_UNITS,
Expand Down Expand Up @@ -905,29 +897,26 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
array structure. See inline comments.
"""

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
is_compound = False
if feature_type.lower() in {"timeseriesprofile", "trajectoryprofile"}:
is_compound = True
ftype = feature_type.lower().split("profile")[0]
if len(cf_role_vars) > 2:
return False
else:
ftype = feature_type.lower()
if len(cf_role_vars) > 1:
return False

# regardless of if compound type or not, must have a cf_role
# variable; if compound, this will be the first part of the
# feature_type as we'll have to search for one with profile_id
# regardless; if single feature type, cf_role must match that
# featureType
cf_role_vars = nc.get_variables_by_attributes(cf_role=lambda x: x is not None)
if (
not cf_role_vars
or (len(cf_role_vars) > 1 and not is_compound)
or (len(cf_role_vars) > 2 and is_compound)
):
return False
cf_role_var = nc.get_variables_by_attributes(cf_role=f"{ftype}_id")[0]
if (
cf_role_var.cf_role.split("_id")[0].lower() != ftype
): # if cf_role_var returns None, this should raise an error?
# if cf_role_var returns None, this should raise an error?
if cf_role_var.cf_role.split("_id")[0].lower() != ftype:
return False

# now we'll check dimensions for singular feature types and/or
Expand All @@ -936,7 +925,7 @@ def is_dataset_valid_ragged_array_repr_featureType(nc, feature_type: str):
if len(instance_dim) != 1:
return False

# Wow we check for the presence of an index variable or count variable;
# Now we check for the presence of an index variable or count variable;
# NOTE that if no index or count variables exist, we can't determine with
# certainty that this is invalid, because single-instance data sets
# are valid representations of the ragged array structures. Instead,
Expand Down
Loading

0 comments on commit d9ee493

Please sign in to comment.