From 64024b12f84dae02808556e6530d301f2a5fe28a Mon Sep 17 00:00:00 2001
From: John Sirois <john.sirois@gmail.com>
Date: Thu, 28 Sep 2023 17:27:07 -0700
Subject: [PATCH] Offer suggestions for lift manifest typos.

Also make the array indexing basis clear when array indexes are used.
---
 CHANGES.md                                 |  5 ++
 RELEASE.md                                 |  2 +-
 science/__init__.py                        |  2 +-
 science/config.py                          | 99 ++++++++++++++++++++--
 science/data.py                            | 56 ++++++++++--
 tests/data/unrecognized-config-fields.toml |  2 +-
 tests/test_config.py                       | 16 ++--
 7 files changed, 157 insertions(+), 25 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 2868ee6..b5de8c5 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,5 +1,10 @@
 # Release Notes
 
+## 0.3.0
+
+The unrecognized lift manifest configuration data error message now includes suggestions when the
+unrecognized configuration is likely a typo.
+
 ## 0.2.2
 
 Unrecognized lift manifest configuration data now generates an informative error instead of
diff --git a/RELEASE.md b/RELEASE.md
index a411475..9596005 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -5,7 +5,7 @@
 ### Version Bump and Changelog
 
 1. Bump the version in at [`science/__init__.py`](science/__init__.py).
-2. Run `nox && nox -epackage` as a sanity check on the state of the project.
+2. Run `nox && nox -e linkcheck package` as a sanity check on the state of the project.
 3. Update [`CHANGES.md`](CHANGES.md) with any changes that are likely to be useful to consumers.
 4. Open a PR with these changes and land it on https://github.com/a-scie/lift main.
 
diff --git a/science/__init__.py b/science/__init__.py
index eb2b358..e22357c 100644
--- a/science/__init__.py
+++ b/science/__init__.py
@@ -3,6 +3,6 @@
 
 from packaging.version import Version
 
-__version__ = "0.2.2"
+__version__ = "0.3.0"
 
 VERSION = Version(__version__)
diff --git a/science/config.py b/science/config.py
index c394cc4..3c2eb32 100644
--- a/science/config.py
+++ b/science/config.py
@@ -3,18 +3,22 @@
 
 from __future__ import annotations
 
+import dataclasses
+import difflib
 import tomllib
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
 from dataclasses import dataclass
+from functools import cache
 from io import BytesIO
 from pathlib import Path
 from textwrap import dedent
-from typing import BinaryIO
+from typing import BinaryIO, DefaultDict, Generic, Iterator, TypeVar
 
 from science.build_info import BuildInfo
-from science.data import Data
+from science.data import Accessor, Data
 from science.dataclass import Dataclass
 from science.dataclass.deserializer import parse as parse_dataclass
+from science.dataclass.reflect import FieldInfo, dataclass_info
 from science.doc import DOC_SITE_URL
 from science.errors import InputError
 from science.frozendict import FrozenDict
@@ -74,6 +78,70 @@ class InterpreterGroupFields(Dataclass):
     members: tuple[str, ...]
 
 
+def _iter_field_info(datatype: type[Dataclass]) -> Iterator[FieldInfo]:
+    for field_info in dataclass_info(datatype).field_info:
+        if not field_info.hidden and not field_info.inline:
+            yield field_info
+        elif field_info.inline and (dt := field_info.type.dataclass):
+            yield from _iter_field_info(dt)
+
+
+_D = TypeVar("_D", bound=Dataclass)
+
+
+@dataclass(frozen=True)
+class ValidConfig(Generic[_D]):
+    @classmethod
+    @cache
+    def gather(cls, datatype: type[_D]) -> ValidConfig:
+        return cls(
+            datatype=datatype,
+            fields=FrozenDict(
+                {field_info.name: field_info for field_info in _iter_field_info(datatype)}
+            ),
+        )
+
+    datatype: type[_D]
+    fields: FrozenDict[str, FieldInfo]
+
+    def access(self, field_name: str) -> ValidConfig | None:
+        field_info = self.fields.get(field_name)
+        if not field_info:
+            return None
+
+        if datatype := field_info.type.dataclass:
+            return ValidConfig.gather(datatype)
+
+        if field_info.type.has_item_type and dataclasses.is_dataclass(
+            item_type := field_info.type.item_type
+        ):
+            return ValidConfig.gather(item_type)
+
+        return None
+
+
+@dataclass(frozen=True)
+class UnrecognizedApplicationConfig:
+    @classmethod
+    def gather(cls, lift: Data, index_start: int) -> UnrecognizedApplicationConfig | None:
+        unused_items = list(lift.iter_unused_items(index_start=index_start))
+        if not unused_items:
+            return None
+
+        valid_config_by_unused_accessor: dict[Accessor, ValidConfig | None] = {}
+        for unused_accessor, _ in unused_items:
+            valid_config: ValidConfig | None = ValidConfig.gather(Application)
+            for accessor in unused_accessor.iter_lineage():
+                if not valid_config:
+                    break
+                valid_config = valid_config.access(accessor.key)
+            valid_config_by_unused_accessor[unused_accessor] = valid_config
+
+        return cls(unrecognized=FrozenDict(valid_config_by_unused_accessor))
+
+    unrecognized: FrozenDict[Accessor, ValidConfig | None]
+
+
 def parse_config_data(data: Data) -> Application:
     lift = data.get_data("lift")
 
@@ -111,12 +179,28 @@ def parse_interpreter_group(ig_data: Data) -> InterpreterGroup:
         custom_parsers={BuildInfo: parse_build_info, InterpreterGroup: parse_interpreter_group},
     )
 
-    unused_items = list(lift.iter_unused_items())
-    if unused_items:
+    unrecognized_config = UnrecognizedApplicationConfig.gather(lift, index_start=1)
+    if unrecognized_config:
+        unrecognized_field_info: DefaultDict[str, list[str]] = defaultdict(list)
+        index_used = False
+        for accessor, valid_config in unrecognized_config.unrecognized.items():
+            index_used |= accessor.path_includes_index()
+            suggestions = unrecognized_field_info[accessor.render()]
+            if valid_config:
+                suggestions.extend(difflib.get_close_matches(accessor.key, valid_config.fields))
+
+        field_column_width = max(map(len, unrecognized_field_info))
+        unrecognized_fields = []
+        for name, suggestions in unrecognized_field_info.items():
+            line = name.rjust(field_column_width)
+            if suggestions:
+                line += f": Did you mean {' or '.join(suggestions)}?"
+            unrecognized_fields.append(line)
+
         raise InputError(
             dedent(
                 """\
-                The following `lift` manifest entries in {manifest_source} were not recognized:
+                The following `lift` manifest entries in {manifest_source} were not recognized{index_parenthetical}:
                 {unrecognized_fields}
 
                 Refer to the lift manifest format specification at {doc_url} or by running `science doc open manifest`.
@@ -124,7 +208,8 @@ def parse_interpreter_group(ig_data: Data) -> InterpreterGroup:
             )
             .format(
                 manifest_source=data.provenance.source,
-                unrecognized_fields="\n".join(key for key, _ in unused_items),
+                index_parenthetical=" (indexes are 1-based)" if index_used else "",
+                unrecognized_fields="\n".join(unrecognized_fields),
                 doc_url=f"{DOC_SITE_URL}/manifest.html",
             )
             .strip()
diff --git a/science/data.py b/science/data.py
index befb508..cea5f49 100644
--- a/science/data.py
+++ b/science/data.py
@@ -13,6 +13,40 @@
 from science.frozendict import FrozenDict
 from science.hashing import Provenance
 
+
+@dataclass(frozen=True)
+class Accessor:
+    key: str
+    parent: Accessor | None = None
+    _index: int | None = None
+
+    def index(self, index: int) -> Accessor:
+        return dataclasses.replace(self, _index=index)
+
+    def render(self) -> str:
+        if self.parent:
+            rendered = f"{self.parent.render()}.{self.key}"
+        else:
+            rendered = self.key
+
+        if self._index is not None:
+            rendered += f"[{self._index}]"
+        return rendered
+
+    def iter_lineage(self, include_self: bool = False) -> Iterator[Accessor]:
+        if self.parent:
+            yield from self.parent.iter_lineage(include_self=True)
+        if include_self:
+            yield self
+
+    def path_includes_index(self) -> bool:
+        if self._index is not None:
+            return True
+        if self.parent and self.parent.path_includes_index():
+            return True
+        return False
+
+
 _T = TypeVar("_T")
 
 
@@ -150,17 +184,25 @@ def get_value(
             self._unused_data.pop(key, None)
         return value
 
-    def iter_unused_items(self) -> Iterator[tuple[str, Any]]:
+    def iter_unused_items(
+        self, parent: Accessor | None = None, index_start: int = 0
+    ) -> Iterator[tuple[Accessor, Any]]:
         for key, value in self._unused_data.items():
+            accessor = Accessor(key, parent=parent)
             if isinstance(value, list) and all(isinstance(item, Data) for item in value):
-                for index, item in enumerate(value, start=1):
-                    for sub_key, sub_value in item.iter_unused_items():
-                        yield f"{key}[{index}].{sub_key}", sub_value
+                for index, item in enumerate(value, start=index_start):
+                    accessor = accessor.index(index)
+                    for sub_accessor, sub_value in item.iter_unused_items(
+                        parent=accessor, index_start=index_start
+                    ):
+                        yield sub_accessor, sub_value
             elif isinstance(value, Data):
-                for sub_key, sub_value in value.iter_unused_items():
-                    yield f"{key}.{sub_key}", sub_value
+                for sub_accessor, sub_value in value.iter_unused_items(
+                    parent=accessor, index_start=index_start
+                ):
+                    yield sub_accessor, sub_value
             else:
-                yield key, value
+                yield accessor, value
 
     def __bool__(self):
         return bool(self.data)
diff --git a/tests/data/unrecognized-config-fields.toml b/tests/data/unrecognized-config-fields.toml
index af4841e..1f33f51 100644
--- a/tests/data/unrecognized-config-fields.toml
+++ b/tests/data/unrecognized-config-fields.toml
@@ -33,7 +33,7 @@ members = [
 
 [[lift.commands]]
 # N.B: `environ` is invalid.
-environ = { key = "value" }
+just_wrong = { key = "value" }
 exe = "#{cpython:python}"
 args = [
     "-c",
diff --git a/tests/test_config.py b/tests/test_config.py
index 56ef952..6b559f2 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -172,14 +172,14 @@ def test_unrecognized_config_fields(tmp_path: Path, science_pyz: Path) -> None:
         assert (
             dedent(
                 f"""\
-                The following `lift` manifest entries in {config} were not recognized:
-                scie-jump
-                scie_jump.version2
-                interpreters[2].lizzy
-                commands[1].environ
-                commands[1].env.remove_re2
-                commands[1].env.replace2
-                app-info
+                The following `lift` manifest entries in {config} were not recognized (indexes are 1-based):
+                                 scie-jump: Did you mean scie_jump?
+                        scie_jump.version2: Did you mean version?
+                     interpreters[2].lizzy: Did you mean lazy?
+                    commands[1].just_wrong
+                commands[1].env.remove_re2: Did you mean remove_re or remove_exact?
+                  commands[1].env.replace2: Did you mean replace?
+                                  app-info: Did you mean app_info?
 
                 Refer to the lift manifest format specification at https://science.scie.app/manifest.html or by running `science doc open manifest`.
                 """