Merge branch 'braden/meilisearch-libraries' into rpenido/fal-3690-upd…

…ate-search-index-when-course-content-is-changed
open-craft · Mar 22, 2024 · 7320e3c · 7320e3c
2 parents ac8d0dc + 1ff4b75
commit 7320e3c
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 25 deletions.
diff --git a/cms/envs/common.py b/cms/envs/common.py
@@ -1777,7 +1777,7 @@
     'openedx.core.djangoapps.content_tagging',
 
     # Search
-    'openedx.core.djangoapps.content.search.apps.ContentSearchConfig',
+    'openedx.core.djangoapps.content.search',
 
     'openedx.features.course_duration_limits',
     'openedx.features.content_type_gating',

diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py
@@ -2,8 +2,7 @@
 Utilities related to indexing content for search
 """
 from __future__ import annotations
-
-import hashlib
+from hashlib import blake2b
 import logging
 
 from django.utils.text import slugify
@@ -38,9 +37,10 @@ class Fields:
     # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/
     # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/
     # For details on the format of the hierarchical tag data.
+    # We currently have a hard-coded limit of 4 levels of tags in the search index (level0..level3).
     tags = "tags"
-    tags_taxonomy = "taxonomy"
-    tags_level0 = "level0"
+    tags_taxonomy = "taxonomy"  # subfield of tags, i.e. tags.taxonomy
+    tags_level0 = "level0"  # subfield of tags, i.e. tags.level0
     tags_level1 = "level1"
     tags_level2 = "level2"
     tags_level3 = "level3"
@@ -49,6 +49,10 @@ class Fields:
     # Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on.
     content = "content"
 
+    # Note: new fields or values can be added at any time, but if they need to be indexed for filtering or keyword
+    # search, the index configuration will need to be changed, which is only done as part of the 'reindex_studio'
+    # command (changing those settings on an large active index is not recommended).
+
 
 class DocType:
     """
@@ -64,10 +68,13 @@ def meili_id_from_opaque_key(usage_key: UsageKey) -> str:
     integer or a string composed of alphanumeric characters (a-z A-Z 0-9),
     hyphens (-) and underscores (_). Since our opaque keys don't meet this
     requirement, we transform them to a similar slug ID string that does.
+
+    In the future, with Learning Core's data models in place for courseware,
+    we could use PublishableEntity's primary key / UUID instead.
     """
     # The slugified key _may_ not be unique so we append a hashed string to make it unique:
     key_bin = str(usage_key).encode()
-    suffix = hashlib.sha1(key_bin).hexdigest()[:7]  # When we use Python 3.9+, should add usedforsecurity=False here.
+    suffix = blake2b(key_bin, digest_size=4).hexdigest()  # When we use Python 3.9+, should add usedforsecurity=False
     return slugify(str(usage_key)) + "-" + suffix
 
 
@@ -142,6 +149,13 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
                 "level2": ["Location > North America > Canada > Vancouver"],
             }
         }
+
+    Note: despite what you might expect, because this is only used for the
+    filtering/refinement UI, it's fine if this is a one-way transformation.
+    It's not necessary to be able to re-construct the exact tag IDs nor taxonomy
+    IDs from this data that's stored in the search index. It's just a bunch of
+    strings in a particular format that the frontend knows how to render to
+    support hierarchical refinement by tag.
     """
     # Note that we could improve performance for indexing many components from the same library/course,
     # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the
@@ -160,15 +174,24 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
             result[Fields.tags_taxonomy].append(obj_tag.name)
         # Taxonomy name plus each level of tags, in a list:
         parts = [obj_tag.name] + obj_tag.get_lineage()  # e.g. ["Location", "North America", "Canada", "Vancouver"]
-        parts = [part.replace(" > ", " _ ") for part in parts]  # Escape our separator
+        parts = [part.replace(" > ", " _ ") for part in parts]  # Escape our separator.
+        # Now we build each level (tags.level0, tags.level1, etc.) as applicable.
+        # We have a hard-coded limit of 4 levels of tags for now (see Fields.tags above).
+        # A tag like "Difficulty: Hard" will only result in one level (tags.level0)
+        # But a tag like "Location: North America > Canada > Vancouver" would result in three levels (tags.level0:
+        #   "North America", tags.level1: "North America > Canada", tags.level2: "North America > Canada > Vancouver")
+        # See the comments above on "Field.tags" for an explanation of why we use this format (basically it's the format
+        # required by the Instantsearch frontend).
         for level in range(4):
+            # We use '>' as a separator because it's the default for the Instantsearch frontend library, and our
+            # preferred separator (\t) used in the database is ignored by Meilisearch since it's whitespace.
             new_value = " > ".join(parts[0:level + 2])
             if f"level{level}" not in result:
                 result[f"level{level}"] = [new_value]
             elif new_value not in result[f"level{level}"]:
                 result[f"level{level}"].append(new_value)
             if len(parts) == level + 2:
-                break
+                break  # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only)
 
     return {Fields.tags: result}
 
@@ -185,18 +208,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) ->
         block = xblock_api.load_block(metadata.usage_key, user=None)
     except Exception as err:  # pylint: disable=broad-except
         log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}")
-        # Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata'
-        doc.update({
-            Fields.id: meili_id_from_opaque_key(metadata.usage_key),
-            Fields.usage_key: str(metadata.usage_key),
-            Fields.block_id: str(metadata.usage_key.block_id),
-            Fields.display_name: metadata.display_name,
-            Fields.type: metadata.usage_key.block_type,
-            Fields.context_key: str(metadata.usage_key.context_key),
-            Fields.org: str(metadata.usage_key.context_key.org),
-        })
-    else:
-        doc.update(_fields_from_block(block))
+    doc.update(_fields_from_block(block))
     doc.update(_tags_for_content_object(metadata.usage_key))
     doc[Fields.type] = DocType.library_block
     # Add the breadcrumbs. In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here:

diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py
@@ -5,18 +5,33 @@
 See also cms/djangoapps/contentstore/management/commands/reindex_course.py which
 indexes LMS (published) courses in ElasticSearch.
 """
-from django.core.management import BaseCommand
+from django.core.management import BaseCommand, CommandError
 
 from ... import api
 
 
 class Command(BaseCommand):
     """
-    Build or re-build the search index for courses (in Studio, i.e. Draft mode)
+    Build or re-build the search index for courses and libraries (in Studio, i.e. Draft mode)
+
+    This is experimental and not recommended for production use.
     """
 
+    def add_arguments(self, parser):
+        parser.add_argument('--experimental', action='store_true')
+        parser.set_defaults(experimental=False)
+
     def handle(self, *args, **options):
         """
         Build a new search index for Studio, containing content from courses and libraries
         """
+        if not api.is_meilisearch_enabled():
+            raise CommandError("Meilisearch is not enabled. Please set MEILISEARCH_ENABLED to True in your settings.")
+
+        if not options["experimental"]:
+            raise CommandError(
+                "This command is experimental and not recommended for production. "
+                "Use the --experimental argument to acknowledge and run it."
+            )
+
         api.rebuild_index(self.stdout.write)
diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py
@@ -64,7 +64,7 @@ def test_problem_block(self):
         assert doc == {
             # Note the 'id' has been stripped of special characters to meet Meilisearch requirements.
             # The '-8516ed8' suffix is deterministic based on the original usage key.
-            "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-8516ed8",
+            "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-f46b6f1e",
             "type": "course_block",
             "block_type": "problem",
             "usage_key": "block-v1:edX+toy+2012_Fall+type@problem+block@Test_Problem",
@@ -98,7 +98,7 @@ def test_html_block(self):
         block = self.store.get_item(self.html_block_key)
         doc = searchable_doc_for_course_block(block)
         assert doc == {
-            "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-b0b4a10",
+            "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-efb9c601",
             "type": "course_block",
             "block_type": "html",
             "usage_key": "block-v1:edX+toy+2012_Fall+type@html+block@toyjumpto",
@@ -132,7 +132,7 @@ def test_video_block_untagged(self):
         block = self.store.get_item(block_usage_key)
         doc = searchable_doc_for_course_block(block)
         assert doc == {
-            "id": "block-v1edxtoy2012_falltypevideoblockwelcome-b47fb14",
+            "id": "block-v1edxtoy2012_falltypevideoblockwelcome-0c9fd626",
             "type": "course_block",
             "block_type": "video",
             "usage_key": "block-v1:edX+toy+2012_Fall+type@video+block@Welcome",