Skip to content

Commit

Permalink
Merge branch 'braden/meilisearch-libraries' into rpenido/fal-3690-upd…
Browse files Browse the repository at this point in the history
…ate-search-index-when-course-content-is-changed
  • Loading branch information
rpenido committed Mar 22, 2024
2 parents ac8d0dc + 1ff4b75 commit 7320e3c
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cms/envs/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1777,7 +1777,7 @@
'openedx.core.djangoapps.content_tagging',

# Search
'openedx.core.djangoapps.content.search.apps.ContentSearchConfig',
'openedx.core.djangoapps.content.search',

'openedx.features.course_duration_limits',
'openedx.features.content_type_gating',
Expand Down
50 changes: 31 additions & 19 deletions openedx/core/djangoapps/content/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
Utilities related to indexing content for search
"""
from __future__ import annotations

import hashlib
from hashlib import blake2b
import logging

from django.utils.text import slugify
Expand Down Expand Up @@ -38,9 +37,10 @@ class Fields:
# See https://blog.meilisearch.com/nested-hierarchical-facets-guide/
# and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/
# For details on the format of the hierarchical tag data.
# We currently have a hard-coded limit of 4 levels of tags in the search index (level0..level3).
tags = "tags"
tags_taxonomy = "taxonomy"
tags_level0 = "level0"
tags_taxonomy = "taxonomy" # subfield of tags, i.e. tags.taxonomy
tags_level0 = "level0" # subfield of tags, i.e. tags.level0
tags_level1 = "level1"
tags_level2 = "level2"
tags_level3 = "level3"
Expand All @@ -49,6 +49,10 @@ class Fields:
# Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on.
content = "content"

# Note: new fields or values can be added at any time, but if they need to be indexed for filtering or keyword
# search, the index configuration will need to be changed, which is only done as part of the 'reindex_studio'
# command (changing those settings on an large active index is not recommended).


class DocType:
"""
Expand All @@ -64,10 +68,13 @@ def meili_id_from_opaque_key(usage_key: UsageKey) -> str:
integer or a string composed of alphanumeric characters (a-z A-Z 0-9),
hyphens (-) and underscores (_). Since our opaque keys don't meet this
requirement, we transform them to a similar slug ID string that does.
In the future, with Learning Core's data models in place for courseware,
we could use PublishableEntity's primary key / UUID instead.
"""
# The slugified key _may_ not be unique so we append a hashed string to make it unique:
key_bin = str(usage_key).encode()
suffix = hashlib.sha1(key_bin).hexdigest()[:7] # When we use Python 3.9+, should add usedforsecurity=False here.
suffix = blake2b(key_bin, digest_size=4).hexdigest() # When we use Python 3.9+, should add usedforsecurity=False
return slugify(str(usage_key)) + "-" + suffix


Expand Down Expand Up @@ -142,6 +149,13 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
"level2": ["Location > North America > Canada > Vancouver"],
}
}
Note: despite what you might expect, because this is only used for the
filtering/refinement UI, it's fine if this is a one-way transformation.
It's not necessary to be able to re-construct the exact tag IDs nor taxonomy
IDs from this data that's stored in the search index. It's just a bunch of
strings in a particular format that the frontend knows how to render to
support hierarchical refinement by tag.
"""
# Note that we could improve performance for indexing many components from the same library/course,
# if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the
Expand All @@ -160,15 +174,24 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
result[Fields.tags_taxonomy].append(obj_tag.name)
# Taxonomy name plus each level of tags, in a list:
parts = [obj_tag.name] + obj_tag.get_lineage() # e.g. ["Location", "North America", "Canada", "Vancouver"]
parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator
parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator.
# Now we build each level (tags.level0, tags.level1, etc.) as applicable.
# We have a hard-coded limit of 4 levels of tags for now (see Fields.tags above).
# A tag like "Difficulty: Hard" will only result in one level (tags.level0)
# But a tag like "Location: North America > Canada > Vancouver" would result in three levels (tags.level0:
# "North America", tags.level1: "North America > Canada", tags.level2: "North America > Canada > Vancouver")
# See the comments above on "Field.tags" for an explanation of why we use this format (basically it's the format
# required by the Instantsearch frontend).
for level in range(4):
# We use '>' as a separator because it's the default for the Instantsearch frontend library, and our
# preferred separator (\t) used in the database is ignored by Meilisearch since it's whitespace.
new_value = " > ".join(parts[0:level + 2])
if f"level{level}" not in result:
result[f"level{level}"] = [new_value]
elif new_value not in result[f"level{level}"]:
result[f"level{level}"].append(new_value)
if len(parts) == level + 2:
break
break # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only)

return {Fields.tags: result}

Expand All @@ -185,18 +208,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) ->
block = xblock_api.load_block(metadata.usage_key, user=None)
except Exception as err: # pylint: disable=broad-except
log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}")
# Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata'
doc.update({
Fields.id: meili_id_from_opaque_key(metadata.usage_key),
Fields.usage_key: str(metadata.usage_key),
Fields.block_id: str(metadata.usage_key.block_id),
Fields.display_name: metadata.display_name,
Fields.type: metadata.usage_key.block_type,
Fields.context_key: str(metadata.usage_key.context_key),
Fields.org: str(metadata.usage_key.context_key.org),
})
else:
doc.update(_fields_from_block(block))
doc.update(_fields_from_block(block))
doc.update(_tags_for_content_object(metadata.usage_key))
doc[Fields.type] = DocType.library_block
# Add the breadcrumbs. In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,33 @@
See also cms/djangoapps/contentstore/management/commands/reindex_course.py which
indexes LMS (published) courses in ElasticSearch.
"""
from django.core.management import BaseCommand
from django.core.management import BaseCommand, CommandError

from ... import api


class Command(BaseCommand):
"""
Build or re-build the search index for courses (in Studio, i.e. Draft mode)
Build or re-build the search index for courses and libraries (in Studio, i.e. Draft mode)
This is experimental and not recommended for production use.
"""

def add_arguments(self, parser):
parser.add_argument('--experimental', action='store_true')
parser.set_defaults(experimental=False)

def handle(self, *args, **options):
"""
Build a new search index for Studio, containing content from courses and libraries
"""
if not api.is_meilisearch_enabled():
raise CommandError("Meilisearch is not enabled. Please set MEILISEARCH_ENABLED to True in your settings.")

if not options["experimental"]:
raise CommandError(
"This command is experimental and not recommended for production. "
"Use the --experimental argument to acknowledge and run it."
)

api.rebuild_index(self.stdout.write)
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def test_problem_block(self):
assert doc == {
# Note the 'id' has been stripped of special characters to meet Meilisearch requirements.
# The '-8516ed8' suffix is deterministic based on the original usage key.
"id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-8516ed8",
"id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-f46b6f1e",
"type": "course_block",
"block_type": "problem",
"usage_key": "block-v1:edX+toy+2012_Fall+type@problem+block@Test_Problem",
Expand Down Expand Up @@ -98,7 +98,7 @@ def test_html_block(self):
block = self.store.get_item(self.html_block_key)
doc = searchable_doc_for_course_block(block)
assert doc == {
"id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-b0b4a10",
"id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-efb9c601",
"type": "course_block",
"block_type": "html",
"usage_key": "block-v1:edX+toy+2012_Fall+type@html+block@toyjumpto",
Expand Down Expand Up @@ -132,7 +132,7 @@ def test_video_block_untagged(self):
block = self.store.get_item(block_usage_key)
doc = searchable_doc_for_course_block(block)
assert doc == {
"id": "block-v1edxtoy2012_falltypevideoblockwelcome-b47fb14",
"id": "block-v1edxtoy2012_falltypevideoblockwelcome-0c9fd626",
"type": "course_block",
"block_type": "video",
"usage_key": "block-v1:edX+toy+2012_Fall+type@video+block@Welcome",
Expand Down

0 comments on commit 7320e3c

Please sign in to comment.