
refactor: index collection per learning_package #35424

Closed
81 changes: 38 additions & 43 deletions openedx/core/djangoapps/content/search/api.py
@@ -289,23 +289,29 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:

     # Get the lists of libraries
     status_cb("Counting libraries...")
-    lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')]
+    lib_keys = [
+        (lib.library_key, lib.learning_package_id)
+        for lib in lib_api.ContentLibrary.objects.select_related(
+            'org',
+            'learning_package'
+        ).only(
+            'org',
+            'slug',
+            'learning_package_id'
+        )
+    ]
     num_libraries = len(lib_keys)
 
     # Get the list of courses
     status_cb("Counting courses...")
     num_courses = CourseOverview.objects.count()
 
-    # Get the list of collections
-    status_cb("Counting collections...")
-    num_collections = authoring_api.get_collections().count()
-
     # Some counters so we can track our progress as indexing progresses:
-    num_contexts = num_courses + num_libraries + num_collections
+    num_contexts = num_courses + num_libraries
     num_contexts_done = 0  # How many courses/libraries we've indexed
     num_blocks_done = 0  # How many individual components/XBlocks we've indexed
 
-    status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.")
+    status_cb(f"Found {num_courses} courses and {num_libraries} libraries.")
     with _using_temp_index(status_cb) as temp_index_name:
         ############## Configure the index ##############
@@ -369,7 +375,26 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
         ])
 
         ############## Libraries ##############
-        status_cb("Indexing libraries...")
+        status_cb("Indexing libraries and collections...")
 
+        def index_collection_batch(batch) -> list:
+            docs = []
+            for collection in batch:
+                try:
+                    doc = searchable_doc_for_collection(collection)
+                    # Uncomment below line once collections are tagged.
+                    # doc.update(searchable_doc_tags(collection.id))
+                    docs.append(doc)
+                except Exception as err:  # pylint: disable=broad-except
+                    status_cb(f"Error indexing collection {collection}: {err}")
+
+            if docs:
+                try:
+                    # Add the collection docs in one batch (usually faster than adding them one at a time):
+                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
+                except (TypeError, KeyError, MeilisearchError) as err:
+                    status_cb(f"Error indexing collection batch: {err}")
+            return docs
+
         def index_library(lib_key: str) -> list:
             docs = []
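The _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) call reflects Meilisearch's asynchronous task model: add_documents only enqueues an indexing task, and the documents become searchable once that task succeeds. A minimal standalone sketch of the same pattern (URL, key and index name are placeholders):

import meilisearch

client = meilisearch.Client("http://localhost:7700", "masterKey")  # placeholder endpoint/key

docs = [
    {"id": "lib1-col1", "type": "collection", "display_name": "Midterm pool"},
    {"id": "lib1-col2", "type": "collection", "display_name": "Homework bank"},
]

# add_documents returns a TaskInfo immediately; block until the task finishes,
# which is roughly what _wait_for_meili_task does.
task_info = client.index("studio_content_temp").add_documents(docs)
client.wait_for_task(task_info.task_uid)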
@@ -390,10 +415,13 @@ def index_library(lib_key: str) -> list:
status_cb(f"Error indexing library {lib_key}: {err}")
return docs

for lib_key in lib_keys:
for lib_key, learning_package_id in lib_keys:
status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}")
lib_docs = index_library(lib_key)
num_blocks_done += len(lib_docs)
collections = authoring_api.get_collections(learning_package_id)
collection_docs = index_collection_batch(collections)
num_blocks_done += len(collection_docs)
num_contexts_done += 1

############## Courses ##############
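This is the heart of the refactor: collections are now fetched and indexed per learning package inside the library loop, rather than paginated globally in a separate pass. The call site assumes authoring_api.get_collections accepts a learning_package_id and returns only that package's collections (the pre-change call site passed enabled=True with no package filter). A rough sketch of what such a helper plausibly looks like (the real implementation lives in openedx-learning; the import path and body here are assumptions):

from openedx_learning.apps.authoring.collections.models import Collection  # path assumed

def get_collections(learning_package_id: int, enabled: bool | None = True):
    # Scope collections to one learning package; filter on enabled unless
    # the caller explicitly passes enabled=None.
    qs = Collection.objects.filter(learning_package_id=learning_package_id)
    if enabled is not None:
        qs = qs.filter(enabled=enabled)
    return qs.order_by("pk")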
@@ -430,40 +458,7 @@ def add_with_children(block):
             num_contexts_done += 1
             num_blocks_done += len(course_docs)
 
-        ############## Collections ##############
-        status_cb("Indexing collections...")
-
-        def index_collection_batch(batch, num_contexts_done) -> int:
-            docs = []
-            for collection in batch:
-                status_cb(
-                    f"{num_contexts_done + 1}/{num_contexts}. "
-                    f"Now indexing collection {collection.title} ({collection.id})"
-                )
-                try:
-                    doc = searchable_doc_for_collection(collection)
-                    # Uncomment below line once collections are tagged.
-                    # doc.update(searchable_doc_tags(collection.id))
-                    docs.append(doc)
-                except Exception as err:  # pylint: disable=broad-except
-                    status_cb(f"Error indexing collection {collection}: {err}")
-                finally:
-                    num_contexts_done += 1
-
-            if docs:
-                try:
-                    # Add docs in batch of 100 at once (usually faster than adding one at a time):
-                    _wait_for_meili_task(client.index(temp_index_name).add_documents(docs))
-                except (TypeError, KeyError, MeilisearchError) as err:
-                    status_cb(f"Error indexing collection batch {p}: {err}")
-            return num_contexts_done
-
-        # To reduce memory usage on large instances, split up the Collections into pages of 100 collections:
-        paginator = Paginator(authoring_api.get_collections(enabled=True), 100)
-        for p in paginator.page_range:
-            num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done)
-
-    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.")
+    status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.")
 
 
 def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None:
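For reference, rebuild_index is normally triggered from a management command, but status_cb accepts any callable, which makes ad-hoc runs and tests straightforward. A small usage sketch (import path as in this file's module; the assertion is only meaningful in an environment with Meilisearch configured):

from openedx.core.djangoapps.content.search.api import rebuild_index

# Stream progress messages to stdout:
rebuild_index(status_cb=print)

# Or capture them, e.g. to assert on progress in a test:
progress: list[str] = []
rebuild_index(status_cb=progress.append)
assert any(line.startswith("Done!") for line in progress)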