From 9df62af5b357a15f648e888856f257313bf339f9 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 21 Oct 2025 11:14:25 -0400 Subject: [PATCH 01/19] some refactoring and de-duping --- vector_search/utils.py | 54 ++++++++++++------------------------------ 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/vector_search/utils.py b/vector_search/utils.py index ab9160b21b..0cc0a5042e 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -83,48 +83,25 @@ def create_qdrant_collections(force_recreate): force_recreate (bool): Whether to recreate the collections even if they already exist """ + + collections = [RESOURCES_COLLECTION_NAME, CONTENT_FILES_COLLECTION_NAME] + for collection_name in collections: + create_qdrant_collection(collection_name, force_recreate) + + update_qdrant_indexes() + + +def create_qdrant_collection(collection_name, force_recreate): + """ + Create or recreate a QDrant collection + """ client = qdrant_client() - resources_collection_name = RESOURCES_COLLECTION_NAME - content_files_collection_name = CONTENT_FILES_COLLECTION_NAME encoder = dense_encoder() # True if either of the collections were recreated - - if ( - not client.collection_exists(collection_name=resources_collection_name) - or force_recreate - ): - client.delete_collection(resources_collection_name) + if not client.collection_exists(collection_name=collection_name) or force_recreate: + client.delete_collection(collection_name) client.recreate_collection( - collection_name=resources_collection_name, - on_disk_payload=True, - vectors_config={ - encoder.model_short_name(): models.VectorParams( - size=encoder.dim(), distance=models.Distance.COSINE - ), - }, - replication_factor=2, - shard_number=6, - strict_mode_config=models.StrictModeConfig( - enabled=True, - unindexed_filtering_retrieve=False, - unindexed_filtering_update=False, - ), - sparse_vectors_config=client.get_fastembed_sparse_vector_params(), - optimizers_config=models.OptimizersConfigDiff(default_segment_number=2), - quantization_config=models.BinaryQuantization( - binary=models.BinaryQuantizationConfig( - always_ram=True, - ), - ), - ) - - if ( - not client.collection_exists(collection_name=content_files_collection_name) - or force_recreate - ): - client.delete_collection(content_files_collection_name) - client.recreate_collection( - collection_name=content_files_collection_name, + collection_name=collection_name, on_disk_payload=True, vectors_config={ encoder.model_short_name(): models.VectorParams( @@ -146,7 +123,6 @@ def create_qdrant_collections(force_recreate): ), ), ) - update_qdrant_indexes() def update_qdrant_indexes(): From 3af6f6ff198649b6940d1427761cc9f5b0720969 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 21 Oct 2025 11:42:42 -0400 Subject: [PATCH 02/19] adding topics collection name --- vector_search/constants.py | 1 + vector_search/utils.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/vector_search/constants.py b/vector_search/constants.py index 97445d892d..42c3ec74bb 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -3,6 +3,7 @@ RESOURCES_COLLECTION_NAME = f"{settings.QDRANT_BASE_COLLECTION_NAME}.resources" CONTENT_FILES_COLLECTION_NAME = f"{settings.QDRANT_BASE_COLLECTION_NAME}.content_files" +TOPICS_COLLECTION_NAME = f"{settings.QDRANT_BASE_COLLECTION_NAME}.topics" QDRANT_CONTENT_FILE_PARAM_MAP = { "key": "key", diff --git a/vector_search/utils.py b/vector_search/utils.py index 0cc0a5042e..29946c7da9 100644 --- a/vector_search/utils.py +++ 
b/vector_search/utils.py @@ -29,6 +29,7 @@ QDRANT_LEARNING_RESOURCE_INDEXES, QDRANT_RESOURCE_PARAM_MAP, RESOURCES_COLLECTION_NAME, + TOPICS_COLLECTION_NAME, ) from vector_search.encoders.utils import dense_encoder @@ -84,7 +85,11 @@ def create_qdrant_collections(force_recreate): even if they already exist """ - collections = [RESOURCES_COLLECTION_NAME, CONTENT_FILES_COLLECTION_NAME] + collections = [ + RESOURCES_COLLECTION_NAME, + CONTENT_FILES_COLLECTION_NAME, + TOPICS_COLLECTION_NAME, + ] for collection_name in collections: create_qdrant_collection(collection_name, force_recreate) From dbc10a37b9e86dc530832b2fcf2e738d112f829a Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 21 Oct 2025 12:06:14 -0400 Subject: [PATCH 03/19] adding topic collection indexes --- vector_search/constants.py | 8 ++++++++ vector_search/utils.py | 2 ++ 2 files changed, 10 insertions(+) diff --git a/vector_search/constants.py b/vector_search/constants.py index 42c3ec74bb..fa84f15bbb 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -44,6 +44,10 @@ } +QDRANT_TOPIC_PARAM_MAP = { + "name": "name", +} + QDRANT_LEARNING_RESOURCE_INDEXES = { "readable_id": models.PayloadSchemaType.KEYWORD, "resource_type": models.PayloadSchemaType.KEYWORD, @@ -83,3 +87,7 @@ "edx_block_id": models.PayloadSchemaType.KEYWORD, "url": models.PayloadSchemaType.KEYWORD, } + +QDRANT_TOPIC_INDEXES = { + "name": models.PayloadSchemaType.KEYWORD, +} diff --git a/vector_search/utils.py b/vector_search/utils.py index 29946c7da9..4bba516d8c 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -28,6 +28,7 @@ QDRANT_CONTENT_FILE_PARAM_MAP, QDRANT_LEARNING_RESOURCE_INDEXES, QDRANT_RESOURCE_PARAM_MAP, + QDRANT_TOPIC_INDEXES, RESOURCES_COLLECTION_NAME, TOPICS_COLLECTION_NAME, ) @@ -139,6 +140,7 @@ def update_qdrant_indexes(): for index in [ (QDRANT_LEARNING_RESOURCE_INDEXES, RESOURCES_COLLECTION_NAME), (QDRANT_CONTENT_FILE_INDEXES, CONTENT_FILES_COLLECTION_NAME), + (QDRANT_TOPIC_INDEXES, TOPICS_COLLECTION_NAME), ]: indexes = index[0] collection_name = index[1] From a81e8d8a61035e713810430adbcbe8496994fdc1 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 22 Oct 2025 09:59:30 -0400 Subject: [PATCH 04/19] adding method to embed topics --- vector_search/constants.py | 2 +- vector_search/utils.py | 63 +++++++++++++++++++++++++++++++++++++- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/vector_search/constants.py b/vector_search/constants.py index fa84f15bbb..d057c2ea98 100644 --- a/vector_search/constants.py +++ b/vector_search/constants.py @@ -44,7 +44,7 @@ } -QDRANT_TOPIC_PARAM_MAP = { +QDRANT_TOPICS_PARAM_MAP = { "name": "name", } diff --git a/vector_search/utils.py b/vector_search/utils.py index 4bba516d8c..731f3bf219 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -7,7 +7,11 @@ from qdrant_client import QdrantClient, models from learning_resources.content_summarizer import ContentSummarizer -from learning_resources.models import ContentFile, LearningResource +from learning_resources.models import ( + ContentFile, + LearningResource, + LearningResourceTopic, +) from learning_resources.serializers import ( ContentFileSerializer, LearningResourceMetadataDisplaySerializer, @@ -29,6 +33,7 @@ QDRANT_LEARNING_RESOURCE_INDEXES, QDRANT_RESOURCE_PARAM_MAP, QDRANT_TOPIC_INDEXES, + QDRANT_TOPICS_PARAM_MAP, RESOURCES_COLLECTION_NAME, TOPICS_COLLECTION_NAME, ) @@ -171,6 +176,60 @@ def vector_point_id(readable_id): return str(uuid.uuid5(uuid.NAMESPACE_DNS, readable_id)) +def 
embed_topics(): + """ + Embed and store new (sub)topics and remove non-existent ones from Qdrant + """ + client = qdrant_client() + create_qdrant_collections(force_recreate=False) + indexed_count = client.count(collection_name=TOPICS_COLLECTION_NAME).count + + topic_names = set( + LearningResourceTopic.objects.filter(parent__isnull=False).values_list( + "name", flat=True + ) + ) + + if indexed_count > 0: + existing = vector_search( + query_string="", + params={}, + search_collection=TOPICS_COLLECTION_NAME, + limit=indexed_count, + ) + indexed_topic_names = {hit["name"] for hit in existing["hits"]} + else: + indexed_topic_names = set() + + new_topics = topic_names - indexed_topic_names + remove_topics = indexed_topic_names - topic_names + for remove_topic in remove_topics: + remove_points_matching_params( + {"name": remove_topic}, collection_name=TOPICS_COLLECTION_NAME + ) + + docs = [] + metadata = [] + ids = [] + + filtered_topics = LearningResourceTopic.objects.filter(name__in=new_topics) + + for topic in filtered_topics: + docs.append(topic.name) + metadata.append( + { + "name": topic.name, + } + ) + ids.append(str(topic.topic_uuid)) + if len(docs) > 0: + encoder = dense_encoder() + embeddings = encoder.embed_documents(docs) + vector_name = encoder.model_short_name() + points = points_generator(ids, metadata, embeddings, vector_name) + client.upload_points(TOPICS_COLLECTION_NAME, points=points, wait=False) + + def _chunk_documents(encoder, texts, metadatas): # chunk the documents. use semantic chunking if enabled chunk_params = { @@ -740,6 +799,8 @@ def qdrant_query_conditions(params, collection_name=RESOURCES_COLLECTION_NAME): conditions = [] if collection_name == RESOURCES_COLLECTION_NAME: QDRANT_PARAM_MAP = QDRANT_RESOURCE_PARAM_MAP + elif collection_name == TOPICS_COLLECTION_NAME: + QDRANT_PARAM_MAP = QDRANT_TOPICS_PARAM_MAP else: QDRANT_PARAM_MAP = QDRANT_CONTENT_FILE_PARAM_MAP if not params: From 37ee81822138d4a0021b479bd7de274edc537c18 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 22 Oct 2025 13:56:47 -0400 Subject: [PATCH 05/19] keeping old topic generation method --- learning_resources_search/api.py | 51 ++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index 6ca4c3422b..9565fb5beb 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -35,7 +35,7 @@ adjust_search_for_percolator, document_percolated_actions, ) -from vector_search.constants import RESOURCES_COLLECTION_NAME +from vector_search.constants import RESOURCES_COLLECTION_NAME, TOPICS_COLLECTION_NAME log = logging.getLogger(__name__) @@ -830,6 +830,39 @@ def user_subscribed_to_query( ) +def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: + from vector_search.encoders.utils import dense_encoder + + """ + Get a list of similar topics based on text values + + Args: + value_doc (dict): + a document representing the data fields we want to search with + num_topics (int): + number of topics to return + Returns: + list of str: + list of topic values + """ + encoder = dense_encoder() + + embedding_context = f""" + {value_doc.get("title", "")} + {value_doc.get("description", "")} + {value_doc.get("full_description", "")} + """ + embeddings = encoder.embed(embedding_context) + return [ + hit["name"] + for hit in _qdrant_similar_results( + input_query=embeddings, + num_resources=num_topics, + collection_name=TOPICS_COLLECTION_NAME, + ) + ] + + def 
get_similar_topics( value_doc: dict, num_topics: int, min_term_freq: int, min_doc_freq: int ) -> list[str]: @@ -909,7 +942,9 @@ def get_similar_resources( ) -def _qdrant_similar_results(doc, num_resources): +def _qdrant_similar_results( + input_query, num_resources=6, collection_name=RESOURCES_COLLECTION_NAME +): """ Get similar resources from qdrant @@ -926,7 +961,6 @@ def _qdrant_similar_results(doc, num_resources): from vector_search.utils import ( dense_encoder, qdrant_client, - vector_point_id, ) encoder = dense_encoder() @@ -934,8 +968,8 @@ def _qdrant_similar_results(doc, num_resources): return [ hit.payload for hit in client.query_points( - collection_name=RESOURCES_COLLECTION_NAME, - query=vector_point_id(doc["readable_id"]), + collection_name=collection_name, + query=input_query, limit=num_resources, using=encoder.model_short_name(), ).points @@ -956,7 +990,12 @@ def get_similar_resources_qdrant(value_doc: dict, num_resources: int): list of str: list of learning resources """ - hits = _qdrant_similar_results(value_doc, num_resources) + from vector_search.utils import vector_point_id + + hits = _qdrant_similar_results( + input_query=vector_point_id(value_doc["resource_readable_id"]), + num_resources=num_resources, + ) return ( LearningResource.objects.for_search_serialization() .filter( From bfe4f0c62bdf57070762c3c1fd835608b9a86c5c Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Thu, 23 Oct 2025 10:48:30 -0400 Subject: [PATCH 06/19] adding qdrant based topic generator --- learning_resources_search/api.py | 11 +++++------ learning_resources_search/plugins.py | 6 ++---- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index 9565fb5beb..0c31ed9154 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -834,7 +834,7 @@ def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: from vector_search.encoders.utils import dense_encoder """ - Get a list of similar topics based on text values + Get a list of similar topics based on vector similarity Args: value_doc (dict): @@ -847,11 +847,10 @@ def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: """ encoder = dense_encoder() - embedding_context = f""" - {value_doc.get("title", "")} - {value_doc.get("description", "")} - {value_doc.get("full_description", "")} - """ + embedding_context = "\n".join( + [value_doc[key] for key in value_doc if value_doc[key] is not None] + ) + embeddings = encoder.embed(embedding_context) return [ hit["name"] diff --git a/learning_resources_search/plugins.py b/learning_resources_search/plugins.py index 8eca2f981a..581fb1e2c7 100644 --- a/learning_resources_search/plugins.py +++ b/learning_resources_search/plugins.py @@ -7,7 +7,7 @@ from django.conf import settings as django_settings from learning_resources_search import tasks -from learning_resources_search.api import get_similar_topics +from learning_resources_search.api import get_similar_topics_qdrant from learning_resources_search.constants import ( COURSE_TYPE, PERCOLATE_INDEX_TYPE, @@ -125,11 +125,9 @@ def resource_similar_topics(self, resource) -> list[dict]: "full_description": resource.full_description, } - topic_names = get_similar_topics( + topic_names = get_similar_topics_qdrant( text_doc, settings.OPEN_VIDEO_MAX_TOPICS, - settings.OPEN_VIDEO_MIN_TERM_FREQ, - settings.OPEN_VIDEO_MIN_DOC_FREQ, ) return [{"name": topic_name} for topic_name in topic_names] From 
2d43efab904e69f7a44639dc293be71984181fcb Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 12:51:08 -0400 Subject: [PATCH 07/19] fix test --- learning_resources_search/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index 0c31ed9154..48545b21dc 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -992,7 +992,7 @@ def get_similar_resources_qdrant(value_doc: dict, num_resources: int): from vector_search.utils import vector_point_id hits = _qdrant_similar_results( - input_query=vector_point_id(value_doc["resource_readable_id"]), + input_query=vector_point_id(value_doc["readable_id"]), num_resources=num_resources, ) return ( From d512d361dea988e6736422bc4aede0a26cb2e68d Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 13:14:42 -0400 Subject: [PATCH 08/19] fix tests --- learning_resources/etl/loaders_test.py | 15 ++++++++++++++- learning_resources_search/plugins_test.py | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/learning_resources/etl/loaders_test.py b/learning_resources/etl/loaders_test.py index 91c16bdb8c..8452213a2a 100644 --- a/learning_resources/etl/loaders_test.py +++ b/learning_resources/etl/loaders_test.py @@ -1465,9 +1465,14 @@ def test_load_video(mocker, video_exists, is_published, pass_topics): assert getattr(result, key) == value, f"Property {key} should equal {value}" -def test_load_videos(): +def test_load_videos(mocker): """Verify that load_videos loads a list of videos""" assert Video.objects.count() == 0 + + mocker.patch( + "learning_resources_search.plugins.get_similar_topics_qdrant", + return_value=["topic1", "topic2"], + ) video_resources = [video.learning_resource for video in VideoFactory.build_batch(5)] videos_data = [ { @@ -1493,6 +1498,10 @@ def test_load_playlist(mocker, playlist_exists): LearningResourceTopicFactory.create(name=topic["name"]) for topic in expected_topics ] + mocker.patch( + "learning_resources_search.plugins.get_similar_topics_qdrant", + return_value=["topic1", "topic2"], + ) mock_most_common_topics = mocker.patch( "learning_resources.etl.loaders.most_common_topics", return_value=expected_topics, @@ -1905,6 +1914,10 @@ def test_course_with_unpublished_force_ingest_is_test_mode(): @pytest.mark.django_db def test_load_articles(mocker, climate_platform): + mocker.patch( + "learning_resources_search.plugins.get_similar_topics_qdrant", + return_value=["topic1", "topic2"], + ) articles_data = [ { "title": "test", diff --git a/learning_resources_search/plugins_test.py b/learning_resources_search/plugins_test.py index 4bb24b8ca9..bd15aa1698 100644 --- a/learning_resources_search/plugins_test.py +++ b/learning_resources_search/plugins_test.py @@ -128,7 +128,7 @@ def test_resource_similar_topics(mocker, settings): """The plugin function should return expected topics for a resource""" expected_topics = ["topic1", "topic2"] mock_similar_topics = mocker.patch( - "learning_resources_search.plugins.get_similar_topics", + "learning_resources_search.plugins.get_similar_topics_qdrant", return_value=expected_topics, ) resource = LearningResourceFactory.create() From 9f47dfc19c067dbf92679697ac70bae630e84e1c Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 13:58:33 -0400 Subject: [PATCH 09/19] adding score thresholding --- learning_resources_search/api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/learning_resources_search/api.py 
b/learning_resources_search/api.py index 48545b21dc..650978082c 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -858,6 +858,7 @@ def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: input_query=embeddings, num_resources=num_topics, collection_name=TOPICS_COLLECTION_NAME, + score_threshold=0.2, ) ] @@ -942,7 +943,10 @@ def get_similar_resources( ) def _qdrant_similar_results( - input_query, num_resources=6, collection_name=RESOURCES_COLLECTION_NAME + input_query, + num_resources=6, + collection_name=RESOURCES_COLLECTION_NAME, + score_threshold=0, ): """ Get similar resources from qdrant @@ -971,6 +975,7 @@ def _qdrant_similar_results( query=input_query, limit=num_resources, using=encoder.model_short_name(), + score_threshold=score_threshold, ).points ] From 57d48f7aced3728d86729c6a309293959f879f7d Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 14:26:25 -0400 Subject: [PATCH 10/19] adding management command to sync topics and adding a cache mechanism for assignment --- learning_resources_search/api.py | 22 ++++++++++++--- learning_resources_search/plugins.py | 1 + .../commands/sync_topic_embeddings.py | 27 +++++++++++++++++++ vector_search/tasks.py | 14 +++++++++- 4 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 vector_search/management/commands/sync_topic_embeddings.py diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index 650978082c..5c14022fde 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -35,7 +35,10 @@ adjust_search_for_percolator, document_percolated_actions, ) -from vector_search.constants import RESOURCES_COLLECTION_NAME, TOPICS_COLLECTION_NAME +from vector_search.constants import ( + RESOURCES_COLLECTION_NAME, + TOPICS_COLLECTION_NAME, +) log = logging.getLogger(__name__) @@ -830,8 +833,11 @@ def user_subscribed_to_query( ) -def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: +def get_similar_topics_qdrant( + resource: LearningResource, value_doc: dict, num_topics: int +) -> list[str]: from vector_search.encoders.utils import dense_encoder + from vector_search.utils import qdrant_client, vector_point_id """ Get a list of similar topics based on vector similarity @@ -846,12 +852,22 @@ def get_similar_topics_qdrant(value_doc: dict, num_topics: int) -> list[str]: list of topic values """ encoder = dense_encoder() + client = qdrant_client() + + response = client.retrieve( + collection_name=RESOURCES_COLLECTION_NAME, + ids=[vector_point_id(resource.readable_id)], + with_vectors=True, + ) embedding_context = "\n".join( [value_doc[key] for key in value_doc if value_doc[key] is not None] ) + if response and len(response) > 0: + embeddings = response[0].vector.get(encoder.model_short_name()) + else: + embeddings = encoder.embed(embedding_context) - embeddings = encoder.embed(embedding_context) return [ hit["name"] for hit in _qdrant_similar_results( diff --git a/learning_resources_search/plugins.py b/learning_resources_search/plugins.py index 581fb1e2c7..f938d316e1 100644 --- a/learning_resources_search/plugins.py +++ b/learning_resources_search/plugins.py @@ -126,6 +126,7 @@ def resource_similar_topics(self, resource) -> list[dict]: } topic_names = get_similar_topics_qdrant( + resource, text_doc, settings.OPEN_VIDEO_MAX_TOPICS, ) diff --git a/vector_search/management/commands/sync_topic_embeddings.py b/vector_search/management/commands/sync_topic_embeddings.py new file mode 100644 index
0000000000..e77516c5ec --- /dev/null +++ b/vector_search/management/commands/sync_topic_embeddings.py @@ -0,0 +1,27 @@ +"""Management command to index content""" + +from django.core.management.base import BaseCommand, CommandError + +from main.utils import clear_search_cache, now_in_utc +from vector_search.tasks import sync_topics + + +class Command(BaseCommand): + """Generates embeddings in Qdrant""" + + help = "update or create the topics collection in Qdrant" + + def handle(self, *args, **options): # noqa: ARG002 + """Sync the topics collection""" + task = sync_topics.apply() + self.stdout.write("Waiting on task...") + start = now_in_utc() + error = task.get() + if error: + msg = f"Generate embeddings errored: {error}" + raise CommandError(msg) + clear_search_cache() + total_seconds = (now_in_utc() - start).total_seconds() + self.stdout.write( + f"Embeddings generated and stored, took {total_seconds} seconds" + ) diff --git a/vector_search/tasks.py b/vector_search/tasks.py index ce7ab862ca..cde57f7637 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -32,7 +32,11 @@ chunks, now_in_utc, ) -from vector_search.utils import embed_learning_resources, remove_qdrant_records +from vector_search.utils import ( + embed_learning_resources, + embed_topics, + remove_qdrant_records, +) log = logging.getLogger(__name__) @@ -362,3 +366,11 @@ def remove_run_content_files(run_id): for ids in chunks(content_file_ids, chunk_size=settings.QDRANT_CHUNK_SIZE) ] ) + + +@app.task +def sync_topics(): + """ + Sync topics collection embeddings to Qdrant + """ + embed_topics() From dd7af3c83f423f4f505ca004b7025226c913cb40 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 14:51:43 -0400 Subject: [PATCH 11/19] fix test --- learning_resources_search/plugins_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/learning_resources_search/plugins_test.py b/learning_resources_search/plugins_test.py index bd15aa1698..8eacaa65d0 100644 --- a/learning_resources_search/plugins_test.py +++ b/learning_resources_search/plugins_test.py @@ -135,12 +135,11 @@ def test_resource_similar_topics(mocker, settings): topics = SearchIndexPlugin().resource_similar_topics(resource) assert topics == [{"name": topic} for topic in expected_topics] mock_similar_topics.assert_called_once_with( + resource, { "title": resource.title, "description": resource.description, "full_description": resource.full_description, }, settings.OPEN_VIDEO_MAX_TOPICS, - settings.OPEN_VIDEO_MIN_TERM_FREQ, - settings.OPEN_VIDEO_MIN_DOC_FREQ, ) From 984d917f447edcb75bf8a66ca0d14f6a9a5b7570 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Fri, 24 Oct 2025 15:44:52 -0400 Subject: [PATCH 12/19] docstring update --- vector_search/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vector_search/tasks.py b/vector_search/tasks.py index cde57f7637..3e497a3d9e 100644 --- a/vector_search/tasks.py +++ b/vector_search/tasks.py @@ -371,6 +371,6 @@ def remove_run_content_files(run_id): @app.task def sync_topics(): """ - Sync topics collection embeddings to Qdrant + Sync topics to the Qdrant collection """ embed_topics() From f66ce0caae6b926bb9a5e87b7f8afdc2a97498e2 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Mon, 27 Oct 2025 19:37:14 -0400 Subject: [PATCH 13/19] update docstrings --- vector_search/management/commands/sync_topic_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vector_search/management/commands/sync_topic_embeddings.py
b/vector_search/management/commands/sync_topic_embeddings.py index e77516c5ec..8a5d1a8487 100644 --- a/vector_search/management/commands/sync_topic_embeddings.py +++ b/vector_search/management/commands/sync_topic_embeddings.py @@ -1,4 +1,4 @@ -"""Management command to index content""" +"""Management command to update or create the topics collection in Qdrant""" from django.core.management.base import BaseCommand, CommandError @@ -7,7 +7,7 @@ class Command(BaseCommand): - """Generates embeddings in Qdrant""" + """Syncs embeddings for topics in Qdrant""" help = "update or create the topics collection in Qdrant" From bb086bb49db87dc3a07290ecf858d69839454452 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 28 Oct 2025 10:08:11 -0400 Subject: [PATCH 14/19] update topic query --- vector_search/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vector_search/utils.py b/vector_search/utils.py index 731f3bf219..1f9d55a735 100644 --- a/vector_search/utils.py +++ b/vector_search/utils.py @@ -2,6 +2,7 @@ import uuid from django.conf import settings +from django.db.models import Q from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_experimental.text_splitter import SemanticChunker from qdrant_client import QdrantClient, models @@ -185,9 +186,9 @@ def embed_topics(): indexed_count = client.count(collection_name=TOPICS_COLLECTION_NAME).count topic_names = set( - LearningResourceTopic.objects.filter(parent__isnull=False).values_list( - "name", flat=True - ) + LearningResourceTopic.objects.filter( + Q(parent=None) | Q(parent__isnull=False) + ).values_list("name", flat=True) ) if indexed_count > 0: From 7c7e18fc71bf5434668082695da3375ce21284be Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 28 Oct 2025 10:09:07 -0400 Subject: [PATCH 15/19] adding topic sync tests --- vector_search/utils_test.py | 64 +++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py index 346ccda029..c4e48cc17b 100644 --- a/vector_search/utils_test.py +++ b/vector_search/utils_test.py @@ -1,4 +1,5 @@ from decimal import Decimal +from unittest.mock import MagicMock import pytest from django.conf import settings @@ -10,6 +11,7 @@ LearningResourceFactory, LearningResourcePriceFactory, LearningResourceRunFactory, + LearningResourceTopicFactory, ) from learning_resources.models import LearningResource from learning_resources.serializers import LearningResourceMetadataDisplaySerializer @@ -35,6 +37,7 @@ _embed_course_metadata_as_contentfile, create_qdrant_collections, embed_learning_resources, + embed_topics, filter_existing_qdrant_points, qdrant_query_conditions, should_generate_content_embeddings, @@ -915,3 +918,64 @@ def test_update_qdrant_indexes_updates_mismatched_field_type(mocker): for index_field in QDRANT_CONTENT_FILE_INDEXES ] mock_client.create_payload_index.assert_has_calls(expected_calls, any_order=True) + + +def test_embed_topics_no_new_topics(mocker): + """ + Test embed_topics when there are no new topics to embed + """ + mock_client = MagicMock() + mock_qdrant_client = mocker.patch("vector_search.utils.qdrant_client") + mock_qdrant_client.return_value = mock_client + mock_client.count.return_value.count = 1 + mock_vector_search = mocker.patch("vector_search.utils.vector_search") + mock_vector_search.return_value = {"hits": [{"name": "topic1"}]} + LearningResourceTopicFactory.create(name="topic1", parent=None) + mock_remove_points_matching_params = mocker.patch( + 
"vector_search.utils.remove_points_matching_params" + ) + embed_topics() + mock_remove_points_matching_params.assert_not_called() + mock_client.upload_points.assert_not_called() + + +def test_embed_topics_new_topics(mocker): + """ + Test embed_topics when there are new topics + """ + mock_client = MagicMock() + mock_qdrant_client = mocker.patch("vector_search.utils.qdrant_client") + mock_qdrant_client.return_value = mock_client + mock_client.count.return_value.count = 1 + mock_vector_search = mocker.patch("vector_search.utils.vector_search") + mock_vector_search.return_value = {"hits": [{"name": "topic1"}]} + LearningResourceTopicFactory.create(name="topic1", parent=None) + LearningResourceTopicFactory.create(name="topic2", parent=None) + LearningResourceTopicFactory.create(name="topic3", parent=None) + mocker.patch("vector_search.utils.remove_points_matching_params") + embed_topics() + mock_client.upload_points.assert_called_once() + assert len(list(mock_client.upload_points.mock_calls[0][2]["points"])) == 2 + + +def test_embed_topics_remove_topics(mocker): + """ + Test embed_topics when there are topics to remove + """ + mock_client = MagicMock() + mock_qdrant_client = mocker.patch("vector_search.utils.qdrant_client") + mock_qdrant_client.return_value = mock_client + mock_client.count.return_value.count = 1 + mock_vector_search = mocker.patch("vector_search.utils.vector_search") + mock_vector_search.return_value = {"hits": [{"name": "remove-topic"}]} + + LearningResourceTopicFactory.create(name="topic2", parent=None) + LearningResourceTopicFactory.create(name="topic3", parent=None) + mock_remove_points_matching_params = mocker.patch( + "vector_search.utils.remove_points_matching_params" + ) + embed_topics() + mock_remove_points_matching_params.assert_called_once() + assert ( + mock_remove_points_matching_params.mock_calls[0][1][0]["name"] == "remove-topic" + ) From 1688c5c277b1cf0784aa1b9aa081a90447b9aa73 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 28 Oct 2025 11:37:48 -0400 Subject: [PATCH 16/19] switch default number of topics to 2 --- main/settings_course_etl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/settings_course_etl.py b/main/settings_course_etl.py index a5bf02954e..6f70c4d6c4 100644 --- a/main/settings_course_etl.py +++ b/main/settings_course_etl.py @@ -96,7 +96,7 @@ # course catalog video etl settings OPEN_VIDEO_DATA_BRANCH = get_string("OPEN_VIDEO_DATA_BRANCH", "master") OPEN_VIDEO_USER_LIST_OWNER = get_string("OPEN_VIDEO_USER_LIST_OWNER", None) -OPEN_VIDEO_MAX_TOPICS = get_int("OPEN_VIDEO_MAX_TOPICS", 3) +OPEN_VIDEO_MAX_TOPICS = get_int("OPEN_VIDEO_MAX_TOPICS", 2) OPEN_VIDEO_MIN_TERM_FREQ = get_int("OPEN_VIDEO_MIN_TERM_FREQ", 1) OPEN_VIDEO_MIN_DOC_FREQ = get_int("OPEN_VIDEO_MIN_DOC_FREQ", 15) From a0e34f9aff6a55005fc0100e66c8dbca2694b508 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Tue, 28 Oct 2025 12:08:17 -0400 Subject: [PATCH 17/19] test for cached embedding --- learning_resources_search/api_test.py | 39 ++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/learning_resources_search/api_test.py b/learning_resources_search/api_test.py index c60ac3e0ee..6bb74a95f5 100644 --- a/learning_resources_search/api_test.py +++ b/learning_resources_search/api_test.py @@ -1,6 +1,6 @@ """Search API function tests""" -from unittest.mock import Mock +from unittest.mock import MagicMock, Mock import pytest from freezegun import freeze_time @@ -21,6 +21,7 @@ generate_sort_clause, generate_suggest_clause, 
get_similar_topics, + get_similar_topics_qdrant, percolate_matches_for_document, relevant_indexes, ) @@ -3266,3 +3267,39 @@ def test_dev_mode(dev_mode): assert construct_search(search_params).to_dict().get("explain") else: assert construct_search(search_params).to_dict().get("explain") is None + + +@pytest.mark.django_db +def test_get_similar_topics_qdrant_uses_cached_embedding(mocker): + """ + Test that get_similar_topics_qdrant uses a cached embedding when available + """ + resource = MagicMock() + resource.readable_id = "test-resource" + value_doc = {"title": "Test Title", "description": "Test Description"} + num_topics = 3 + + mock_encoder = mocker.patch("vector_search.encoders.utils.dense_encoder") + encoder_instance = mock_encoder.return_value + encoder_instance.model_short_name.return_value = "test-model" + encoder_instance.embed.return_value = [0.1, 0.2, 0.3] + + mock_client = mocker.patch("vector_search.utils.qdrant_client") + client_instance = mock_client.return_value + + # Simulate a cached embedding in the response + client_instance.retrieve.return_value = [ + MagicMock(vector={"test-model": [0.9, 0.8, 0.7]}) + ] + + mocker.patch( + "learning_resources_search.api._qdrant_similar_results", + return_value=[{"name": "topic1"}, {"name": "topic2"}], + ) + + result = get_similar_topics_qdrant(resource, value_doc, num_topics) + + # Assert that embed was NOT called (cached embedding used) + encoder_instance.embed.assert_not_called() + # Assert that the result is as expected + assert result == ["topic1", "topic2"] From aca10348c7aecfc0eb23020ad86dbf6ad80e5461 Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 29 Oct 2025 11:01:22 -0400 Subject: [PATCH 18/19] relocating import and making test topics a fixture --- learning_resources/etl/loaders_test.py | 27 ++++++++++++-------------- learning_resources_search/api.py | 3 +-- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/learning_resources/etl/loaders_test.py b/learning_resources/etl/loaders_test.py index 8452213a2a..72b074754a 100644 --- a/learning_resources/etl/loaders_test.py +++ b/learning_resources/etl/loaders_test.py @@ -131,6 +131,14 @@ def mock_duplicates(mocker): ) +@pytest.fixture +def mock_get_similar_topics_qdrant(mocker): + mocker.patch( + "learning_resources_search.plugins.get_similar_topics_qdrant", + return_value=["topic1", "topic2"], + ) + + @pytest.fixture(autouse=True) def mock_upsert_tasks(mocker): """Mock out the upsert task helpers""" @@ -1465,14 +1473,10 @@ def test_load_video(mocker, video_exists, is_published, pass_topics): assert getattr(result, key) == value, f"Property {key} should equal {value}" -def test_load_videos(mocker): +def test_load_videos(mocker, mock_get_similar_topics_qdrant): """Verify that load_videos loads a list of videos""" assert Video.objects.count() == 0 - mocker.patch( - "learning_resources_search.plugins.get_similar_topics_qdrant", - return_value=["topic1", "topic2"], - ) video_resources = [video.learning_resource for video in VideoFactory.build_batch(5)] videos_data = [ { @@ -1491,17 +1495,14 @@ def test_load_videos(mocker): @pytest.mark.parametrize("playlist_exists", [True, False]) -def test_load_playlist(mocker, playlist_exists): +def test_load_playlist(mocker, playlist_exists, mock_get_similar_topics_qdrant): """Test load_playlist""" expected_topics = [{"name": "Biology"}, {"name": "Physics"}] [ LearningResourceTopicFactory.create(name=topic["name"]) for topic in expected_topics ] - mocker.patch( - "learning_resources_search.plugins.get_similar_topics_qdrant", - 
return_value=["topic1", "topic2"], - ) + mock_most_common_topics = mocker.patch( "learning_resources.etl.loaders.most_common_topics", return_value=expected_topics, @@ -1913,11 +1914,7 @@ def test_course_with_unpublished_force_ingest_is_test_mode(): @pytest.mark.django_db -def test_load_articles(mocker, climate_platform): - mocker.patch( - "learning_resources_search.plugins.get_similar_topics_qdrant", - return_value=["topic1", "topic2"], - ) +def test_load_articles(mocker, climate_platform, mock_get_similar_topics_qdrant): articles_data = [ { "title": "test", diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py index 5c14022fde..b6108f04fa 100644 --- a/learning_resources_search/api.py +++ b/learning_resources_search/api.py @@ -39,6 +39,7 @@ RESOURCES_COLLECTION_NAME, TOPICS_COLLECTION_NAME, ) +from vector_search.encoders.utils import dense_encoder log = logging.getLogger(__name__) @@ -836,7 +837,6 @@ def user_subscribed_to_query( def get_similar_topics_qdrant( resource: LearningResource, value_doc: dict, num_topics: int ) -> list[str]: - from vector_search.encoders.utils import dense_encoder from vector_search.utils import qdrant_client, vector_point_id """ @@ -978,7 +978,6 @@ def _qdrant_similar_results( list of serialized resources """ from vector_search.utils import ( - dense_encoder, qdrant_client, ) From fe3b12eb6179229015dfcc61f0e895884d0b013f Mon Sep 17 00:00:00 2001 From: shankar ambady Date: Wed, 29 Oct 2025 11:24:26 -0400 Subject: [PATCH 19/19] fixing mock --- learning_resources_search/api_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/learning_resources_search/api_test.py b/learning_resources_search/api_test.py index 6bb74a95f5..25f5117ef2 100644 --- a/learning_resources_search/api_test.py +++ b/learning_resources_search/api_test.py @@ -3279,7 +3279,7 @@ def test_get_similar_topics_qdrant_uses_cached_embedding(mocker): value_doc = {"title": "Test Title", "description": "Test Description"} num_topics = 3 - mock_encoder = mocker.patch("vector_search.encoders.utils.dense_encoder") + mock_encoder = mocker.patch("learning_resources_search.api.dense_encoder") encoder_instance = mock_encoder.return_value encoder_instance.model_short_name.return_value = "test-model" encoder_instance.embed.return_value = [0.1, 0.2, 0.3]