-
Notifications
You must be signed in to change notification settings - Fork 3
Vector based topics tagging for videos #2649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
9df62af
3af6f6f
dbc10a3
a81e8d8
67ec1b8
37ee818
bfe4f0c
2d43efa
d512d36
9f47dfc
57d48f7
dd7af3c
984d917
f66ce0c
bb086bb
7c7e18f
1688c5c
88a8333
a0e34f9
aca1034
fe3b12e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,7 +35,10 @@ | |
| adjust_search_for_percolator, | ||
| document_percolated_actions, | ||
| ) | ||
| from vector_search.constants import RESOURCES_COLLECTION_NAME | ||
| from vector_search.constants import ( | ||
| RESOURCES_COLLECTION_NAME, | ||
| TOPICS_COLLECTION_NAME, | ||
| ) | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
|
|
@@ -830,6 +833,52 @@ def user_subscribed_to_query( | |
| ) | ||
|
|
||
|
|
||
def get_similar_topics_qdrant(
    resource: LearningResource, value_doc: dict, num_topics: int
) -> list[str]:
    """
    Get a list of similar topics based on vector similarity

    The stored embedding of the resource is reused when the resource has
    already been indexed in the resources collection; otherwise an embedding
    is computed on the fly from the non-null values of ``value_doc``.

    Args:
        resource (LearningResource):
            the resource whose stored embedding is looked up by readable_id
        value_doc (dict):
            a document representing the data fields we want to search with
        num_topics (int):
            number of topics to return
    Returns:
        list of str:
            list of topic values
    """
    # Kept function-local: importing vector_search at module load time
    # reportedly causes a circular import.
    from vector_search.encoders.utils import dense_encoder
    from vector_search.utils import qdrant_client, vector_point_id

    encoder = dense_encoder()
    client = qdrant_client()

    response = client.retrieve(
        collection_name=RESOURCES_COLLECTION_NAME,
        ids=[vector_point_id(resource.readable_id)],
        with_vectors=True,
    )

    embeddings = None
    if response:
        # Assumes the collection uses named vectors keyed by the encoder's
        # short name (same name passed as `using=` when querying).
        embeddings = response[0].vector.get(encoder.model_short_name())

    if embeddings is None:
        # Resource is not indexed yet (or has no vector for this encoder):
        # embed the document text instead. Built lazily so the join is
        # skipped whenever a stored vector is available.
        embedding_context = "\n".join(
            value for value in value_doc.values() if value is not None
        )
        embeddings = encoder.embed(embedding_context)

    return [
        hit["name"]
        for hit in _qdrant_similar_results(
            input_query=embeddings,
            num_resources=num_topics,
            collection_name=TOPICS_COLLECTION_NAME,
            # Drop weak matches so unrelated topics are not tagged.
            score_threshold=0.2,
        )
    ]
|
|
||
|
|
||
| def get_similar_topics( | ||
| value_doc: dict, num_topics: int, min_term_freq: int, min_doc_freq: int | ||
| ) -> list[str]: | ||
|
|
@@ -909,7 +958,12 @@ def get_similar_resources( | |
| ) | ||
|
|
||
|
|
||
| def _qdrant_similar_results(doc, num_resources): | ||
| def _qdrant_similar_results( | ||
| input_query, | ||
| num_resources=6, | ||
| collection_name=RESOURCES_COLLECTION_NAME, | ||
| score_threshold=0, | ||
| ): | ||
| """ | ||
| Get similar resources from qdrant | ||
|
|
||
|
|
@@ -926,18 +980,18 @@ def _qdrant_similar_results(doc, num_resources): | |
| from vector_search.utils import ( | ||
| dense_encoder, | ||
| qdrant_client, | ||
| vector_point_id, | ||
| ) | ||
|
|
||
| encoder = dense_encoder() | ||
| client = qdrant_client() | ||
| return [ | ||
| hit.payload | ||
| for hit in client.query_points( | ||
| collection_name=RESOURCES_COLLECTION_NAME, | ||
| query=vector_point_id(doc["readable_id"]), | ||
| collection_name=collection_name, | ||
| query=input_query, | ||
| limit=num_resources, | ||
| using=encoder.model_short_name(), | ||
| score_threshold=score_threshold, | ||
| ).points | ||
| ] | ||
|
|
||
|
|
@@ -956,7 +1010,12 @@ def get_similar_resources_qdrant(value_doc: dict, num_resources: int): | |
| list of str: | ||
| list of learning resources | ||
| """ | ||
| hits = _qdrant_similar_results(value_doc, num_resources) | ||
| from vector_search.utils import vector_point_id | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Move to the top of the file.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See the comment about the circular import above. |
||
|
|
||
| hits = _qdrant_similar_results( | ||
| input_query=vector_point_id(value_doc["readable_id"]), | ||
| num_resources=num_resources, | ||
| ) | ||
| return ( | ||
| LearningResource.objects.for_search_serialization() | ||
| .filter( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,27 @@ | ||
| """Management command to update or create the topics collection in Qdrant""" | ||
|
|
||
| from django.core.management.base import BaseCommand, CommandError | ||
|
|
||
| from main.utils import clear_search_cache, now_in_utc | ||
| from vector_search.tasks import sync_topics | ||
|
|
||
|
|
||
class Command(BaseCommand):
    """Syncs embeddings for topics in Qdrant"""

    help = "update or create the topics collection in Qdrant"

    def handle(self, *args, **options):  # noqa: ARG002
        """
        Sync the topics collection

        Runs the ``sync_topics`` task, clears the search cache on success and
        reports the elapsed time.

        Raises:
            CommandError: if the task returns a truthy error value
        """
        # `apply()` executes the task synchronously in this process, so the
        # timer and the progress message must come before the call for the
        # reported duration to include the actual work.
        start = now_in_utc()
        self.stdout.write("Waiting on task...")
        task = sync_topics.apply()
        error = task.get()
        if error:
            msg = f"Generate embeddings errored: {error}"
            raise CommandError(msg)
        clear_search_cache()
        total_seconds = (now_in_utc() - start).total_seconds()
        self.stdout.write(
            f"Embeddings generated and stored, took {total_seconds} seconds"
        )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor nitpick, this could be a fixture since it returns the same 2 topics in all the above tests
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good call. I made it a fixture.