From 5f603ebedd598b2b45b2e23028ac881edf13ec17 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Tue, 9 Apr 2024 12:01:36 +0530 Subject: [PATCH 1/6] [ENH]: FastEmbed embedding function support --- chromadb/test/ef/test_fastembed_ef.py | 13 +++++ chromadb/utils/embedding_functions.py | 78 +++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 chromadb/test/ef/test_fastembed_ef.py diff --git a/chromadb/test/ef/test_fastembed_ef.py b/chromadb/test/ef/test_fastembed_ef.py new file mode 100644 index 00000000000..59ba95499b4 --- /dev/null +++ b/chromadb/test/ef/test_fastembed_ef.py @@ -0,0 +1,13 @@ +import pytest + +from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction + +# Skip test if the 'fastembed' package is not installed is not installed +fastembed = pytest.importorskip("fastembed", reason="fastembed not installed") + + +def test_fastembed() -> None: + ef = FastEmbedEmbeddingFunction(model_name="BAAI/bge-small-en-v1.5") + embeddings = ef(["Here is an article about llamas...", "this is another article"]) + assert len(embeddings) == 2 + assert len(embeddings[0]) == 384 diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 3f0a1ce043b..3544b286d16 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -743,9 +743,7 @@ def __call__(self, input: Union[Documents, Images]) -> Embeddings: class RoboflowEmbeddingFunction(EmbeddingFunction[Union[Documents, Images]]): - def __init__( - self, api_key: str = "", api_url = "https://infer.roboflow.com" - ) -> None: + def __init__(self, api_key: str = "", api_url="https://infer.roboflow.com") -> None: """ Create a RoboflowEmbeddingFunction. @@ -757,7 +755,7 @@ def __init__( api_key = os.environ.get("ROBOFLOW_API_KEY") self._api_url = api_url - self._api_key = api_key + self._api_key = api_key try: self._PILImage = importlib.import_module("PIL.Image") @@ -789,10 +787,10 @@ def __call__(self, input: Union[Documents, Images]) -> Embeddings: json=infer_clip_payload, ) - result = res.json()['embeddings'] + result = res.json()["embeddings"] embeddings.append(result[0]) - + elif is_document(item): infer_clip_payload = { "text": input, @@ -803,13 +801,13 @@ def __call__(self, input: Union[Documents, Images]) -> Embeddings: json=infer_clip_payload, ) - result = res.json()['embeddings'] + result = res.json()["embeddings"] embeddings.append(result[0]) return embeddings - + class AmazonBedrockEmbeddingFunction(EmbeddingFunction[Documents]): def __init__( self, @@ -909,7 +907,8 @@ def create_langchain_embedding(langchain_embdding_fn: Any): # type: ignore ) class ChromaLangchainEmbeddingFunction( - LangchainEmbeddings, EmbeddingFunction[Union[Documents, Images]] # type: ignore + LangchainEmbeddings, + EmbeddingFunction[Union[Documents, Images]], # type: ignore ): """ This class is used as bridge between langchain embedding functions and custom chroma embedding functions. @@ -962,7 +961,7 @@ def __call__(self, input: Documents) -> Embeddings: # type: ignore return ChromaLangchainEmbeddingFunction(embedding_function=langchain_embdding_fn) - + class OllamaEmbeddingFunction(EmbeddingFunction[Documents]): """ This class is used to generate embeddings for a list of texts using the Ollama Embedding API (https://github.com/ollama/ollama/blob/main/docs/api.md#generate-embeddings). @@ -1018,7 +1017,64 @@ def __call__(self, input: Documents) -> Embeddings: ], ) - + +class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]): + """ + This class is used to generate embeddings for a list of texts using FastEmbed - https://qdrant.github.io/fastembed/. + Find the list of supported models at https://qdrant.github.io/fastembed/examples/Supported_Models/. + """ + + def __init__( + self, + model_name: str = "BAAI/bge-small-en-v1.5", + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + **kwargs, + ) -> None: + """ + Initialize fastembed.TextEmbedding + + Args: + model_name (str): The name of the model to use. + cache_dir (str, optional): The path to the model cache directory. + Can also be set using the `FASTEMBED_CACHE_PATH` env variable. + threads (int, optional): The number of threads single onnxruntime session can use.. + + Raises: + ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. + """ + try: + from fastembed import TextEmbedding + except ImportError: + raise ValueError( + "The 'fastembed' package is not installed. Please install it with `pip install fastembed`" + ) + self._model = TextEmbedding( + model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs + ) + + def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. + + Example: + >>> fastembed_ef = FastEmbedEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2") + >>> texts = ["Hello, world!", "How are you?"] + >>> embeddings = fastembed_ef(texts) + """ + embeddings = self._model.embed(input) + return cast( + Embeddings, + [embedding.tolist() for embedding in embeddings], + ) + + # List of all classes in this module _classes = [ name From 51f64cfefd255dbea08ee888248a8a563da74217 Mon Sep 17 00:00:00 2001 From: Anush008 Date: Wed, 10 Apr 2024 13:27:08 +0530 Subject: [PATCH 2/6] chore: batch_size, parallel options --- chromadb/utils/embedding_functions.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/chromadb/utils/embedding_functions.py b/chromadb/utils/embedding_functions.py index 3544b286d16..ef8421580dc 100644 --- a/chromadb/utils/embedding_functions.py +++ b/chromadb/utils/embedding_functions.py @@ -1027,18 +1027,27 @@ class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]): def __init__( self, model_name: str = "BAAI/bge-small-en-v1.5", + batch_size: int = 256, cache_dir: Optional[str] = None, threads: Optional[int] = None, + parallel: Optional[int] = None, **kwargs, ) -> None: """ Initialize fastembed.TextEmbedding Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the model cache directory. + model_name (str): The name of the model to use. Defaults to `"BAAI/bge-small-en-v1.5"`. + batch_size (int): Batch size for encoding. Higher values will use more memory, but be faster.\ + Defaults to 256. + cache_dir (str, optional): The path to the model cache directory.\ Can also be set using the `FASTEMBED_CACHE_PATH` env variable. - threads (int, optional): The number of threads single onnxruntime session can use.. + threads (int, optional): The number of threads single onnxruntime session can use. + parallel (int, optional): If `>1`, data-parallel encoding will be used, recommended for offline encoding of large datasets.\ + If `0`, use all available cores.\ + If `None`, don't use data-parallel processing, use default onnxruntime threading instead.\ + Defaults to None. + **kwargs: Additional options to pass to fastembed.TextEmbedding Raises: ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. @@ -1049,6 +1058,8 @@ def __init__( raise ValueError( "The 'fastembed' package is not installed. Please install it with `pip install fastembed`" ) + self._batch_size = batch_size + self._parallel = parallel self._model = TextEmbedding( model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs ) @@ -1068,7 +1079,9 @@ def __call__(self, input: Documents) -> Embeddings: >>> texts = ["Hello, world!", "How are you?"] >>> embeddings = fastembed_ef(texts) """ - embeddings = self._model.embed(input) + embeddings = self._model.embed( + input, batch_size=self._batch_size, parallel=self._parallel + ) return cast( Embeddings, [embedding.tolist() for embedding in embeddings], From ddc92141f2d9d33614c83ca4eaa77c891dee9d75 Mon Sep 17 00:00:00 2001 From: Anush Date: Tue, 14 May 2024 21:05:02 +0530 Subject: [PATCH 3/6] docs: fastembed.md --- .../pages/integrations/fastembed.md | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 docs/docs.trychroma.com/pages/integrations/fastembed.md diff --git a/docs/docs.trychroma.com/pages/integrations/fastembed.md b/docs/docs.trychroma.com/pages/integrations/fastembed.md new file mode 100644 index 00000000000..095604c2fbb --- /dev/null +++ b/docs/docs.trychroma.com/pages/integrations/fastembed.md @@ -0,0 +1,29 @@ +--- +title: FastEmbed +--- + +# FastEmbed + +[FastEmbed](https://qdrant.github.io/fastembed/) is a lightweight, CPU-first Python library built for embedding generation. + +This embedding function requires the `fastembed` package. To install it, run + +```pip install fastembed```. + +You can find a list of all the supported models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/). + +## Example usage: + +Using the default BAAI/bge-small-en-v1.5 model. + +```python +from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction +ef = FastEmbedEmbeddingFunction() +``` + +Additionally, you can also configure the cache directory, number of threads and other FastEmbed options. + +```python +from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction +ef = FastEmbedEmbeddingFunction(model_name="nomic-ai/nomic-embed-text-v1.5", cache_dir="models_cache", threads=5) +``` From 3f329e822d36208bd4d2e304f757c6f6f4cdf088 Mon Sep 17 00:00:00 2001 From: Anush Date: Tue, 14 May 2024 21:06:43 +0530 Subject: [PATCH 4/6] docs: Updated embeddings.md --- docs/docs.trychroma.com/pages/guides/embeddings.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/docs.trychroma.com/pages/guides/embeddings.md b/docs/docs.trychroma.com/pages/guides/embeddings.md index d523c7d3089..31a35a15f05 100644 --- a/docs/docs.trychroma.com/pages/guides/embeddings.md +++ b/docs/docs.trychroma.com/pages/guides/embeddings.md @@ -18,6 +18,7 @@ Chroma provides lightweight wrappers around popular embedding providers, making | [Instructor](/integrations/instructor) | ✅ | ➖ | | [Hugging Face Embedding Server](/integrations/hugging-face-server) | ✅ | ✅ | | [Jina AI](/integrations/jinaai) | ✅ | ✅ | +| [FastEmbed](/integrations/fastembed) | ✅ | ➖ | We welcome pull requests to add new Embedding Functions to the community. From 48f89da2d4d2201b735aaf05d7a12cb45165ca3b Mon Sep 17 00:00:00 2001 From: Anush008 Date: Thu, 27 Jun 2024 23:01:23 +0530 Subject: [PATCH 5/6] refactor: separate file --- chromadb/test/ef/test_ef.py | 1 + chromadb/test/ef/test_fastembed_ef.py | 4 +- .../fastembed_embedding_function.py | 60 +++++++++++++++++++ .../pages/integrations/fastembed.md | 4 +- 4 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 chromadb/utils/embedding_functions/fastembed_embedding_function.py diff --git a/chromadb/test/ef/test_ef.py b/chromadb/test/ef/test_ef.py index c93502e3fc8..da10b33e5b2 100644 --- a/chromadb/test/ef/test_ef.py +++ b/chromadb/test/ef/test_ef.py @@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None: "SentenceTransformerEmbeddingFunction", "Text2VecEmbeddingFunction", "ChromaLangchainEmbeddingFunction", + "FastEmbedEmbeddingFunction", } assert expected_builtins == embedding_functions.get_builtins() diff --git a/chromadb/test/ef/test_fastembed_ef.py b/chromadb/test/ef/test_fastembed_ef.py index 59ba95499b4..989d240390a 100644 --- a/chromadb/test/ef/test_fastembed_ef.py +++ b/chromadb/test/ef/test_fastembed_ef.py @@ -1,6 +1,8 @@ import pytest -from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction +from chromadb.utils.embedding_functions.fastembed_embedding_function import ( + FastEmbedEmbeddingFunction, +) # Skip test if the 'fastembed' package is not installed is not installed fastembed = pytest.importorskip("fastembed", reason="fastembed not installed") diff --git a/chromadb/utils/embedding_functions/fastembed_embedding_function.py b/chromadb/utils/embedding_functions/fastembed_embedding_function.py new file mode 100644 index 00000000000..07a1cacfc4c --- /dev/null +++ b/chromadb/utils/embedding_functions/fastembed_embedding_function.py @@ -0,0 +1,60 @@ +from typing import Any, Optional, cast + +from chromadb.api.types import Documents, EmbeddingFunction, Embeddings + + +class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]): + """ + This class is used to generate embeddings for a list of texts using FastEmbed - https://qdrant.github.io/fastembed/. + Find the list of supported models at https://qdrant.github.io/fastembed/examples/Supported_Models/. + """ + + def __init__( + self, + model_name: str = "BAAI/bge-small-en-v1.5", + cache_dir: Optional[str] = None, + threads: Optional[int] = None, + **kwargs: Any, + ) -> None: + """ + Initialize fastembed.TextEmbedding + + Args: + model_name (str): The name of the model to use. + cache_dir (str, optional): The path to the model cache directory. + Can also be set using the `FASTEMBED_CACHE_PATH` env variable. + threads (int, optional): The number of threads single onnxruntime session can use.. + + Raises: + ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. + """ + try: + from fastembed import TextEmbedding + except ImportError: + raise ValueError( + "The 'fastembed' package is not installed. Please install it with `pip install fastembed`" + ) + self._model = TextEmbedding( + model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs + ) + + def __call__(self, input: Documents) -> Embeddings: + """ + Get the embeddings for a list of texts. + + Args: + input (Documents): A list of texts to get embeddings for. + + Returns: + Embeddings: The embeddings for the texts. + + Example: + >>> fastembed_ef = FastEmbedEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2") + >>> texts = ["Hello, world!", "How are you?"] + >>> embeddings = fastembed_ef(texts) + """ + embeddings = self._model.embed(input) + return cast( + Embeddings, + [embedding.tolist() for embedding in embeddings], + ) diff --git a/docs/docs.trychroma.com/pages/integrations/fastembed.md b/docs/docs.trychroma.com/pages/integrations/fastembed.md index 095604c2fbb..6fec044a955 100644 --- a/docs/docs.trychroma.com/pages/integrations/fastembed.md +++ b/docs/docs.trychroma.com/pages/integrations/fastembed.md @@ -12,12 +12,12 @@ This embedding function requires the `fastembed` package. To install it, run You can find a list of all the supported models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/). -## Example usage: +## Example usage Using the default BAAI/bge-small-en-v1.5 model. ```python -from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction +from chromadb.utils.embedding_functions.fastembed_embedding_function import FastEmbedEmbeddingFunction ef = FastEmbedEmbeddingFunction() ``` From 48e28cf75a5f163d3b0e89174cc72b5ec0c5cd3e Mon Sep 17 00:00:00 2001 From: Anush008 Date: Wed, 3 Jul 2024 09:16:59 +0530 Subject: [PATCH 6/6] chore: restore parallel and batch size --- .../fastembed_embedding_function.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/chromadb/utils/embedding_functions/fastembed_embedding_function.py b/chromadb/utils/embedding_functions/fastembed_embedding_function.py index 07a1cacfc4c..af6bac84689 100644 --- a/chromadb/utils/embedding_functions/fastembed_embedding_function.py +++ b/chromadb/utils/embedding_functions/fastembed_embedding_function.py @@ -12,18 +12,27 @@ class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]): def __init__( self, model_name: str = "BAAI/bge-small-en-v1.5", + batch_size: int = 256, cache_dir: Optional[str] = None, threads: Optional[int] = None, + parallel: Optional[int] = None, **kwargs: Any, ) -> None: """ Initialize fastembed.TextEmbedding Args: - model_name (str): The name of the model to use. - cache_dir (str, optional): The path to the model cache directory. + model_name (str): The name of the model to use. Defaults to `"BAAI/bge-small-en-v1.5"`. + batch_size (int): Batch size for encoding. Higher values will use more memory, but be faster.\ + Defaults to 256. + cache_dir (str, optional): The path to the model cache directory.\ Can also be set using the `FASTEMBED_CACHE_PATH` env variable. - threads (int, optional): The number of threads single onnxruntime session can use.. + threads (int, optional): The number of threads single onnxruntime session can use. + parallel (int, optional): If `>1`, data-parallel encoding will be used, recommended for offline encoding of large datasets.\ + If `0`, use all available cores.\ + If `None`, don't use data-parallel processing, use default onnxruntime threading instead.\ + Defaults to None. + **kwargs: Additional options to pass to fastembed.TextEmbedding Raises: ValueError: If the model_name is not in the format / e.g. BAAI/bge-base-en. @@ -34,6 +43,8 @@ def __init__( raise ValueError( "The 'fastembed' package is not installed. Please install it with `pip install fastembed`" ) + self._batch_size = batch_size + self._parallel = parallel self._model = TextEmbedding( model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs ) @@ -53,7 +64,9 @@ def __call__(self, input: Documents) -> Embeddings: >>> texts = ["Hello, world!", "How are you?"] >>> embeddings = fastembed_ef(texts) """ - embeddings = self._model.embed(input) + embeddings = self._model.embed( + input, batch_size=self._batch_size, parallel=self._parallel + ) return cast( Embeddings, [embedding.tolist() for embedding in embeddings],