refactor: Separate file

Anush008 · Anush008 · commit db6f7fd2265b · 2024-06-27T22:57:45.000+05:30
diff --git a/chromadb/test/ef/test_ef.py b/chromadb/test/ef/test_ef.py
@@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None:
         "SentenceTransformerEmbeddingFunction",
         "Text2VecEmbeddingFunction",
         "ChromaLangchainEmbeddingFunction",
+        "FastEmbedEmbeddingFunction",
     }
 
     assert expected_builtins == embedding_functions.get_builtins()
diff --git a/chromadb/test/ef/test_fastembed_ef.py b/chromadb/test/ef/test_fastembed_ef.py
@@ -1,6 +1,8 @@
 import pytest
 
-from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction
+from chromadb.utils.embedding_functions.fastembed_embedding_function import (
+    FastEmbedEmbeddingFunction,
+)
 
 # Skip test if the 'fastembed' package is not installed is not installed
 fastembed = pytest.importorskip("fastembed", reason="fastembed not installed")
diff --git a/chromadb/utils/embedding_functions/fastembed_embedding_function.py b/chromadb/utils/embedding_functions/fastembed_embedding_function.py
@@ -0,0 +1,60 @@
+from typing import Any, Optional, cast
+
+from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
+
+
+class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]):
+    """
+    This class is used to generate embeddings for a list of texts using FastEmbed - https://qdrant.github.io/fastembed/.
+    Find the list of supported models at https://qdrant.github.io/fastembed/examples/Supported_Models/.
+    """
+
+    def __init__(
+        self,
+        model_name: str = "BAAI/bge-small-en-v1.5",
+        cache_dir: Optional[str] = None,
+        threads: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """
+        Initialize fastembed.TextEmbedding
+
+        Args:
+            model_name (str): The name of the model to use.
+            cache_dir (str, optional): The path to the model cache directory.
+                                       Can also be set using the `FASTEMBED_CACHE_PATH` env variable.
+            threads (int, optional): The number of threads a single onnxruntime session can use.
+
+        Raises:
+            ValueError: If the 'fastembed' package is not installed, or if the model_name is not in the format <org>/<model>, e.g. BAAI/bge-base-en.
+        """
+        try:
+            from fastembed import TextEmbedding
+        except ImportError:
+            raise ValueError(
+                "The 'fastembed' package is not installed. Please install it with `pip install fastembed`"
+            )
+        self._model = TextEmbedding(
+            model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs
+        )
+
+    def __call__(self, input: Documents) -> Embeddings:
+        """
+        Get the embeddings for a list of texts.
+
+        Args:
+            input (Documents): A list of texts to get embeddings for.
+
+        Returns:
+            Embeddings: The embeddings for the texts.
+
+        Example:
+            >>> fastembed_ef = FastEmbedEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
+            >>> texts = ["Hello, world!", "How are you?"]
+            >>> embeddings = fastembed_ef(texts)
+        """
+        embeddings = self._model.embed(input)
+        return cast(
+            Embeddings,
+            [embedding.tolist() for embedding in embeddings],
+        )
diff --git a/docs/docs.trychroma.com/pages/integrations/fastembed.md b/docs/docs.trychroma.com/pages/integrations/fastembed.md
@@ -12,12 +12,12 @@ This embedding function requires the `fastembed` package. To install it, run
 
 You can find a list of all the supported models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/).
 
-## Example usage:
+## Example usage
 
 Using the default BAAI/bge-small-en-v1.5 model.
 
 ```python
-from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction
+from chromadb.utils.embedding_functions.fastembed_embedding_function import FastEmbedEmbeddingFunction
 ef = FastEmbedEmbeddingFunction()
 ```
 

Original file line number	Diff line number	Diff line change
`@@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None:`
`30`	`30`	`"SentenceTransformerEmbeddingFunction",`
`31`	`31`	`"Text2VecEmbeddingFunction",`
`32`	`32`	`"ChromaLangchainEmbeddingFunction",`
	`33`	`+ "FastEmbedEmbeddingFunction",`
`33`	`34`	`}`
`34`	`35`
`35`	`36`	`assert expected_builtins == embedding_functions.get_builtins()`