Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions chromadb/test/ef/test_ef.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None:
"SentenceTransformerEmbeddingFunction",
"Text2VecEmbeddingFunction",
"ChromaLangchainEmbeddingFunction",
"VoyageAIEmbeddingFunction",
}

assert expected_builtins == embedding_functions.get_builtins()
Expand Down
161 changes: 161 additions & 0 deletions chromadb/test/ef/test_voyageai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
import os

import pytest

from chromadb.utils.embedding_functions.voyage_ai_embedding_function import (
VoyageAIEmbeddingFunction,
)

voyageai = pytest.importorskip("voyageai", reason="voyageai not installed")


@pytest.fixture(scope="function")
def remove_api_key():
    """Temporarily remove ``VOYAGE_API_KEY`` from the environment.

    The variable (if present) is removed before the test body runs and put
    back afterwards, so tests can exercise the "no API key available" path
    without leaking that state into other tests.
    """
    # pop() both saves and removes the value; None means it was never set.
    saved_api_key = os.environ.pop("VOYAGE_API_KEY", None)
    yield
    # Compare against None so that an empty-string value is restored as well.
    if saved_api_key is not None:
        os.environ["VOYAGE_API_KEY"] = saved_api_key


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage() -> None:
    """Embed a single document with the default settings."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    embed = VoyageAIEmbeddingFunction(api_key=key)
    vectors = embed(["test doc"])
    assert vectors is not None
    # One input document must yield exactly one non-empty vector.
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_input_type_query() -> None:
    """Embed a single document with ``input_type`` set to QUERY."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    query_type = VoyageAIEmbeddingFunction.InputType.QUERY
    embed = VoyageAIEmbeddingFunction(api_key=key, input_type=query_type)
    vectors = embed(["test doc"])
    assert vectors is not None
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_input_type_document() -> None:
    """Embed a single document with ``input_type`` set to DOCUMENT."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    document_type = VoyageAIEmbeddingFunction.InputType.DOCUMENT
    embed = VoyageAIEmbeddingFunction(api_key=key, input_type=document_type)
    vectors = embed(["test doc"])
    assert vectors is not None
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_model() -> None:
    """Embed a code snippet with an explicitly selected model."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    embed = VoyageAIEmbeddingFunction(api_key=key, model_name="voyage-01")
    snippet = "def test():\n return 1"
    vectors = embed([snippet])
    assert vectors is not None
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_truncation_default() -> None:
    """A very long document is still embedded when truncation is left at its default."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    embed = VoyageAIEmbeddingFunction(api_key=key)
    oversized_doc = "this is a test-message" * 10000
    vectors = embed([oversized_doc])
    assert vectors is not None
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_truncation_enabled() -> None:
    """A very long document is embedded when truncation is explicitly enabled."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    embed = VoyageAIEmbeddingFunction(api_key=key, truncation=True)
    oversized_doc = "this is a test-message" * 10000
    vectors = embed([oversized_doc])
    assert vectors is not None
    assert len(vectors) == 1
    assert len(vectors[0]) > 0


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_truncation_disabled() -> None:
    """A very long document is rejected when truncation is disabled."""
    key = os.environ.get("VOYAGE_API_KEY", "")
    embed = VoyageAIEmbeddingFunction(api_key=key, truncation=False)
    oversized_doc = "this is a test-message" * 10000
    with pytest.raises(Exception, match="your batch has too many tokens"):
        embed([oversized_doc])


@pytest.mark.skipif(
    "VOYAGE_API_KEY" not in os.environ,
    reason="VOYAGE_API_KEY not set, not going to test VoyageAI EF.",
)
def test_voyage_env_api_key() -> None:
    """Construction succeeds without an explicit key when VOYAGE_API_KEY is set."""
    # No assertion needed: the test passes as long as the constructor does not raise.
    VoyageAIEmbeddingFunction()


def test_voyage_no_api_key(remove_api_key) -> None:
    """Passing ``api_key=None`` with no VOYAGE_API_KEY in the environment is rejected.

    The ``remove_api_key`` fixture guarantees the environment variable is
    absent, so this test needs no real key and is not skipped when one is
    missing.
    """
    with pytest.raises(ValueError, match="Please provide a VoyageAI API key"):
        VoyageAIEmbeddingFunction(api_key=None)  # type: ignore


def test_voyage_no_api_key_in_env(remove_api_key) -> None:
    """Omitting ``api_key`` entirely with no VOYAGE_API_KEY in the environment is rejected.

    This covers the default-argument path, where the constructor can only
    rely on the environment. The ``remove_api_key`` fixture guarantees the
    variable is absent, so no real key is required to run this test.
    """
    with pytest.raises(ValueError, match="Please provide a VoyageAI API key"):
        VoyageAIEmbeddingFunction()


def test_voyage_max_batch_size_exceeded_in_init() -> None:
    """A ``max_batch_size`` above the limit is rejected at construction.

    The constructor validates the batch size against
    ``voyageai.VOYAGE_EMBED_BATCH_SIZE`` and raises, so a dummy key is enough
    and the test does not depend on VOYAGE_API_KEY being set.
    """
    with pytest.raises(ValueError, match="The maximum batch size supported is"):
        VoyageAIEmbeddingFunction(api_key="dummy", max_batch_size=99999999)


def test_voyage_max_batch_size_exceeded_in_call() -> None:
    """Calling with more documents than ``max_batch_size`` is rejected.

    The size check in ``__call__`` raises before the client's ``embed`` is
    invoked, so a dummy key is enough and the test does not depend on
    VOYAGE_API_KEY being set.
    """
    ef = VoyageAIEmbeddingFunction(api_key="dummy", max_batch_size=1)
    with pytest.raises(ValueError, match="The maximum batch size supported is"):
        ef(["test doc"] * 2)
77 changes: 77 additions & 0 deletions chromadb/utils/embedding_functions/voyage_ai_embedding_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
from enum import Enum
from typing import Optional, cast

from chromadb.api.types import (
Documents,
EmbeddingFunction,
Embeddings,
)


class VoyageAIEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embedding function for Voyageai.com. API docs - https://docs.voyageai.com/reference/embeddings-api"""

    class InputType(str, Enum):
        """Values accepted for the `input_type` constructor argument."""

        DOCUMENT = "document"
        QUERY = "query"

    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: str = "voyage-2",
        max_batch_size: int = 128,
        truncation: Optional[bool] = True,
        input_type: Optional[InputType] = None,
    ):
        """
        Initialize the VoyageAIEmbeddingFunction.

        Args:
            api_key (str, optional): Your API key for the VoyageAI API. May be omitted when the
                `VOYAGE_API_KEY` environment variable is set.
            model_name (str, optional): The name of the model to use for text embeddings. Defaults to "voyage-2".
            max_batch_size (int, optional): The maximum number of documents accepted per call. Defaults to 128.
                Must not exceed `voyageai.VOYAGE_EMBED_BATCH_SIZE`.
            truncation (bool, optional): Forwarded to the VoyageAI client as `truncation`. Defaults to `True`.
            input_type (InputType, optional): The type of input text. Can be `None`, `InputType.QUERY`,
                `InputType.DOCUMENT`. Defaults to `None`.

        Raises:
            ValueError: If no API key is given and `VOYAGE_API_KEY` is not set, if the `voyageai`
                package is not installed, or if `max_batch_size` exceeds the supported maximum.
        """

        # Checked before importing voyageai so a missing key is reported even
        # when the package is not installed.
        if not api_key and "VOYAGE_API_KEY" not in os.environ:
            raise ValueError("Please provide a VoyageAI API key.")

        # Keep the try body to the one statement that can raise ImportError.
        try:
            import voyageai
        except ImportError as e:
            raise ValueError(
                "The VoyageAI python package is not installed. Please install it with `pip install voyageai`"
            ) from e

        if max_batch_size > voyageai.VOYAGE_EMBED_BATCH_SIZE:
            raise ValueError(
                f"The maximum batch size supported is {voyageai.VOYAGE_EMBED_BATCH_SIZE}."
            )
        self._batch_size = max_batch_size
        self._model = model_name
        self._truncation = truncation
        # api_key may be None here; presumably the client then falls back to
        # VOYAGE_API_KEY - confirm against the voyageai client documentation.
        self._client = voyageai.Client(api_key=api_key)
        self._input_type = input_type

    def __call__(self, input: Documents) -> Embeddings:
        """
        Get the embeddings for a list of texts.

        Args:
            input (Documents): A list of texts to get embeddings for. Must not hold more
                items than the `max_batch_size` given at construction.

        Returns:
            Embeddings: The embeddings for the texts.

        Raises:
            ValueError: If `input` holds more documents than the configured maximum batch size.

        Example:
            >>> voyage_ef = VoyageAIEmbeddingFunction(api_key="your_api_key")
            >>> input = ["Hello, world!", "How are you?"]
            >>> embeddings = voyage_ef(input)
        """
        # Reject oversized batches locally instead of sending them to the API.
        if len(input) > self._batch_size:
            raise ValueError(f"The maximum batch size supported is {self._batch_size}.")
        results = self._client.embed(
            texts=input,
            model=self._model,
            truncation=self._truncation,
            input_type=self._input_type,
        )
        return cast(Embeddings, results.embeddings)
17 changes: 9 additions & 8 deletions docs/docs.trychroma.com/pages/guides/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@ Chroma provides lightweight wrappers around popular embedding providers, making
{% special_table %}
{% /special_table %}

| | Python | JS |
|--------------|-----------|---------------|
| [OpenAI](/integrations/openai) | ✅ | ✅ |
| [Google Generative AI](/integrations/google-gemini) | ✅ | ✅ |
| [Cohere](/integrations/cohere) | ✅ | ✅ |
| [Hugging Face](/integrations/hugging-face) | ✅ | ➖ |
| [Instructor](/integrations/instructor) | ✅ | ➖ |
| | Python | JS |
|--------------------------------------------------------------------|-----------|---------------|
| [OpenAI](/integrations/openai) | ✅ | ✅ |
| [Google Generative AI](/integrations/google-gemini) | ✅ | ✅ |
| [Cohere](/integrations/cohere) | ✅ | ✅ |
| [Hugging Face](/integrations/hugging-face) | ✅ | ➖ |
| [Instructor](/integrations/instructor) | ✅ | ➖ |
| [Hugging Face Embedding Server](/integrations/hugging-face-server) | ✅ | ✅ |
| [Jina AI](/integrations/jinaai) | ✅ | ✅ |
| [Jina AI](/integrations/jinaai) | ✅ | ✅ |
| [Voyage AI](/integrations/voyageai) | ✅ | ✅ |

We welcome pull requests to add new Embedding Functions to the community.

Expand Down
1 change: 1 addition & 0 deletions docs/docs.trychroma.com/pages/integrations/_sidenav.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export const items = [
{ href: '/integrations/jinaai', children: 'JinaAI' },
{ href: '/integrations/roboflow', children: 'Roboflow' },
{ href: '/integrations/ollama', children: 'Ollama Embeddings' },
{ href: '/integrations/voyageai', children: 'Voyage AI Embeddings' },
]
},
{
Expand Down
41 changes: 41 additions & 0 deletions docs/docs.trychroma.com/pages/integrations/voyageai.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
title: Voyage AI Embeddings
---

Chroma also provides a convenient wrapper around VoyageAI's embedding API. This embedding function runs remotely on VoyageAI’s servers, and requires an API key. You can get an API key by signing up for an account at [VoyageAI](https://dash.voyageai.com/api-keys).

{% tabs group="code-lang" %}
{% tab label="Python" %}

This embedding function relies on the `voyageai` python package, which you can install with `pip install voyageai`.

```python
from chromadb.utils.embedding_functions.voyage_ai_embedding_function import VoyageAIEmbeddingFunction
voyageai_ef = VoyageAIEmbeddingFunction(api_key="YOUR_API_KEY", model_name="voyage-law-2", input_type=VoyageAIEmbeddingFunction.InputType.DOCUMENT)
result = voyageai_ef(input=["document1","document2"])
```

{% /tab %}
{% tab label="Javascript" %}

```javascript
const {VoyageAIEmbeddingFunction, InputType} = require('chromadb');
// import {VoyageAIEmbeddingFunction, InputType} from "chromadb"; // ESM import
const embedder = new VoyageAIEmbeddingFunction("apiKey", "voyage-law-2", InputType.DOCUMENT)

// use directly
const embeddings = embedder.generate(["document1","document2"])

// pass documents to query for .add and .query
const collection = await client.createCollection({name: "name", embeddingFunction: embedder})
const collectionGet = await client.getCollection({name:"name", embeddingFunction: embedder})
```

{% /tab %}

{% /tabs %}

You should pass in the `model_name` argument, which lets you choose which VoyageAI embeddings model to use. You can see the available models [here](https://docs.voyageai.com/docs/embeddings).