Skip to content

[Store] Add a way to store document content when using Chroma DB #288

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/store/src/Bridge/ChromaDb/Store.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,18 @@ public function add(VectorDocument ...$documents): void
$ids = [];
$vectors = [];
$metadata = [];
$originalDocuments = [];
foreach ($documents as $document) {
$ids[] = (string) $document->id;
$vectors[] = $document->vector->getData();
$metadata[] = $document->metadata->getArrayCopy();
$metadataCopy = $document->metadata->getArrayCopy();
$originalDocuments[] = $document->metadata->getText() ?? '';
unset($metadataCopy[Metadata::KEY_TEXT]);
$metadata[] = $metadataCopy;
}

$collection = $this->client->getOrCreateCollection($this->collectionName);
$collection->add($ids, $vectors, $metadata);
$collection->add($ids, $vectors, $metadata, $originalDocuments);
}

public function query(Vector $vector, array $options = []): array
Expand Down
2 changes: 1 addition & 1 deletion src/store/src/Document/Loader/TextFileLoader.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public function __invoke(string $source, array $options = []): iterable
}

yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
'source' => $source,
Metadata::KEY_SOURCE => $source,
]));
}
}
54 changes: 54 additions & 0 deletions src/store/src/Document/Metadata.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,58 @@
*/
final class Metadata extends \ArrayObject
{
    /** Reserved key holding the identifier of the document a chunk was derived from. */
    public const KEY_PARENT_ID = '_parent_id';

    /** Reserved key holding the textual content associated with the document. */
    public const KEY_TEXT = '_text';

    /** Reserved key holding the source the document was loaded from. */
    public const KEY_SOURCE = '_source';

    /**
     * Tells whether a parent id entry is present.
     */
    public function hasParentId(): bool
    {
        return $this->offsetExists(self::KEY_PARENT_ID);
    }

    /**
     * Returns the stored parent id, or null when no such entry exists.
     */
    public function getParentId(): int|string|null
    {
        if (!$this->hasParentId()) {
            return null;
        }

        return $this->offsetGet(self::KEY_PARENT_ID);
    }

    /**
     * Stores the given parent id under the reserved parent id key.
     */
    public function setParentId(int|string $parentId): void
    {
        $this->offsetSet(self::KEY_PARENT_ID, $parentId);
    }

    /**
     * Tells whether a text entry is present.
     */
    public function hasText(): bool
    {
        return $this->offsetExists(self::KEY_TEXT);
    }

    /**
     * Stores the given text under the reserved text key.
     */
    public function setText(string $text): void
    {
        $this->offsetSet(self::KEY_TEXT, $text);
    }

    /**
     * Returns the stored text, or null when no such entry exists.
     */
    public function getText(): ?string
    {
        if (!$this->hasText()) {
            return null;
        }

        return $this->offsetGet(self::KEY_TEXT);
    }

    /**
     * Tells whether a source entry is present.
     */
    public function hasSource(): bool
    {
        return $this->offsetExists(self::KEY_SOURCE);
    }

    /**
     * Returns the stored source, or null when no such entry exists.
     */
    public function getSource(): ?string
    {
        if (!$this->hasSource()) {
            return null;
        }

        return $this->offsetGet(self::KEY_SOURCE);
    }

    /**
     * Stores the given source under the reserved source key.
     */
    public function setSource(string $source): void
    {
        $this->offsetSet(self::KEY_SOURCE, $source);
    }
}
4 changes: 2 additions & 2 deletions src/store/src/Document/Transformer/TextSplitTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ public function __invoke(iterable $documents, array $options = []): iterable
$chunkText = mb_substr($text, $start, $end - $start);

yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
'parent_id' => $document->id,
'text' => $chunkText,
Metadata::KEY_PARENT_ID => $document->id,
Metadata::KEY_TEXT => $chunkText,
...$document->metadata,
]));

Expand Down
129 changes: 91 additions & 38 deletions src/store/tests/Bridge/ChromaDb/StoreTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use Codewithkyrian\ChromaDB\Client;
use Codewithkyrian\ChromaDB\Resources\CollectionResource;
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;
use Symfony\AI\Platform\Vector\Vector;
use Symfony\AI\Store\Bridge\ChromaDb\Store;
Expand All @@ -24,8 +25,21 @@
#[CoversClass(Store::class)]
final class StoreTest extends TestCase
{
public function testAddDocumentsSuccessfully()
{
/**
* @param array<VectorDocument> $documents
* @param array<string> $expectedIds
* @param array<array<float>> $expectedVectors
* @param array<array<string, mixed>> $expectedMetadata
* @param array<string> $expectedOriginalDocuments
*/
#[DataProvider('addDocumentsProvider')]
public function testAddDocumentsSuccessfully(
array $documents,
array $expectedIds,
array $expectedVectors,
array $expectedMetadata,
array $expectedOriginalDocuments,
): void {
$collection = $this->createMock(CollectionResource::class);
$client = $this->createMock(Client::class);

Expand All @@ -34,49 +48,88 @@ public function testAddDocumentsSuccessfully()
->with('test-collection')
->willReturn($collection);

$uuid1 = Uuid::v4();
$uuid2 = Uuid::v4();

$collection->expects($this->once())
->method('add')
->with(
[(string) $uuid1, (string) $uuid2],
[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
[[], ['title' => 'Test Document']],
);
->with($expectedIds, $expectedVectors, $expectedMetadata, $expectedOriginalDocuments);

$store = new Store($client, 'test-collection');

$document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]));
$document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), new Metadata(['title' => 'Test Document']));

$store->add($document1, $document2);
$store->add(...$documents);
}

public function testAddSingleDocument()
/**
* @return \Iterator<string, array{
* documents: array<VectorDocument>,
* expectedIds: array<string>,
* expectedVectors: array<array<float>>,
* expectedMetadata: array<array<string, mixed>>,
* expectedOriginalDocuments: array<string>
* }>
*/
public static function addDocumentsProvider(): \Iterator
{
$collection = $this->createMock(CollectionResource::class);
$client = $this->createMock(Client::class);

$client->expects($this->once())
->method('getOrCreateCollection')
->with('test-collection')
->willReturn($collection);

$uuid = Uuid::v4();

$collection->expects($this->once())
->method('add')
->with(
[(string) $uuid],
[[0.1, 0.2, 0.3]],
[['title' => 'Test Document', 'category' => 'test']],
);

$store = new Store($client, 'test-collection');

$document = new VectorDocument($uuid, new Vector([0.1, 0.2, 0.3]), new Metadata(['title' => 'Test Document', 'category' => 'test']));

$store->add($document);
yield 'multiple documents with and without metadata' => [
'documents' => [
new VectorDocument(
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
new Vector([0.1, 0.2, 0.3]),
),
new VectorDocument(
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
new Vector([0.4, 0.5, 0.6]),
new Metadata(['title' => 'Test Document']),
),
],
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
'expectedMetadata' => [[], ['title' => 'Test Document']],
'expectedOriginalDocuments' => ['', ''],
];

yield 'single document with metadata' => [
'documents' => [
new VectorDocument(
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
new Vector([0.1, 0.2, 0.3]),
new Metadata(['title' => 'Test Document', 'category' => 'test']),
),
],
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
'expectedVectors' => [[0.1, 0.2, 0.3]],
'expectedMetadata' => [['title' => 'Test Document', 'category' => 'test']],
'expectedOriginalDocuments' => [''],
];

yield 'documents with text content' => [
'documents' => [
new VectorDocument(
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
new Vector([0.1, 0.2, 0.3]),
new Metadata(['_text' => 'This is the content of document 1', 'title' => 'Document 1'])),
new VectorDocument(
Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
new Vector([0.4, 0.5, 0.6]),
new Metadata(['_text' => 'This is the content of document 2', 'title' => 'Document 2', 'category' => 'test']),
),
],
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
'expectedMetadata' => [['title' => 'Document 1'], ['title' => 'Document 2', 'category' => 'test']],
'expectedOriginalDocuments' => ['This is the content of document 1', 'This is the content of document 2'],
];

yield 'document with null text' => [
'documents' => [
new VectorDocument(
Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
new Vector([0.1, 0.2, 0.3]),
new Metadata(['_text' => null, 'title' => 'Test Document']),
),
],
'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
'expectedVectors' => [[0.1, 0.2, 0.3]],
'expectedMetadata' => [['title' => 'Test Document']],
'expectedOriginalDocuments' => [''],
];
}
}
3 changes: 2 additions & 1 deletion src/store/tests/Document/Loader/TextFileLoaderTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public function testSourceIsPresentInMetadata()

$this->assertCount(1, $documents);
$this->assertInstanceOf(TextDocument::class, $document = $documents[0]);
$this->assertSame($source, $document->metadata['source']);
$this->assertSame($source, $document->metadata['_source']);
$this->assertSame($source, $document->metadata->getSource());
}
}
Loading