From 231a8070216efd5f0f4618d70ef7dd8bd9050754 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Bok?=
Date: Fri, 8 Aug 2025 11:39:00 +0200
Subject: [PATCH] [Store] Add a way to store document content when using Chroma DB

---
Reviewer note (ignored by `git am`): this patch was rebuilt from a copy whose
newlines had been collapsed and whose `<...>` spans had been stripped. The PHP
open tag, the license header and the PHPDoc generics were restored so that
every hunk matches its declared line counts; indentation is assumed to be the
usual 4-space style and the author address is still missing from the From:
header. Run `git apply --check` against the target tree before using it.

 src/store/src/Bridge/ChromaDb/Store.php       |   8 +-
 .../src/Document/Loader/TextFileLoader.php    |   2 +-
 src/store/src/Document/Metadata.php           |  54 ++++
 .../Transformer/TextSplitTransformer.php      |   4 +-
 src/store/tests/Bridge/ChromaDb/StoreTest.php | 129 +++++++---
 .../Document/Loader/TextFileLoaderTest.php    |   3 +-
 src/store/tests/Document/MetadataTest.php     | 235 ++++++++++++++++++
 .../Transformer/TextSplitTransformerTest.php  |   4 +-
 8 files changed, 393 insertions(+), 46 deletions(-)
 create mode 100644 src/store/tests/Document/MetadataTest.php

diff --git a/src/store/src/Bridge/ChromaDb/Store.php b/src/store/src/Bridge/ChromaDb/Store.php
index bd9425c6c..f77fa7728 100644
--- a/src/store/src/Bridge/ChromaDb/Store.php
+++ b/src/store/src/Bridge/ChromaDb/Store.php
@@ -38,14 +38,18 @@ public function add(VectorDocument ...$documents): void
         $ids = [];
         $vectors = [];
         $metadata = [];
+        $originalDocuments = [];
         foreach ($documents as $document) {
             $ids[] = (string) $document->id;
             $vectors[] = $document->vector->getData();
-            $metadata[] = $document->metadata->getArrayCopy();
+            $metadataCopy = $document->metadata->getArrayCopy();
+            $originalDocuments[] = $document->metadata->getText() ?? '';
+            unset($metadataCopy[Metadata::KEY_TEXT]);
+            $metadata[] = $metadataCopy;
         }
 
         $collection = $this->client->getOrCreateCollection($this->collectionName);
-        $collection->add($ids, $vectors, $metadata);
+        $collection->add($ids, $vectors, $metadata, $originalDocuments);
     }
 
     public function query(Vector $vector, array $options = []): array
diff --git a/src/store/src/Document/Loader/TextFileLoader.php b/src/store/src/Document/Loader/TextFileLoader.php
index d3e1890e3..7c7c01c7e 100644
--- a/src/store/src/Document/Loader/TextFileLoader.php
+++ b/src/store/src/Document/Loader/TextFileLoader.php
@@ -35,7 +35,7 @@ public function __invoke(string $source, array $options = []): iterable
         }
 
         yield new TextDocument(Uuid::v4(), trim($content), new Metadata([
-            'source' => $source,
+            Metadata::KEY_SOURCE => $source,
         ]));
     }
 }
diff --git a/src/store/src/Document/Metadata.php b/src/store/src/Document/Metadata.php
index 5ce7c105c..aa5aae544 100644
--- a/src/store/src/Document/Metadata.php
+++ b/src/store/src/Document/Metadata.php
@@ -18,4 +18,58 @@
  */
 final class Metadata extends \ArrayObject
 {
+    public const KEY_PARENT_ID = '_parent_id';
+    public const KEY_TEXT = '_text';
+    public const KEY_SOURCE = '_source';
+
+    public function hasParentId(): bool
+    {
+        return $this->offsetExists(self::KEY_PARENT_ID);
+    }
+
+    public function getParentId(): int|string|null
+    {
+        return $this->offsetExists(self::KEY_PARENT_ID)
+            ? $this->offsetGet(self::KEY_PARENT_ID)
+            : null;
+    }
+
+    public function setParentId(int|string $parentId): void
+    {
+        $this->offsetSet(self::KEY_PARENT_ID, $parentId);
+    }
+
+    public function hasText(): bool
+    {
+        return $this->offsetExists(self::KEY_TEXT);
+    }
+
+    public function setText(string $text): void
+    {
+        $this->offsetSet(self::KEY_TEXT, $text);
+    }
+
+    public function getText(): ?string
+    {
+        return $this->offsetExists(self::KEY_TEXT)
+            ? $this->offsetGet(self::KEY_TEXT)
+            : null;
+    }
+
+    public function hasSource(): bool
+    {
+        return $this->offsetExists(self::KEY_SOURCE);
+    }
+
+    public function getSource(): ?string
+    {
+        return $this->offsetExists(self::KEY_SOURCE)
+            ? $this->offsetGet(self::KEY_SOURCE)
+            : null;
+    }
+
+    public function setSource(string $source): void
+    {
+        $this->offsetSet(self::KEY_SOURCE, $source);
+    }
 }
diff --git a/src/store/src/Document/Transformer/TextSplitTransformer.php b/src/store/src/Document/Transformer/TextSplitTransformer.php
index f13b444bd..7094914ad 100644
--- a/src/store/src/Document/Transformer/TextSplitTransformer.php
+++ b/src/store/src/Document/Transformer/TextSplitTransformer.php
@@ -57,8 +57,8 @@ public function __invoke(iterable $documents, array $options = []): iterable
                 $chunkText = mb_substr($text, $start, $end - $start);
 
                 yield new TextDocument(Uuid::v4(), $chunkText, new Metadata([
-                    'parent_id' => $document->id,
-                    'text' => $chunkText,
+                    Metadata::KEY_PARENT_ID => $document->id,
+                    Metadata::KEY_TEXT => $chunkText,
                     ...$document->metadata,
                 ]));
 
diff --git a/src/store/tests/Bridge/ChromaDb/StoreTest.php b/src/store/tests/Bridge/ChromaDb/StoreTest.php
index 7fc63805a..d5c4d4d59 100644
--- a/src/store/tests/Bridge/ChromaDb/StoreTest.php
+++ b/src/store/tests/Bridge/ChromaDb/StoreTest.php
@@ -14,6 +14,7 @@
 use Codewithkyrian\ChromaDB\Client;
 use Codewithkyrian\ChromaDB\Resources\CollectionResource;
 use PHPUnit\Framework\Attributes\CoversClass;
+use PHPUnit\Framework\Attributes\DataProvider;
 use PHPUnit\Framework\TestCase;
 use Symfony\AI\Platform\Vector\Vector;
 use Symfony\AI\Store\Bridge\ChromaDb\Store;
@@ -24,8 +25,21 @@
 #[CoversClass(Store::class)]
 final class StoreTest extends TestCase
 {
-    public function testAddDocumentsSuccessfully()
-    {
+    /**
+     * @param array<VectorDocument> $documents
+     * @param array<string> $expectedIds
+     * @param array<array<float>> $expectedVectors
+     * @param array<array<string, mixed>> $expectedMetadata
+     * @param array<string> $expectedOriginalDocuments
+     */
+    #[DataProvider('addDocumentsProvider')]
+    public function testAddDocumentsSuccessfully(
+        array $documents,
+        array $expectedIds,
+        array $expectedVectors,
+        array $expectedMetadata,
+        array $expectedOriginalDocuments,
+    ): void {
         $collection = $this->createMock(CollectionResource::class);
         $client = $this->createMock(Client::class);
 
@@ -34,49 +48,88 @@ public function testAddDocumentsSuccessfully()
             ->with('test-collection')
             ->willReturn($collection);
 
-        $uuid1 = Uuid::v4();
-        $uuid2 = Uuid::v4();
-
         $collection->expects($this->once())
             ->method('add')
-            ->with(
-                [(string) $uuid1, (string) $uuid2],
-                [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
-                [[], ['title' => 'Test Document']],
-            );
+            ->with($expectedIds, $expectedVectors, $expectedMetadata, $expectedOriginalDocuments);
 
         $store = new Store($client, 'test-collection');
 
-        $document1 = new VectorDocument($uuid1, new Vector([0.1, 0.2, 0.3]));
-        $document2 = new VectorDocument($uuid2, new Vector([0.4, 0.5, 0.6]), new Metadata(['title' => 'Test Document']));
-
-        $store->add($document1, $document2);
+        $store->add(...$documents);
     }
 
-    public function testAddSingleDocument()
+    /**
+     * @return \Iterator<string, array{
+     *     documents: array<VectorDocument>,
+     *     expectedIds: array<string>,
+     *     expectedVectors: array<array<float>>,
+     *     expectedMetadata: array<array<string, mixed>>,
+     *     expectedOriginalDocuments: array<string>
+     * }>
+     */
+    public static function addDocumentsProvider(): \Iterator
     {
-        $collection = $this->createMock(CollectionResource::class);
-        $client = $this->createMock(Client::class);
-
-        $client->expects($this->once())
-            ->method('getOrCreateCollection')
-            ->with('test-collection')
-            ->willReturn($collection);
-
-        $uuid = Uuid::v4();
-
-        $collection->expects($this->once())
-            ->method('add')
-            ->with(
-                [(string) $uuid],
-                [[0.1, 0.2, 0.3]],
-                [['title' => 'Test Document', 'category' => 'test']],
-            );
-
-        $store = new Store($client, 'test-collection');
-
-        $document = new VectorDocument($uuid, new Vector([0.1, 0.2, 0.3]), new Metadata(['title' => 'Test Document', 'category' => 'test']));
-
-        $store->add($document);
+        yield 'multiple documents with and without metadata' => [
+            'documents' => [
+                new VectorDocument(
+                    Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
+                    new Vector([0.1, 0.2, 0.3]),
+                ),
+                new VectorDocument(
+                    Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
+                    new Vector([0.4, 0.5, 0.6]),
+                    new Metadata(['title' => 'Test Document']),
+                ),
+            ],
+            'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
+            'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
+            'expectedMetadata' => [[], ['title' => 'Test Document']],
+            'expectedOriginalDocuments' => ['', ''],
+        ];
+
+        yield 'single document with metadata' => [
+            'documents' => [
+                new VectorDocument(
+                    Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
+                    new Vector([0.1, 0.2, 0.3]),
+                    new Metadata(['title' => 'Test Document', 'category' => 'test']),
+                ),
+            ],
+            'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
+            'expectedVectors' => [[0.1, 0.2, 0.3]],
+            'expectedMetadata' => [['title' => 'Test Document', 'category' => 'test']],
+            'expectedOriginalDocuments' => [''],
+        ];
+
+        yield 'documents with text content' => [
+            'documents' => [
+                new VectorDocument(
+                    Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
+                    new Vector([0.1, 0.2, 0.3]),
+                    new Metadata(['_text' => 'This is the content of document 1', 'title' => 'Document 1'])),
+                new VectorDocument(
+                    Uuid::fromString('fedcba98-7654-3210-fedc-ba9876543210'),
+                    new Vector([0.4, 0.5, 0.6]),
+                    new Metadata(['_text' => 'This is the content of document 2', 'title' => 'Document 2', 'category' => 'test']),
+                ),
+            ],
+            'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef', 'fedcba98-7654-3210-fedc-ba9876543210'],
+            'expectedVectors' => [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
+            'expectedMetadata' => [['title' => 'Document 1'], ['title' => 'Document 2', 'category' => 'test']],
+            'expectedOriginalDocuments' => ['This is the content of document 1', 'This is the content of document 2'],
+        ];
+
+        yield 'document with null text' => [
+            'documents' => [
+                new VectorDocument(
+                    Uuid::fromString('01234567-89ab-cdef-0123-456789abcdef'),
+                    new Vector([0.1, 0.2, 0.3]),
+                    new Metadata(['_text' => null, 'title' => 'Test Document']),
+                ),
+            ],
+            'expectedIds' => ['01234567-89ab-cdef-0123-456789abcdef'],
+            'expectedVectors' => [[0.1, 0.2, 0.3]],
+            'expectedMetadata' => [['title' => 'Test Document']],
+            'expectedOriginalDocuments' => [''],
+        ];
     }
 }
diff --git a/src/store/tests/Document/Loader/TextFileLoaderTest.php b/src/store/tests/Document/Loader/TextFileLoaderTest.php
index 2ebf5f6f0..fb15fc6a8 100644
--- a/src/store/tests/Document/Loader/TextFileLoaderTest.php
+++ b/src/store/tests/Document/Loader/TextFileLoaderTest.php
@@ -52,6 +52,7 @@ public function testSourceIsPresentInMetadata()
 
         $this->assertCount(1, $documents);
         $this->assertInstanceOf(TextDocument::class, $document = $documents[0]);
-        $this->assertSame($source, $document->metadata['source']);
+        $this->assertSame($source, $document->metadata['_source']);
+        $this->assertSame($source, $document->metadata->getSource());
     }
 }
diff --git a/src/store/tests/Document/MetadataTest.php b/src/store/tests/Document/MetadataTest.php
new file mode 100644
index 000000000..0ffe25399
--- /dev/null
+++ b/src/store/tests/Document/MetadataTest.php
@@ -0,0 +1,235 @@
+<?php
+
+/*
+ * This file is part of the Symfony package.
+ *
+ * (c) Fabien Potencier <fabien@symfony.com>
+ *
+ * For the full copyright and license information, please view the LICENSE
+ * file that was distributed with this source code.
+ */
+
+namespace Symfony\AI\Store\Tests\Document;
+
+use PHPUnit\Framework\Attributes\CoversClass;
+use PHPUnit\Framework\Attributes\DataProvider;
+use PHPUnit\Framework\TestCase;
+use Symfony\AI\Store\Document\Metadata;
+
+#[CoversClass(Metadata::class)]
+final class MetadataTest extends TestCase
+{
+    public function testMetadataExtendsArrayObject()
+    {
+        $metadata = new Metadata();
+
+        $this->assertInstanceOf(\ArrayObject::class, $metadata);
+    }
+
+    public function testMetadataCanBeInitializedWithData()
+    {
+        $data = ['title' => 'Test Document', 'category' => 'test'];
+        $metadata = new Metadata($data);
+
+        $this->assertSame('Test Document', $metadata['title']);
+        $this->assertSame('test', $metadata['category']);
+        $this->assertSame($data, $metadata->getArrayCopy());
+    }
+
+    public function testConstants()
+    {
+        $this->assertSame('_parent_id', Metadata::KEY_PARENT_ID);
+        $this->assertSame('_text', Metadata::KEY_TEXT);
+        $this->assertSame('_source', Metadata::KEY_SOURCE);
+    }
+
+    #[DataProvider('parentIdProvider')]
+    public function testParentIdMethods(int|string|null $parentId)
+    {
+        $metadata = new Metadata();
+
+        // Initially should not have parent ID
+        $this->assertFalse($metadata->hasParentId());
+        $this->assertNull($metadata->getParentId());
+
+        // Set parent ID
+        $metadata->setParentId($parentId);
+
+        $this->assertTrue($metadata->hasParentId());
+        $this->assertSame($parentId, $metadata->getParentId());
+    }
+
+    /**
+     * @return \Iterator<string, array{parentId: int|string}>
+     */
+    public static function parentIdProvider(): \Iterator
+    {
+        yield 'integer parent id' => [
+            'parentId' => 123,
+        ];
+
+        yield 'string parent id' => [
+            'parentId' => 'parent-123',
+        ];
+    }
+
+    #[DataProvider('textProvider')]
+    public function testTextMethods(?string $text)
+    {
+        $metadata = new Metadata();
+
+        // Initially should not have text
+        $this->assertFalse($metadata->hasText());
+        $this->assertNull($metadata->getText());
+
+        // Set text
+        $metadata->setText($text);
+
+        $this->assertTrue($metadata->hasText());
+        $this->assertSame($text, $metadata->getText());
+    }
+
+    /**
+     * @return \Iterator<string, array{text: string}>
+     */
+    public static function textProvider(): \Iterator
+    {
+        yield 'string text' => [
+            'text' => 'This is some text content',
+        ];
+
+        yield 'empty string text' => [
+            'text' => '',
+        ];
+    }
+
+    #[DataProvider('sourceProvider')]
+    public function testSourceMethods(?string $source)
+    {
+        $metadata = new Metadata();
+
+        // Initially should not have source
+        $this->assertFalse($metadata->hasSource());
+        $this->assertNull($metadata->getSource());
+
+        // Set source
+        $metadata->setSource($source);
+
+        $this->assertTrue($metadata->hasSource());
+        $this->assertSame($source, $metadata->getSource());
+    }
+
+    /**
+     * @return \Iterator<string, array{source: string}>
+     */
+    public static function sourceProvider(): \Iterator
+    {
+        yield 'string source' => [
+            'source' => 'document.pdf',
+        ];
+
+        yield 'empty string source' => [
+            'source' => '',
+        ];
+    }
+
+    public function testMetadataInitializedWithSpecialKeys()
+    {
+        $data = [
+            Metadata::KEY_PARENT_ID => 'parent-123',
+            Metadata::KEY_TEXT => 'This is the text content',
+            Metadata::KEY_SOURCE => 'document.pdf',
+            'title' => 'Test Document',
+        ];
+
+        $metadata = new Metadata($data);
+
+        // Test parent ID
+        $this->assertTrue($metadata->hasParentId());
+        $this->assertSame('parent-123', $metadata->getParentId());
+
+        // Test text
+        $this->assertTrue($metadata->hasText());
+        $this->assertSame('This is the text content', $metadata->getText());
+
+        // Test source
+        $this->assertTrue($metadata->hasSource());
+        $this->assertSame('document.pdf', $metadata->getSource());
+
+        // Test regular metadata
+        $this->assertSame('Test Document', $metadata['title']);
+    }
+
+    public function testArrayObjectBehavior()
+    {
+        $metadata = new Metadata();
+
+        // Test setting and getting values
+        $metadata['title'] = 'Test Document';
+        $metadata['category'] = 'test';
+
+        $this->assertSame('Test Document', $metadata['title']);
+        $this->assertSame('test', $metadata['category']);
+
+        // Test isset
+        $this->assertTrue(isset($metadata['title']));
+        $this->assertFalse(isset($metadata['nonexistent']));
+
+        // Test unset
+        unset($metadata['category']);
+        $this->assertFalse(isset($metadata['category']));
+
+        // Test count
+        $this->assertCount(1, $metadata);
+    }
+
+    public function testIteratorBehavior()
+    {
+        $data = ['title' => 'Test Document', 'category' => 'test', 'author' => 'John Doe'];
+        $metadata = new Metadata($data);
+
+        $iteratedData = [];
+        foreach ($metadata as $key => $value) {
+            $iteratedData[$key] = $value;
+        }
+
+        $this->assertSame($data, $iteratedData);
+    }
+
+    public function testGettersReturnNullForMissingKeys()
+    {
+        $metadata = new Metadata();
+
+        $this->assertNull($metadata->getParentId());
+        $this->assertNull($metadata->getText());
+        $this->assertNull($metadata->getSource());
+    }
+
+    public function testHasMethodsReturnFalseForMissingKeys()
+    {
+        $metadata = new Metadata();
+
+        $this->assertFalse($metadata->hasParentId());
+        $this->assertFalse($metadata->hasText());
+        $this->assertFalse($metadata->hasSource());
+    }
+
+    public function testOverwritingSpecialKeys()
+    {
+        $metadata = new Metadata();
+
+        // Set initial values
+        $metadata->setParentId('parent-1');
+        $metadata->setText('initial text');
+        $metadata->setSource('initial.pdf');
+
+        // Overwrite values
+        $metadata->setParentId('parent-2');
+        $metadata->setText('updated text');
+        $metadata->setSource('updated.pdf');
+
+        $this->assertSame('parent-2', $metadata->getParentId());
+        $this->assertSame('updated text', $metadata->getText());
+        $this->assertSame('updated.pdf', $metadata->getSource());
+    }
+}
diff --git a/src/store/tests/Document/Transformer/TextSplitTransformerTest.php b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
index fc0c108ae..bfea94eaa 100644
--- a/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
+++ b/src/store/tests/Document/Transformer/TextSplitTransformerTest.php
@@ -130,8 +130,8 @@ public function testParentIdIsSetInMetadata()
         ]));
 
         $this->assertCount(2, $chunks);
-        $this->assertSame($document->id, $chunks[0]->metadata['parent_id']);
-        $this->assertSame($document->id, $chunks[1]->metadata['parent_id']);
+        $this->assertSame($document->id, $chunks[0]->metadata['_parent_id']);
+        $this->assertSame($document->id, $chunks[1]->metadata['_parent_id']);
     }
 
     public function testMetadataIsInherited()