@@ -16,7 +16,7 @@ import { IRange, Range } from '../../../util/vs/editor/common/core/range';
16
16
import { IInstantiationService , ServicesAccessor } from '../../../util/vs/platform/instantiation/common/instantiation' ;
17
17
import { FileChunk , FileChunkWithEmbedding } from '../../chunking/common/chunk' ;
18
18
import { stripChunkTextMetadata } from '../../chunking/common/chunkingStringUtils' ;
19
- import { EmbeddingType , EmbeddingVector } from '../../embeddings/common/embeddingsComputer' ;
19
+ import { Embedding , EmbeddingType , EmbeddingVector } from '../../embeddings/common/embeddingsComputer' ;
20
20
import { IFileSystemService } from '../../filesystem/common/fileSystemService' ;
21
21
import { ILogService } from '../../log/common/logService' ;
22
22
import { FileRepresentation , IWorkspaceFileIndex } from './workspaceFileIndex' ;
@@ -430,7 +430,6 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
430
430
db . exec ( 'DELETE FROM CacheMeta;' ) ;
431
431
db . prepare ( 'INSERT INTO CacheMeta (version, embeddingModel) VALUES (?, ?)' ) . run ( this . version , embeddingType . id ) ;
432
432
433
-
434
433
// Load existing disk db if it exists
435
434
const diskCache = await instantiationService . invokeFunction ( accessor => DiskCache . readDiskCache (
436
435
accessor ,
@@ -456,7 +455,10 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
456
455
chunk . range . startColumn ,
457
456
chunk . range . endLineNumber ,
458
457
chunk . range . endColumn ,
459
- Float32Array . from ( typeof chunk . embedding === 'string' ? DiskCache . decodeEmbedding ( chunk . embedding ) : chunk . embedding ) ,
458
+ packEmbedding ( {
459
+ type : embeddingType ,
460
+ value : typeof chunk . embedding === 'string' ? DiskCache . decodeEmbedding ( chunk . embedding ) : chunk . embedding ,
461
+ } ) ,
460
462
chunk . chunkHash ?? ''
461
463
) ;
462
464
}
@@ -532,8 +534,7 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
532
534
if ( all . length > 0 ) {
533
535
const out = new Map < string , FileChunkWithEmbedding > ( ) ;
534
536
for ( const row of all ) {
535
- const embeddingData = row . embedding as Uint8Array ;
536
- const embedding = Array . from ( new Float32Array ( embeddingData . buffer , embeddingData . byteOffset , embeddingData . byteLength / Float32Array . BYTES_PER_ELEMENT ) ) ;
537
+ const embedding = unpackEmbedding ( this . embeddingType , row . embedding as Uint8Array ) ;
537
538
538
539
const chunk : FileChunkWithEmbedding = {
539
540
chunk : {
@@ -542,10 +543,7 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
542
543
rawText : undefined ,
543
544
range : new Range ( row . range_startLineNumber as number , row . range_startColumn as number , row . range_endLineNumber as number , row . range_endColumn as number ) ,
544
545
} ,
545
- embedding : {
546
- type : this . embeddingType ,
547
- value : embedding ,
548
- } ,
546
+ embedding,
549
547
chunkHash : row . chunkHash as string ,
550
548
} ;
551
549
if ( chunk . chunkHash ) {
@@ -576,18 +574,14 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
576
574
contentVersionId : fileIdResult . contentVersionId as string | undefined ,
577
575
fileHash : undefined ,
578
576
value : chunks . map ( ( row ) : FileChunkWithEmbedding => {
579
- const embeddingData = row . embedding as Uint8Array ;
580
577
return {
581
578
chunk : {
582
579
file : file . uri ,
583
580
text : row . text as string ,
584
581
rawText : undefined ,
585
582
range : new Range ( row . range_startLineNumber as number , row . range_startColumn as number , row . range_endLineNumber as number , row . range_endColumn as number ) ,
586
583
} ,
587
- embedding : {
588
- type : this . embeddingType ,
589
- value : Array . from ( new Float32Array ( embeddingData . buffer , embeddingData . byteOffset , embeddingData . byteLength / Float32Array . BYTES_PER_ELEMENT ) ) ,
590
- } ,
584
+ embedding : unpackEmbedding ( this . embeddingType , row . embedding as Uint8Array ) ,
591
585
chunkHash : row . chunkHash as string | undefined ,
592
586
} ;
593
587
} ) ,
@@ -643,16 +637,14 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
643
637
644
638
this . db . exec ( 'BEGIN TRANSACTION' ) ;
645
639
for ( const chunk of newEntry . value ?? [ ] ) {
646
- const float32Array = Float32Array . from ( chunk . embedding . value ) ;
647
- const embeddingData = new Uint8Array ( float32Array . buffer , float32Array . byteOffset , float32Array . byteLength ) ;
648
640
insertStatement . run (
649
641
fileResult . lastInsertRowid as number ,
650
642
chunk . chunk . text ,
651
643
chunk . chunk . range . startLineNumber ,
652
644
chunk . chunk . range . startColumn ,
653
645
chunk . chunk . range . endLineNumber ,
654
646
chunk . chunk . range . endColumn ,
655
- embeddingData ,
647
+ packEmbedding ( chunk . embedding ) ,
656
648
chunk . chunkHash ?? '' ,
657
649
) ;
658
650
}
@@ -665,4 +657,52 @@ class DbCache implements IWorkspaceChunkAndEmbeddingCache {
665
657
666
658
return chunks ;
667
659
}
660
+ }
661
+
662
/**
 * Serializes an embedding into a compact byte array for storage.
 *
 * Embeddings whose type equals `EmbeddingType.metis_1024_I16_Binary` are reduced to one bit
 * per value: a bit is set when the value is `>= 0`, and within each group of eight values the
 * first value lands in the least significant bit. Every other type is written out as raw
 * float32 values.
 *
 * @param embedding The embedding to serialize.
 * @returns The packed bytes.
 * @throws Error if a binary embedding's value count is not a multiple of 8.
 */
export function packEmbedding(embedding: Embedding): Uint8Array {
	const { type, value: dims } = embedding;

	if (!type.equals(EmbeddingType.metis_1024_I16_Binary)) {
		// All other formats default to float32 for now
		const floats = Float32Array.from(dims);
		return new Uint8Array(floats.buffer, floats.byteOffset, floats.byteLength);
	}

	// Generate packed binary
	if (dims.length % 8 !== 0) {
		throw new Error(`Embedding value length must be a multiple of 8 for ${type.id}, got ${dims.length}`);
	}

	const packed = new Uint8Array(dims.length / 8);
	for (let byteIndex = 0; byteIndex < packed.length; byteIndex++) {
		const base = byteIndex * 8;
		let bits = 0;
		for (let bit = 0; bit < 8; bit++) {
			// Only the sign is kept: non-negative values set the bit, negative values leave it clear.
			if (dims[base + bit] >= 0) {
				bits |= 1 << bit;
			}
		}
		packed[byteIndex] = bits;
	}
	return packed;
}
687
+
688
/**
 * Unpacks an embedding from a binary value packed with {@link packEmbedding}.
 *
 * For `EmbeddingType.metis_1024_I16_Binary`, payloads of at most 1024 bytes are treated as
 * bit-packed: each bit expands to `0.03125` when set and `-0.03125` when clear, reading the
 * least significant bit of each byte first. Larger payloads for that type, and every payload
 * for other types, are read as float32 values in platform byte order.
 *
 * @param type Embedding type attached to the result; it also selects the decoding.
 * @param data The stored bytes. May be a view starting at any byte offset of its backing buffer.
 * @returns The decoded embedding.
 */
export function unpackEmbedding(type: EmbeddingType, data: Uint8Array): Embedding {
	if (type.equals(EmbeddingType.metis_1024_I16_Binary)) {
		// Old versions may have stored the values as a float32, so only small payloads are
		// interpreted as packed bits; anything larger falls through to the float32 path below.
		if (data.length <= 1024) {
			const values = new Array<number>(data.length * 8);
			for (let i = 0; i < data.length; i++) {
				const byte = data[i];
				for (let j = 0; j < 8; j++) {
					values[i * 8 + j] = (byte & (1 << j)) > 0 ? 0.03125 : -0.03125;
				}
			}
			return { type, value: values };
		}
	}

	// A Float32Array view must start on a 4-byte boundary of its buffer. `data` can be a view at
	// an arbitrary offset (e.g. a subarray or a pooled Buffer slice), so copy it into a fresh,
	// zero-offset buffer when it is unaligned instead of throwing a RangeError.
	const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
	const bytes = data.byteOffset % bytesPerFloat === 0 ? data : new Uint8Array(data);

	// Trailing bytes that do not form a whole float32 are ignored.
	const float32Array = new Float32Array(bytes.buffer, bytes.byteOffset, Math.floor(bytes.byteLength / bytesPerFloat));
	return { type, value: Array.from(float32Array) };
}
0 commit comments