Skip to content

Commit 6ba23e4

Browse files
authored
feat: vector buckets (#774)
* feat: vector buckets * feat: sharding of vector buckets
1 parent 76df298 commit 6ba23e4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+9924
-225
lines changed

.env.test.sample

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,8 @@ AWS_DEFAULT_REGION=ap-southeast-1
2222
STORAGE_S3_ENDPOINT=http://127.0.0.1:9000
2323
STORAGE_S3_PROTOCOL=http
2424
STORAGE_S3_FORCE_PATH_STYLE=true
25-
REQUEST_X_FORWARDED_HOST_REGEXP=
25+
REQUEST_X_FORWARDED_HOST_REGEXP=
26+
27+
VECTOR_ENABLED=true
28+
ICEBERG_ENABLED=true
29+
ICEBERG_BUCKET_DETECTION_MODE="BUCKET"

.eslintrc.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
module.exports = {
2-
ignorePatterns: ['src/test/assets/**', 'src/test/db/**', 'src/test/*.yaml'],
2+
ignorePatterns: ['src/test/assets/**', 'src/test/db/**', 'src/test/*.yaml', 'src/**/**/*.md'],
33
parser: '@typescript-eslint/parser',
44
extends: ['plugin:@typescript-eslint/recommended', 'plugin:prettier/recommended'],
55
parserOptions: {
66
ecmaVersion: 2020, // Allows for the parsing of modern ECMAScript features
77
sourceType: 'module', // Allows for the use of imports
8-
"project": "./tsconfig.json",
8+
project: './tsconfig.json',
99
},
1010
rules: {
1111
'@typescript-eslint/no-floating-promises': 'error',
1212
'@typescript-eslint/no-explicit-any': 'warn',
1313
'@typescript-eslint/no-unused-vars': [
1414
'warn',
15-
{ 'argsIgnorePattern': '^_+$', 'varsIgnorePattern': '^_+$' } // allows intentionally unused variables named _
15+
{ argsIgnorePattern: '^_+$', varsIgnorePattern: '^_+$' }, // allows intentionally unused variables named _
1616
],
1717
'@typescript-eslint/no-require-imports': 'warn',
1818
},

.github/workflows/ci.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,9 @@ jobs:
8686
MULTI_TENANT: false
8787
S3_PROTOCOL_ACCESS_KEY_ID: ${{ secrets.TENANT_ID }}
8888
S3_PROTOCOL_ACCESS_KEY_SECRET: ${{ secrets.SERVICE_KEY }}
89+
VECTOR_S3_BUCKETS: supa-test-local-dev
90+
VECTOR_ENABLED: true
91+
ICEBERG_ENABLED: true
8992

9093
- name: Upload coverage results to Coveralls
9194
uses: coverallsapp/github-action@master

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@ static/api.json
88
data/
99
bin/
1010
coverage/
11-
.idea/
11+
.idea/
12+
src/scripts/*.py
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-- Per-tenant settings for the vector-buckets feature: an on/off flag and two
-- numeric limits. NOTE(review): the limit columns look like caps on buckets and
-- indexes; confirm their exact scope against the application code.
-- Each action is guarded so the statement can be re-applied safely.
ALTER TABLE tenants
    ADD COLUMN IF NOT EXISTS feature_vector_buckets             boolean NOT NULL DEFAULT false,
    ADD COLUMN IF NOT EXISTS feature_vector_buckets_max_buckets int     NOT NULL DEFAULT 10,
    ADD COLUMN IF NOT EXISTS feature_vector_buckets_max_indexes int     NOT NULL DEFAULT 5;
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
2+
3+
-- Shard registry: one row per (kind, shard_key) pair.
CREATE TABLE IF NOT EXISTS shard
(
    id         BIGSERIAL   PRIMARY KEY,
    kind       TEXT        NOT NULL,
    shard_key  TEXT        NOT NULL,
    capacity   INTEGER     NOT NULL DEFAULT 10000,
    next_slot  INTEGER     NOT NULL DEFAULT 0,
    -- Expected values: active | draining | disabled (not enforced by a constraint here).
    status     TEXT        NOT NULL DEFAULT 'active',
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (kind, shard_key)
);
14+
15+
-- Sparse slot rows: a row exists only for slots that have been used at least once.
-- A slot whose resource_id is NULL is unclaimed; resource_id is filled in once
-- the slot is confirmed. Deleting a shard removes its slot rows (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS shard_slots
(
    shard_id    BIGINT  NOT NULL REFERENCES shard (id) ON DELETE CASCADE,
    slot_no     INTEGER NOT NULL,
    tenant_id   TEXT,
    resource_id TEXT,
    PRIMARY KEY (shard_id, slot_no)
);
24+
25+
-- Short-lived reservations of a (shard_id, slot_no) pair for a resource.
-- At most one reservation row may exist per (kind, resource_id) and per slot.
CREATE TABLE IF NOT EXISTS shard_reservation
(
    id               UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    kind             TEXT        NOT NULL,
    tenant_id        TEXT,
    -- e.g. "vector::bucket::name"
    resource_id      TEXT        NOT NULL,
    shard_id         BIGINT      NOT NULL,
    slot_no          INTEGER     NOT NULL,
    -- Expected values: pending | confirmed | expired | cancelled
    status           TEXT        NOT NULL DEFAULT 'pending',
    lease_expires_at TIMESTAMPTZ NOT NULL,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (kind, resource_id),
    UNIQUE (shard_id, slot_no)
);
39+
40+
-- Partial index over occupied slots only (resource_id set), intended to keep
-- per-shard "used slot" counts cheap.
CREATE INDEX IF NOT EXISTS shard_slots_used_idx
    ON shard_slots (shard_id)
    WHERE resource_id IS NOT NULL;
44+
45+
-- Invariant: a shard can never have handed out more slots than its capacity.
-- ALTER TABLE ... ADD CONSTRAINT has no IF NOT EXISTS form, so the existence
-- check is done by hand against pg_constraint. Without it, re-running this
-- otherwise idempotent migration would fail with "constraint already exists".
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'shard_capacity_not_less_than_minted'
          AND conrelid = 'shard'::regclass
    ) THEN
        ALTER TABLE shard
            ADD CONSTRAINT shard_capacity_not_less_than_minted
            CHECK (capacity >= next_slot);
    END IF;
END
$$;
48+
49+
50+
-- Tenant-oriented lookups.

-- Count slots belonging to a tenant.
CREATE INDEX IF NOT EXISTS shard_slots_tenant_id_idx
    ON shard_slots (tenant_id);

-- Count reservations belonging to a tenant.
CREATE INDEX IF NOT EXISTS shard_reservation_tenant_id_idx
    ON shard_reservation (tenant_id);

-- Count a tenant's occupied slots; partial so that rows without a
-- resource_id are left out of the index.
CREATE INDEX IF NOT EXISTS shard_slots_tenant_resource_idx
    ON shard_slots (tenant_id, shard_id)
    WHERE resource_id IS NOT NULL;
62+
63+
64+
-- Every reservation must point at an existing slot row, and a slot row cannot
-- be deleted while a reservation still references it (ON DELETE RESTRICT).
-- ADD CONSTRAINT has no IF NOT EXISTS form, so guard it via pg_constraint to
-- keep the migration re-runnable like the surrounding statements.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'fk_shard_slot'
          AND conrelid = 'shard_reservation'::regclass
    ) THEN
        ALTER TABLE shard_reservation
            ADD CONSTRAINT fk_shard_slot
            FOREIGN KEY (shard_id, slot_no)
            REFERENCES shard_slots (shard_id, slot_no)
            ON DELETE RESTRICT;
    END IF;
END
$$;
69+
70+
71+
-- Partial index limited to slot rows with no resource attached
-- (presumably used to locate a reusable slot within a shard; confirm against the allocator queries).
CREATE INDEX IF NOT EXISTS shard_slots_free_idx
    ON shard_slots (shard_id, slot_no)
    WHERE resource_id IS NULL;

-- Find pending reservations for a slot, with the lease expiry available in the index.
CREATE INDEX IF NOT EXISTS shard_reservation_active_slot_idx
    ON shard_reservation (shard_id, slot_no, lease_expires_at)
    WHERE status = 'pending';
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
-- Add the 'VECTOR' label to the storage.BucketType enum if it is not there yet.
-- The lookup is pinned to the storage-schema type through regtype, so a
-- same-named enum in another schema cannot make this block skip the ALTER.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_enum
        WHERE enumtypid = 'storage.buckettype'::regtype
          AND enumlabel = 'VECTOR'
    ) THEN
        ALTER TYPE storage.BucketType ADD VALUE 'VECTOR';
    END IF;
END$$;
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
-- Creates the vector bucket and vector index tables, enables row level
-- security on both, and grants read access to the three API roles.
DO $$
DECLARE
    -- Role names come from optional session settings. current_setting(..., true)
    -- returns NULL when a setting was never defined, and NULLIF also treats an
    -- empty value as "not set", so the built-in default applies in both cases.
    anon_role          text := COALESCE(NULLIF(current_setting('storage.anon_role', true), ''), 'anon');
    authenticated_role text := COALESCE(NULLIF(current_setting('storage.authenticated_role', true), ''), 'authenticated');
    service_role       text := COALESCE(NULLIF(current_setting('storage.service_role', true), ''), 'service_role');
BEGIN
    CREATE TABLE IF NOT EXISTS storage.buckets_vectors (
        id         text               NOT NULL PRIMARY KEY,
        type       storage.BucketType NOT NULL DEFAULT 'VECTOR',
        created_at timestamptz        NOT NULL DEFAULT now(),
        updated_at timestamptz        NOT NULL DEFAULT now()
    );

    CREATE TABLE IF NOT EXISTS storage.vector_indexes
    (
        id                     text PRIMARY KEY DEFAULT gen_random_uuid(),
        name                   text COLLATE "C" NOT NULL,
        bucket_id              text NOT NULL REFERENCES storage.buckets_vectors (id),
        data_type              text NOT NULL,
        dimension              integer NOT NULL,
        distance_metric        text NOT NULL,
        metadata_configuration jsonb NULL,
        created_at             timestamptz NOT NULL DEFAULT now(),
        updated_at             timestamptz NOT NULL DEFAULT now()
    );

    ALTER TABLE storage.buckets_vectors ENABLE ROW LEVEL SECURITY;
    ALTER TABLE storage.vector_indexes ENABLE ROW LEVEL SECURITY;

    -- GRANT cannot take bind parameters for role names, so the statement is
    -- built dynamically; %I quotes each name as an identifier when needed.
    EXECUTE format(
        'GRANT SELECT ON TABLE storage.buckets_vectors TO %I, %I, %I',
        service_role, authenticated_role, anon_role
    );
    EXECUTE format(
        'GRANT SELECT ON TABLE storage.vector_indexes TO %I, %I, %I',
        service_role, authenticated_role, anon_role
    );

    -- An index name is unique within its bucket.
    CREATE UNIQUE INDEX IF NOT EXISTS vector_indexes_name_bucket_id_idx
        ON storage.vector_indexes (name, bucket_id);
END$$;

0 commit comments

Comments
 (0)