Skip to content

Commit 40414e5

Browse files
committed
validator/tests: add vector similarity functions tests
Add tests to validate the results of vector similarity functions including: - `similarity_cosine` function returns proper results, - `similarity_euclidean` function returns proper results, - `similarity_dot_product` function returns proper results, - similarity function works well with multicolumn partition key, - similarity function works well with both partition and clustering key. Refs: scylladb/scylladb#25993
1 parent e18cd64 commit 40414e5

File tree

2 files changed

+352
-0
lines changed

2 files changed

+352
-0
lines changed

crates/validator/src/tests/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ mod crud;
88
mod full_scan;
99
mod reconnect;
1010
mod serde;
11+
mod vector_similarity;
1112

1213
use crate::ServicesSubnet;
1314
use crate::dns::Dns;
@@ -221,6 +222,7 @@ pub(crate) async fn register() -> Vec<(String, TestCase)> {
221222
("full_scan", full_scan::new().await),
222223
("reconnect", reconnect::new().await),
223224
("serde", serde::new().await),
225+
("vector_similarity", vector_similarity::new().await),
224226
]
225227
.into_iter()
226228
.map(|(name, test_case)| (name.to_string(), test_case))
Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
/*
2+
* Copyright 2025-present ScyllaDB
3+
* SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
4+
*/
5+
6+
use crate::common::*;
7+
use crate::tests::*;
8+
use scylla::client::session::Session;
9+
use std::time::Duration;
10+
use tracing::info;
11+
12+
pub(crate) async fn new() -> TestCase {
13+
let timeout = Duration::from_secs(30);
14+
TestCase::empty()
15+
.with_init(timeout, init)
16+
.with_cleanup(timeout, cleanup)
17+
.with_test(
18+
"similarity_cosine_function_with_single_column_partition_key",
19+
timeout,
20+
similarity_cosine_function_with_single_column_partition_key,
21+
)
22+
.with_test(
23+
"similarity_euclidean_function_with_single_column_partition_key",
24+
timeout,
25+
similarity_euclidean_function_with_single_column_partition_key,
26+
)
27+
.with_test(
28+
"similarity_dot_product_function_with_single_column_partition_key",
29+
timeout,
30+
similarity_dot_product_function_with_single_column_partition_key,
31+
)
32+
.with_test(
33+
"vector_similarity_function_with_clustering_key",
34+
timeout,
35+
vector_similarity_function_with_clustering_key,
36+
)
37+
.with_test(
38+
"vector_similarity_function_with_multi_column_partition_key",
39+
timeout,
40+
vector_similarity_function_with_multi_column_partition_key,
41+
)
42+
}
43+
44+
/// Normalized (L2 norm = 1) embeddings for testing
45+
pub(crate) static EMBEDDINGS: [[f32; 3]; 3] = [
46+
[0.267261, 0.534522, 0.801784],
47+
[0.455842, 0.569803, 0.683763],
48+
[0.502571, 0.574366, 0.646162],
49+
];
50+
51+
/// Expected `(key, value)` rows, in result order, for each similarity function
/// when querying with [1.0, 0.0, -1.0]
52+
pub(crate) const SIMILARITY_RESULTS: [(&str, [(i32, f32); 3]); 3] = [
53+
("cosine", [(2, 1.1015341), (1, 1.1611645), (0, 1.3779647)]),
54+
("euclidean", [(2, 3.2871814), (1, 3.4558413), (0, 4.069046)]),
55+
(
56+
"dot_product",
57+
[(2, 1.1435909), (1, 1.227921), (0, 1.534523)],
58+
),
59+
];
60+
61+
async fn assert_similarity_function_results(
62+
session: &Session,
63+
table: &str,
64+
key_column: &str,
65+
similarity_function: &str,
66+
) {
67+
let results = get_query_results(
68+
format!(
69+
"SELECT {key_column}, similarity_{similarity_function}(v, [1.0, 0.0, -1.0]) FROM {table} ORDER BY v ANN OF [1.0, 0.0, -1.0] LIMIT 5"
70+
),
71+
session,
72+
)
73+
.await;
74+
let rows = results.rows::<(i32, f32)>().expect("failed to get rows");
75+
assert_eq!(rows.rows_remaining(), 3);
76+
77+
let (_, expected_distances) = SIMILARITY_RESULTS
78+
.iter()
79+
.find(|(name, _)| *name == similarity_function)
80+
.expect("similarity function not found");
81+
for (i, row) in rows.enumerate() {
82+
let row = row.expect("failed to get row");
83+
let (key, distance) = row;
84+
assert_eq!(
85+
(key, distance),
86+
expected_distances[i],
87+
"Row {i} does not match expected result"
88+
);
89+
}
90+
}
91+
92+
async fn similarity_cosine_function_with_single_column_partition_key(actors: TestActors) {
93+
info!("started");
94+
95+
let (session, client) = prepare_connection(&actors).await;
96+
97+
let keyspace = create_keyspace(&session).await;
98+
let table = create_table(&session, "pk INT PRIMARY KEY, v VECTOR<FLOAT, 3>", None).await;
99+
100+
// Insert test data
101+
for (i, embedding) in EMBEDDINGS.into_iter().enumerate() {
102+
session
103+
.query_unpaged(
104+
format!("INSERT INTO {table} (pk, v) VALUES (?, ?)"),
105+
(i as i32, embedding.as_slice()),
106+
)
107+
.await
108+
.expect("failed to insert data");
109+
}
110+
111+
let similarity_function = "cosine";
112+
let index = create_index(
113+
&session,
114+
&client,
115+
&table,
116+
"v",
117+
Some(format!(
118+
"{{'similarity_function' : '{similarity_function}'}}"
119+
)),
120+
)
121+
.await;
122+
123+
wait_for(
124+
|| async { client.count(&index.keyspace, &index.index).await == Some(3) },
125+
"Waiting for 3 vectors to be indexed",
126+
Duration::from_secs(5),
127+
)
128+
.await;
129+
130+
// Check if the query returns the expected distances
131+
assert_similarity_function_results(&session, &table, "pk", similarity_function).await;
132+
133+
// Drop keyspace
134+
session
135+
.query_unpaged(format!("DROP KEYSPACE {keyspace}"), ())
136+
.await
137+
.expect("failed to drop a keyspace");
138+
139+
info!("finished");
140+
}
141+
142+
async fn similarity_euclidean_function_with_single_column_partition_key(actors: TestActors) {
143+
info!("started");
144+
145+
let (session, client) = prepare_connection(&actors).await;
146+
147+
let keyspace = create_keyspace(&session).await;
148+
let table = create_table(&session, "pk INT PRIMARY KEY, v VECTOR<FLOAT, 3>", None).await;
149+
150+
// Insert test data
151+
for (i, embedding) in EMBEDDINGS.into_iter().enumerate() {
152+
session
153+
.query_unpaged(
154+
format!("INSERT INTO {table} (pk, v) VALUES (?, ?)"),
155+
(i as i32, embedding.as_slice()),
156+
)
157+
.await
158+
.expect("failed to insert data");
159+
}
160+
161+
let similarity_function = "euclidean";
162+
let index = create_index(
163+
&session,
164+
&client,
165+
&table,
166+
"v",
167+
Some(format!(
168+
"{{'similarity_function' : '{similarity_function}'}}"
169+
)),
170+
)
171+
.await;
172+
173+
wait_for(
174+
|| async { client.count(&index.keyspace, &index.index).await == Some(3) },
175+
"Waiting for 3 vectors to be indexed",
176+
Duration::from_secs(5),
177+
)
178+
.await;
179+
180+
// Check if the query returns the expected distances
181+
assert_similarity_function_results(&session, &table, "pk", similarity_function).await;
182+
183+
// Drop keyspace
184+
session
185+
.query_unpaged(format!("DROP KEYSPACE {keyspace}"), ())
186+
.await
187+
.expect("failed to drop a keyspace");
188+
189+
info!("finished");
190+
}
191+
192+
async fn similarity_dot_product_function_with_single_column_partition_key(actors: TestActors) {
193+
info!("started");
194+
195+
let (session, client) = prepare_connection(&actors).await;
196+
197+
let keyspace = create_keyspace(&session).await;
198+
let table = create_table(&session, "pk INT PRIMARY KEY, v VECTOR<FLOAT, 3>", None).await;
199+
200+
// Insert test data
201+
for (i, embedding) in EMBEDDINGS.into_iter().enumerate() {
202+
session
203+
.query_unpaged(
204+
format!("INSERT INTO {table} (pk, v) VALUES (?, ?)"),
205+
(i as i32, embedding.as_slice()),
206+
)
207+
.await
208+
.expect("failed to insert data");
209+
}
210+
211+
let similarity_function = "dot_product";
212+
let index = create_index(
213+
&session,
214+
&client,
215+
&table,
216+
"v",
217+
Some(format!(
218+
"{{'similarity_function' : '{similarity_function}'}}"
219+
)),
220+
)
221+
.await;
222+
223+
wait_for(
224+
|| async { client.count(&index.keyspace, &index.index).await == Some(3) },
225+
"Waiting for 3 vectors to be indexed",
226+
Duration::from_secs(5),
227+
)
228+
.await;
229+
230+
// Check if the query returns the expected distances
231+
assert_similarity_function_results(&session, &table, "pk", similarity_function).await;
232+
233+
// Drop keyspace
234+
session
235+
.query_unpaged(format!("DROP KEYSPACE {keyspace}"), ())
236+
.await
237+
.expect("failed to drop a keyspace");
238+
239+
info!("finished");
240+
}
241+
242+
async fn vector_similarity_function_with_clustering_key(actors: TestActors) {
243+
info!("started");
244+
245+
let (session, client) = prepare_connection(&actors).await;
246+
247+
let keyspace = create_keyspace(&session).await;
248+
let table = create_table(
249+
&session,
250+
"pk INT, ck INT, v VECTOR<FLOAT, 3>, PRIMARY KEY (pk, ck)",
251+
None,
252+
)
253+
.await;
254+
255+
// Insert test data
256+
for (i, embedding) in EMBEDDINGS.into_iter().enumerate() {
257+
session
258+
.query_unpaged(
259+
format!("INSERT INTO {table} (pk, ck, v) VALUES (?, ?, ?)"),
260+
(123, i as i32, &embedding.as_slice()),
261+
)
262+
.await
263+
.expect("failed to insert data");
264+
}
265+
266+
let similarity_function = "euclidean";
267+
let index = create_index(
268+
&session,
269+
&client,
270+
&table,
271+
"v",
272+
Some(format!(
273+
"{{'similarity_function' : '{similarity_function}'}}"
274+
)),
275+
)
276+
.await;
277+
278+
wait_for(
279+
|| async { client.count(&index.keyspace, &index.index).await == Some(3) },
280+
"Waiting for 3 vectors to be indexed",
281+
Duration::from_secs(5),
282+
)
283+
.await;
284+
285+
// Check if the query returns the expected distances
286+
assert_similarity_function_results(&session, &table, "ck", similarity_function).await;
287+
288+
// Drop keyspace
289+
session
290+
.query_unpaged(format!("DROP KEYSPACE {keyspace}"), ())
291+
.await
292+
.expect("failed to drop a keyspace");
293+
294+
info!("finished");
295+
}
296+
297+
async fn vector_similarity_function_with_multi_column_partition_key(actors: TestActors) {
298+
info!("started");
299+
300+
let (session, client) = prepare_connection(&actors).await;
301+
302+
let keyspace = create_keyspace(&session).await;
303+
let table = create_table(
304+
&session,
305+
"pk1 INT, pk2 INT, v VECTOR<FLOAT, 3>, PRIMARY KEY ((pk1, pk2))",
306+
None,
307+
)
308+
.await;
309+
310+
// Insert test data
311+
for (i, embedding) in EMBEDDINGS.into_iter().enumerate() {
312+
session
313+
.query_unpaged(
314+
format!("INSERT INTO {table} (pk1, pk2, v) VALUES (?, ?, ?)"),
315+
(123, i as i32, &embedding.as_slice()),
316+
)
317+
.await
318+
.expect("failed to insert data");
319+
}
320+
321+
let similarity_function = "euclidean";
322+
let index = create_index(
323+
&session,
324+
&client,
325+
&table,
326+
"v",
327+
Some(format!(
328+
"{{'similarity_function' : '{similarity_function}'}}"
329+
)),
330+
)
331+
.await;
332+
333+
wait_for(
334+
|| async { client.count(&index.keyspace, &index.index).await == Some(3) },
335+
"Waiting for 3 vectors to be indexed",
336+
Duration::from_secs(5),
337+
)
338+
.await;
339+
340+
// Check if the query returns the expected distances
341+
assert_similarity_function_results(&session, &table, "pk2", similarity_function).await;
342+
343+
// Drop keyspace
344+
session
345+
.query_unpaged(format!("DROP KEYSPACE {keyspace}"), ())
346+
.await
347+
.expect("failed to drop a keyspace");
348+
349+
info!("finished");
350+
}

0 commit comments

Comments
 (0)