Commit f38862e

Merge branch 'tab/batch-iceberg-3' into tab/refine-track
2 parents: fe7cbd2 + 4f5b6c5

File tree: 220 files changed, +13030 additions, -8054 deletions


Cargo.lock

Lines changed: 307 additions & 271 deletions
(Generated file; diff not rendered.)

Cargo.toml

Lines changed: 10 additions & 8 deletions
@@ -113,7 +113,7 @@ apache-avro = { git = "https://github.com/risingwavelabs/avro", rev = "25113ba88
 ] }
 arc-swap = "1"
 arrow-udf-runtime = "0.8.0"
-async-openai = "0.29.0"
+async-openai = "0.30.1"
 auto_enums = { version = "0.8", features = ["futures03", "tokio1"] }
 await-tree = { version = "0.3.2-alpha.2", features = ["serde", "attributes"] }
 aws-config = { version = "1", default-features = false, features = [
@@ -140,7 +140,7 @@ aws-sdk-sqs = { version = "1", default-features = false, features = [
 ] }
 aws-smithy-http = "0.62"
 aws-smithy-runtime = "1.8"
-aws-smithy-runtime-api = "1.8"
+aws-smithy-runtime-api = "1.9"
 aws-smithy-types = { version = "1.3", default-features = false, features = [
     "hyper-0-14-x", # required by aws sdk
 ] }
@@ -149,7 +149,7 @@ axum = "=0.7.4" # TODO: 0.7.5+ does not work with current toolchain
 axum-extra = "0.9"
 chrono = { version = "0.4.40", default-features = false }
 clap = { version = "4", features = ["cargo", "derive", "env"] }
-criterion = { version = "0.5", features = ["async_futures"] }
+criterion = { version = "0.7", features = ["async_futures"] }
 # Use a forked version which removes the dependencies on dynamo db to reduce
 # compile time and binary size.
 deltalake = { version = "0.26", features = ["s3", "gcs", "datafusion"] }
@@ -167,20 +167,22 @@ hashbrown0_14 = { package = "hashbrown", version = "0.14", features = [
 ] }
 hytra = "0.1"
 # branch dev_rebase_main_20250325
-iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "d1b8cabf4f8a33090d018b539bfec6e7c623d7c4", features = [
+iceberg = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "c15a32b4220cf1b02132129eb34a5ea301215b7f", features = [
     "storage-s3",
     "storage-gcs",
     "storage-azblob",
     "storage-azdls",
 ] }
-iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "d1b8cabf4f8a33090d018b539bfec6e7c623d7c4" }
-iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "d1b8cabf4f8a33090d018b539bfec6e7c623d7c4" }
+iceberg-catalog-glue = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "c15a32b4220cf1b02132129eb34a5ea301215b7f" }
+iceberg-catalog-rest = { git = "https://github.com/risingwavelabs/iceberg-rust.git", rev = "c15a32b4220cf1b02132129eb34a5ea301215b7f" }
+
 indexmap = { version = "2.12.0", features = ["serde"] }
 itertools = "0.14.0"
 jni = { version = "0.21.1", features = ["invocation"] }
 jsonbb = { version = "0.2.2", features = ["float_roundtrip"] }
+jsonwebtoken = { version = "10", features = ["aws_lc_rs"] }
 linkme = { version = "0.3.32", features = ["used_linker"] }
-lru = "0.14"
+lru = "0.16"
 madsim = "0.2.34"
 mixtrics = { version = "0.2", features = ["prometheus"] }
 mysql_async = { version = "0.36", features = ["native-tls-tls", "rust_decimal"] }
@@ -203,7 +205,7 @@ rdkafka = { package = "madsim-rdkafka", version = "0.4.4", features = [
     "cmake-build",
 ] }
 redis = { version = "0.32" }
-regex = "1.11"
+regex = "1.12"
 reqwest = { version = "0.12.2", features = ["json", "stream"] }
 risingwave_backup = { path = "./src/storage/backup" }
 risingwave_batch = { path = "./src/batch" }

README.md

Lines changed: 6 additions & 2 deletions
@@ -64,12 +64,16 @@ To learn about other installation options, such as using a Docker image, see the
 
 RisingWave delivers a unified streaming data platform that combines **ultra-low-latency stream processing** and **Iceberg-native data management**.
 
-### Streaming analytics
+### Low-latency streaming processing and analytics
 RisingWave integrates real-time stream processing and low-latency serving in a single system. It continuously ingests data from streaming and batch sources, performs incremental computations across streams and tables with end-to-end freshness under 100 ms. Materialized views can be served directly within RisingWave with 10–20 ms p99 query latency, or delivered to downstream systems.
 
-### Iceberg-based lakehouse ingestion and management
+### Iceberg lakehouse ingestion, transformation, and management
 RisingWave treats Apache Iceberg™ as a first-class citizen. It directly hosts and manages the Iceberg REST catalog, allowing users to create and operate Iceberg tables through a PostgreSQL-compatible interface. RisingWave supports two write modes: Merge-on-Read (MoR) and Copy-on-Write (CoW), to suit different ingestion and query patterns. It also provides built-in table maintenance capabilities, including compaction, small-file optimization, vacuum, and snapshot cleanup, ensuring efficient and consistent data management without external tools or pipelines.
 
+_Plug: [Nimtable](https://github.com/nimtable/nimtable) is an observability tool developed by RisingWave for easily exploring and managing Iceberg tables._
+
+
+
 ## Key design decisions
 
 RisingWave is designed to be easier to use and more cost-efficient:
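
As a concrete companion to the Iceberg section in the README change above, here is a minimal sketch of streaming a materialized view into an Iceberg table through RisingWave's Iceberg sink connector. The sink and view names (events_iceberg_sink, events_mv) and all option values are placeholders, credentials are omitted, and the exact option names should be checked against the RisingWave documentation rather than taken from this commit.

CREATE SINK events_iceberg_sink FROM events_mv
WITH (
    connector = 'iceberg',                        -- Iceberg sink connector
    type = 'append-only',                         -- or 'upsert', depending on the table
    catalog.type = 'rest',                        -- e.g. the REST catalog RisingWave can host itself
    warehouse.path = 's3://my-bucket/warehouse',  -- placeholder warehouse location
    database.name = 'demo_db',
    table.name = 'events'
);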
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
# Benchmark configuration for GAP_FILL in EOWC (Emit On Window Close) mode
# This benchmark measures the throughput of processing incoming data through GAP_FILL with EOWC
benchmark_name: gap_fill_eowc

# SQL to set up the initial schema and data (run once)
setup_sql: |
  CREATE TABLE sensor_data_eowc (
    ts TIMESTAMP,
    value DOUBLE,
    WATERMARK FOR ts AS ts - INTERVAL '1' MINUTE,
    PRIMARY KEY (ts)
  ) APPEND ONLY;

  CREATE MATERIALIZED VIEW gap_filled_sensors_eowc AS
  SELECT ts, value
  FROM GAP_FILL(sensor_data_eowc, ts, INTERVAL '1' MINUTE)
  EMIT ON WINDOW CLOSE;

# SQL to prepare the data before each run
prepare_sql: |
  -- No preparation needed for each run

# SQL to clean up after each run
conclude_sql: |
  -- APPEND ONLY tables do not support DELETE
  -- We use different time ranges for each run to avoid primary key conflicts

# SQL to clean up everything after all runs are complete
cleanup_sql: |
  DROP MATERIALIZED VIEW IF EXISTS gap_filled_sensors_eowc;
  DROP TABLE IF EXISTS sensor_data_eowc;

# SQL to benchmark - Insert sparse time series data and measure processing time
# This tests the throughput of GAP_FILL with EOWC processing incoming data
# Note: Uses current timestamp to ensure unique time ranges for each run
benchmark_sql: |
  -- Insert data with gaps (every 5 minutes) starting from NOW()
  -- Each run will use a different time range since NOW() advances
  INSERT INTO sensor_data_eowc (ts, value)
  SELECT
    NOW() + (i * INTERVAL '5 minutes'),
    20.0 + (i * 0.5)
  FROM generate_series(0, 100) AS i;

  -- Insert a timestamp far in the future to advance watermark and close all windows
  -- Watermark is defined as ts - INTERVAL '1' MINUTE, so this will close all windows
  INSERT INTO sensor_data_eowc (ts, value)
  VALUES (
    NOW() + INTERVAL '10 hours', -- Far enough to close all windows
    999.0
  );

  -- Wait for watermark to advance and windows to close
  SELECT pg_sleep(3);

  -- Query the results - windows should now be closed by watermark advancement
  SELECT COUNT(*) FROM gap_filled_sensors_eowc;

# Number of times to run the benchmark
runs: 30
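
For orientation on what the EOWC benchmark above measures: GAP_FILL(sensor_data_eowc, ts, INTERVAL '1' MINUTE) densifies the sparse 5-minute inputs into 1-minute buckets, and EMIT ON WINDOW CLOSE holds results back until the watermark (ts - INTERVAL '1' MINUTE) passes a bucket, which is why the far-future row is inserted. A hedged spot-check one could run after a benchmark iteration, assuming GAP_FILL emits one row per 1-minute bucket (how filled rows carry their value, e.g. NULL versus interpolated, is not defined by this commit):

-- Inspect the first gap-filled buckets; with observations every 5 minutes, roughly
-- 4 generated rows are expected between each pair of consecutive inputs.
SELECT ts, value
FROM gap_filled_sensors_eowc
ORDER BY ts
LIMIT 10;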
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# Benchmark configuration for GAP_FILL in normal streaming mode
benchmark_name: gap_fill_streaming

# SQL to set up the initial schema and data (run once)
setup_sql: |
  CREATE TABLE sensor_data (
    ts TIMESTAMP,
    value DOUBLE,
    PRIMARY KEY (ts)
  );

  CREATE MATERIALIZED VIEW gap_filled_sensors AS
  SELECT ts, value
  FROM GAP_FILL(sensor_data, ts, INTERVAL '1' MINUTE);

# SQL to prepare the data before each run
prepare_sql: |
  -- No preparation needed for each run

# SQL to clean up after each run
conclude_sql: |
  -- Clean up the inserted data after each run
  DELETE FROM sensor_data;

# SQL to clean up everything after all runs are complete
cleanup_sql: |
  DROP MATERIALIZED VIEW IF EXISTS gap_filled_sensors;
  DROP TABLE IF EXISTS sensor_data;

# SQL to benchmark - Insert sparse time series data and measure processing time
# This tests the throughput of GAP_FILL processing incoming data
benchmark_sql: |
  INSERT INTO sensor_data (ts, value)
  SELECT
    '2024-05-21 10:00:00'::TIMESTAMP + (i * INTERVAL '5 minutes'),
    20.0 + (i * 0.5)
  FROM generate_series(0, 100) AS i;

  -- Wait for all data to be processed
  SELECT COUNT(*) FROM gap_filled_sensors;

# Number of times to run the benchmark
runs: 30
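
A rough sanity check for the streaming variant above, under the same assumption that GAP_FILL emits one row per 1-minute bucket over the covered range: the benchmark inserts 101 points (i = 0..100) spaced 5 minutes apart, spanning 500 minutes, so the gap-filled view should hold on the order of 501 rows against 101 raw rows. A hedged verification query, to be run before conclude_sql deletes the data:

SELECT
    (SELECT COUNT(*) FROM sensor_data)        AS raw_rows,    -- expected: 101
    (SELECT COUNT(*) FROM gap_filled_sensors) AS filled_rows; -- expected: about 501; the exact count depends on fill semantics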
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
statement ok
SET RW_IMPLICIT_FLUSH TO TRUE;

statement ok
drop table if exists car_sales cascade;

statement ok
drop table if exists car_info cascade;

statement ok
drop table if exists car_regions cascade;

statement ok
drop table if exists t cascade;

statement ok
create table car_sales(id int, car_id int, region_id int, price int);

statement ok
create table car_info(id int, name varchar);

statement ok
create table car_regions(id int, region varchar);

# Create table t that is NOT used in the query
statement ok
create table t(a int, b int, c int);

# Test 1: Should fail because 't' is specified in backfill_order but not used in query
statement error Table or source 't' specified in backfill_order is not used in the query
create materialized view m1
with (backfill_order = FIXED(car_regions -> car_sales, t -> car_sales))
as
with price_ranges as (
    select
        car_info.name as name,
        car_sales.price as price,
        round(log10(1 + car_sales.price)::numeric, 1) as price_range
    from car_sales join car_info
    on car_sales.car_id = car_info.id
    join car_regions
    on car_sales.region_id = car_regions.id
)
select
    name,
    price_range,
    count(*) as sales_count,
    sum(price) as sales_volume,
    avg(price) as sales_avg,
    min(price) as sales_min,
    max(price) as sales_max,
    approx_percentile(0.5) WITHIN GROUP (ORDER BY price) as sales_est_median,
    approx_percentile(0.01) WITHIN GROUP (ORDER BY price) as sales_est_bottom_1_percent,
    approx_percentile(0.99) WITHIN GROUP (ORDER BY price) as sales_est_top_1_percent
FROM
    price_ranges
GROUP BY name, price_range;

# Test 2: Should also fail when 't' is on the right side of the arrow
statement error Table or source 't' specified in backfill_order is not used in the query
create materialized view m2
with (backfill_order = FIXED(car_sales -> t))
as
select
    car_info.name as name,
    car_sales.price as price
from car_sales join car_info
on car_sales.car_id = car_info.id
join car_regions
on car_sales.region_id = car_regions.id;

# Test 3: Should succeed when only actual tables from the query are specified
statement ok
create materialized view m3
with (backfill_order = FIXED(car_regions -> car_sales))
as
select
    car_info.name as name,
    car_sales.price as price
from car_sales join car_info
on car_sales.car_id = car_info.id
join car_regions
on car_sales.region_id = car_regions.id;

# Cleanup
statement ok
drop materialized view m3;

statement ok
drop table car_sales;

statement ok
drop table car_info;

statement ok
drop table car_regions;

statement ok
drop table t;
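
The tests above only exercise the validation rule for relations that are missing from the query. For completeness, a hedged sketch of a multi-edge ordering that should pass the same check, since every relation named in backfill_order also appears in the query; the view name m4 and the particular edge order are illustrative and not part of this commit:

create materialized view m4
with (backfill_order = FIXED(car_regions -> car_info, car_info -> car_sales))
as
select car_info.name as name, car_sales.price as price
from car_sales
join car_info on car_sales.car_id = car_info.id
join car_regions on car_sales.region_id = car_regions.id;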

e2e_test/batch/catalog/pg_settings.slt.part

Lines changed: 1 addition & 0 deletions
@@ -61,6 +61,7 @@ user server_version
 user server_version_num
 user sink_decouple
 user sink_rate_limit
+user slow_ddl_notification_secs
 user source_rate_limit
 user standard_conforming_strings
 user statement_timeout
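
The newly listed slow_ddl_notification_secs session variable is exposed through pg_settings like its neighbours, so it can be inspected and changed over the PostgreSQL-compatible interface. A hedged sketch; the unit (seconds) is inferred from the name, and the default value is not shown in this diff:

SHOW slow_ddl_notification_secs;
SET slow_ddl_notification_secs TO 30;  -- assumed integer value in seconds
SELECT name, setting FROM pg_settings WHERE name = 'slow_ddl_notification_secs';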

e2e_test/batch/describe_fragments.slt

Lines changed: 3 additions & 21 deletions
@@ -133,35 +133,17 @@ skipif madsim
 system ok
 psql_validate.py --db $__DATABASE__ --sql "DESCRIBE FRAGMENTS describe_plan_test.idx" \
 --expected 'Fragment % (Actor %)
-StreamMaterialize { columns: [name, age, created_at, tbl.id(hidden)], stream_key: [tbl.id], pk_columns: [name, age, tbl.id], pk_conflict: NoCheck }
+StreamMaterialize { columns: [name, age, created_at, tbl.id(hidden)], stream_key: [name, tbl.id], pk_columns: [name, age, tbl.id], pk_conflict: NoCheck }
 ├── output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ]
-├── stream key: [ tbl.id ]
+├── stream key: [ tbl.name, tbl.id ]
 └── MergeExecutor { output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ], stream key: [ tbl.id ] }
 (empty)
 Fragment % (Actor %)
-StreamTableScan { table: tbl, columns: [name, age, created_at, id] }
-├── output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ]
-├── stream key: [ tbl.id ]
+StreamTableScan { table: tbl, columns: [name, age, created_at, id] } { output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ], stream key: [ tbl.id ] }
 ├── Upstream { output: [ name, age, created_at, id ], stream key: [] }
 └── BatchPlanNode { output: [ name, age, created_at, id ], stream key: [] }'
 
 
-skipif madsim
-system ok
-psql_validate.py --db $__DATABASE__ --sql "DESCRIBE FRAGMENTS describe_plan_test.idx" \
---expected 'Fragment % (Actor %)
-StreamMaterialize { columns: [name, age, created_at, tbl.id(hidden)], stream_key: [tbl.id], pk_columns: [name, age, tbl.id], pk_conflict: NoCheck }
-├── output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ]
-├── stream key: [ tbl.id ]
-└── MergeExecutor { output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ], stream key: [ tbl.id ] }
-(empty)
-Fragment % (Actor %)
-StreamTableScan { table: tbl, columns: [name, age, created_at, id] }
-├── output: [ tbl.name, tbl.age, tbl.created_at, tbl.id ]
-├── stream key: [ tbl.id ]
-├── Upstream { output: [ name, age, created_at, id ], stream key: [] }
-└── BatchPlanNode { output: [ name, age, created_at, id ], stream key: [] }'
-
 
 skipif madsim
 system ok
