64 changes: 51 additions & 13 deletions .github/workflows/slo.yml
@@ -30,9 +30,54 @@ jobs:

strategy:
matrix:
workload:
- sync-table
- sync-query
include:
- prefix: table
workload: sync-table
create-args: grpc://localhost:2135 /Root/testdb
run-args: |
grpc://localhost:2135 /Root/testdb \
--prom-pgw localhost:9091 \
--report-period 250 \
--time ${{inputs.slo_workload_duration_seconds || 600}} \
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 1000 \
--write-timeout 1000
cleanup-args: grpc://localhost:2135 /Root/testdb
- prefix: table
workload: sync-query
create-args: grpc://localhost:2135 /Root/testdb
run-args: |
grpc://localhost:2135 /Root/testdb \
--prom-pgw localhost:9091 \
--report-period 250 \
--time ${{inputs.slo_workload_duration_seconds || 600}} \
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 1000 \
--write-timeout 1000
cleanup-args: grpc://localhost:2135 /Root/testdb
- prefix: topic
workload: topic-basic
create-args: |
grpc://localhost:2135 /Root/testdb \
--path /Root/testdb/slo_topic \
--partitions-count 10
run-args: |
grpc://localhost:2135 /Root/testdb \
--path /Root/testdb/slo_topic \
--prom-pgw localhost:9091 \
--partitions-count 10 \
--read-threads 10 \
--write-threads 10 \
--report-period 250 \
--time ${{inputs.slo_workload_duration_seconds || 600}} \
--read-rps ${{inputs.slo_workload_read_max_rps || 100}} \
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 5000 \
--write-timeout 5000
cleanup-args: grpc://localhost:2135 /Root/testdb --path /Root/testdb/slo_topic


concurrency:
group: slo-${{ github.ref }}-${{ matrix.workload }}
@@ -64,26 +109,19 @@ jobs:

- name: Prepare SLO Database
run: |
python ./tests/slo/src create grpc://localhost:2135 /Root/testdb
python ./tests/slo/src ${{ matrix.prefix }}-create ${{ matrix.create-args }}

- name: Run SLO Tests
env:
REF: '${{ github.head_ref || github.ref }}'
WORKLOAD: '${{ matrix.workload }}'
run: |
python ./tests/slo/src run grpc://localhost:2135 /Root/testdb \
--prom-pgw localhost:9091 \
--report-period 250 \
--time ${{inputs.slo_workload_duration_seconds || 600}} \
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
--read-timeout 1000 \
--write-timeout 1000
python ./tests/slo/src ${{ matrix.prefix }}-run ${{ matrix.run-args }}

- if: always()
name: Cleanup SLO Database
run: |
python ./tests/slo/src cleanup grpc://localhost:2135 /Root/testdb
python ./tests/slo/src ${{ matrix.prefix }}-cleanup ${{ matrix.cleanup-args }}

- if: always()
name: Store ydb chaos testing logs
142 changes: 122 additions & 20 deletions tests/slo/README.md
@@ -3,42 +3,72 @@
SLO is a type of test where an app based on the ydb-sdk is run against failing YDB cluster nodes, tablets, and network
(situations that are possible for distributed DBs with hundreds of nodes)

### Implementations:
### Workload types:

There are two workload types:

- **Table SLO** - tests table operations (read/write)
- **Topic SLO** - tests topic operations (publish/consume)

There are two implementations:
### Implementations:

- `sync`
- `async` (not yet implemented)

### Usage:

It has 3 commands:
Each workload type has 3 commands:

**Table commands:**
- `table-create` - creates table in database
- `table-cleanup` - drops table in database
- `table-run` - runs table workload (read and write to table with set RPS)

- `create` - creates table in database
- `cleanup` - drops table in database
- `run` - runs workload (read and write to table with set RPS)
**Topic commands:**
- `topic-create` - creates topic with consumer in database
- `topic-cleanup` - drops topic in database
- `topic-run` - runs topic workload (publish and consume messages with set RPS)

### Run examples with all arguments:

create:
`python tests/slo/src/ create localhost:2136 /local -t tableName
**Table examples:**

table-create:
`python tests/slo/src/ table-create localhost:2136 /local -t tableName
--min-partitions-count 6 --max-partitions-count 1000 --partition-size 1 -c 1000
--write-timeout 10000`

cleanup:
`python tests/slo/src/ cleanup localhost:2136 /local -t tableName`
table-cleanup:
`python tests/slo/src/ table-cleanup localhost:2136 /local -t tableName`

run:
`python tests/slo/src/ run localhost:2136 /local -t tableName
--prom-pgw http://prometheus-pushgateway:9091 -report-period 250
table-run:
`python tests/slo/src/ table-run localhost:2136 /local -t tableName
--prom-pgw http://prometheus-pushgateway:9091 --report-period 250
--read-rps 1000 --read-timeout 10000
--write-rps 100 --write-timeout 10000
--time 600 --shutdown-time 30`

**Topic examples:**

topic-create:
`python tests/slo/src/ topic-create localhost:2136 /local
--topic-path /local/slo_topic --topic-consumer slo_consumer`

topic-cleanup:
`python tests/slo/src/ topic-cleanup localhost:2136 /local --topic-path /local/slo_topic`

topic-run:
`python tests/slo/src/ topic-run localhost:2136 /local
--topic-path /local/slo_topic --topic-consumer slo_consumer
--prom-pgw http://prometheus-pushgateway:9091 --report-period 250
--topic-write-rps 50 --topic-read-rps 100
--topic-write-timeout 5000 --topic-read-timeout 3000
--time 600 --shutdown-time 30`

## Arguments for commands:

### create
`python tests/slo/src/ create <endpoint> <db> [options]`
### table-create
`python tests/slo/src/ table-create <endpoint> <db> [options]`

```
Arguments:
@@ -61,8 +91,8 @@ Options:

```

### cleanup
`python tests/slo/src/ cleanup <endpoint> <db> [options]`
### table-cleanup
`python tests/slo/src/ table-cleanup <endpoint> <db> [options]`

```
Arguments:
@@ -73,8 +103,8 @@ Options:
-t --table-name <string> table name to create
```

### run
`python tests/slo/src/ run <endpoint> <db> [options]`
### table-run
`python tests/slo/src/ table-run <endpoint> <db> [options]`

```
Arguments:
@@ -100,12 +130,70 @@ Options:
--write-threads <int> number of threads to use for read requests
```

### topic-create
`python tests/slo/src/ topic-create <endpoint> <db> [options]`

```
Arguments:
endpoint YDB endpoint to connect to
db YDB database to connect to

Options:
--topic-path <string> topic path to create
--topic-consumer <string> consumer name
--topic-min-partitions <int> minimum active partitions
--topic-max-partitions <int> maximum active partitions
--topic-retention-hours <int> retention period in hours
```

### topic-cleanup
`python tests/slo/src/ topic-cleanup <endpoint> <db> [options]`

```
Arguments:
endpoint YDB endpoint to connect to
db YDB database to connect to

Options:
--topic-path <string> topic path to drop
```

### topic-run
`python tests/slo/src/ topic-run <endpoint> <db> [options]`

```
Arguments:
endpoint YDB endpoint to connect to
db YDB database to connect to

Options:
--topic-path <string> topic path
--topic-consumer <string> consumer name

--prom-pgw <string> prometheus push gateway
--report-period <int> prometheus push period in milliseconds

--topic-read-rps <int> read RPS for topics
--topic-read-timeout <int> read timeout milliseconds for topics
--topic-write-rps <int> write RPS for topics
--topic-write-timeout <int> write timeout milliseconds for topics

--topic-message-size <int> message size in bytes
--topic-read-threads <int> number of threads to use for read requests
--topic-write-threads <int> number of threads to use for write requests

--time <int> run time in seconds
--shutdown-time <int> graceful shutdown time in seconds
```
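The `--topic-read-rps` / `--topic-write-rps` options above cap request rates. A minimal sketch of how such a limit could be enforced (an illustration only, not the SDK's actual rate-limiting code):

```python
import time


class RpsLimiter:
    """Naive fixed-rate limiter: delay calls so they average out to `rps` per second."""

    def __init__(self, rps: int):
        self.interval = 1.0 / rps
        self._next = time.monotonic()

    def wait(self):
        # Sleep until the next allowed slot, then schedule the following one.
        now = time.monotonic()
        if now < self._next:
            time.sleep(self._next - now)
        self._next = max(now, self._next) + self.interval
```

Each worker thread would call `limiter.wait()` before issuing a read or write, so the aggregate request rate stays near the configured RPS.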

## Authentication

The workload uses [auth-env](https://ydb.yandex-team.ru/docs/reference/ydb-sdk/recipes/auth-env) for authentication.

## What's inside
When running `run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.

### Table workload
When running `table-run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.

- `readJob` reads rows from the table one by one with random identifiers generated by writeJob
- `writeJob` generates and inserts rows
@@ -120,13 +208,27 @@ Table has these fields:

Primary key: `("object_hash", "object_id")`
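As a rough illustration of rows matching this primary key scheme (the helper below is hypothetical — the real generator lives in `tests/slo/src` — and the hash choice is an assumption):

```python
import random
import zlib


def generate_row(payload_size: int = 100) -> dict:
    """Generate a hypothetical row keyed by ("object_hash", "object_id")."""
    object_id = random.randint(1, 2**32 - 1)
    # Hashing the id first spreads rows evenly across partitions (assumption).
    object_hash = zlib.crc32(str(object_id).encode())
    return {
        "object_hash": object_hash,
        "object_id": object_id,
        "payload": "x" * payload_size,
    }
```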

### Topic workload
When running `topic-run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.

- `readJob` reads messages from topic using TopicReader and commits offsets
- `writeJob` generates and publishes messages to topic using TopicWriter
- `metricsJob` periodically sends metrics to Prometheus

Messages contain:
- Sequential message ID
- Thread identifier
- Configurable payload size (padded with 'x' characters)
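The message layout above could be built like this (a sketch — the field order, separator, and whether the size includes the header are assumptions not specified by this README):

```python
def build_message(message_id: int, thread_id: int, payload_size: int) -> bytes:
    """Build a topic message: sequential id, thread id, 'x'-padded payload."""
    header = f"msg-{message_id}-thread-{thread_id}-"
    # Pad with 'x' characters until the body reaches payload_size bytes
    # (assumption: the configured size is the total message size).
    padding = "x" * max(0, payload_size - len(header))
    return (header + padding).encode()
```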

## Collected metrics
- `oks` - number of successful (OK) requests
- `not_oks` - number of failed requests
- `inflight` - number of requests currently in flight
- `latency` - summary of latencies in ms
- `attempts` - summary of retry attempts per request

Metrics are collected for both table operations (`read`, `write`) and topic operations (`read`, `write`).
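One way the `inflight`/`oks`/`not_oks`/`latency` counters could be maintained around each request (a simplified sketch, not the project's actual instrumentation, which pushes to Prometheus):

```python
import threading
import time


class RequestMetrics:
    """Track the counters described above for one operation type."""

    def __init__(self):
        self.oks = 0
        self.not_oks = 0
        self.inflight = 0
        self.latencies_ms = []
        self._lock = threading.Lock()

    def measure(self, func, *args, **kwargs):
        """Run func, updating inflight/oks/not_oks/latency around the call."""
        with self._lock:
            self.inflight += 1
        start = time.monotonic()
        ok = False
        try:
            result = func(*args, **kwargs)
            ok = True
            return result
        finally:
            with self._lock:
                self.inflight -= 1
                self.latencies_ms.append((time.monotonic() - start) * 1000)
                if ok:
                    self.oks += 1
                else:
                    self.not_oks += 1
```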

> You must reset metrics to keep them at `0` in Prometheus and Grafana before the jobs begin and after they end

## Look at metrics in grafana
40 changes: 40 additions & 0 deletions tests/slo/playground/README.md
@@ -0,0 +1,40 @@
# SLO playground

The playground may be used to test SLO workloads locally

It has several services:

- `prometheus` - storage for metrics
- `prometheus-pushgateway` - push gateway that accepts metrics for Prometheus
- `grafana` - provides charts for metrics
- `ydb` - local YDB database instance to run the workload against

## Network addresses

- Grafana dashboard: http://localhost:3000
- Prometheus pushgateway: http://localhost:9091
- YDB monitoring: http://localhost:8765
- YDB GRPC: grpc://localhost:2136
- YDB GRPC TLS: grpcs://localhost:2135

## Start

```shell
docker-compose up -d
```

## Stop

```shell
docker-compose down
```

## Configs

Grafana dashboards are stored in `configs/grafana/provisioning/dashboards`

## Data

YDB databases are not persistent

All other data, such as metrics and certs, is stored in `data/`
52 changes: 52 additions & 0 deletions tests/slo/playground/configs/chaos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/bin/sh -e

get_random_container() {
# Get a list of all containers starting with ydb-database-*
containers=$(docker ps --format '{{.Names}}' | grep '^ydb-database-')

    # Normalize the list to one container name per line
    containers=$(echo "$containers" | tr ' ' '\n')

# Count the number of containers
containersCount=$(echo "$containers" | wc -l)

# Generate a random number between 0 and containersCount - 1
randomIndex=$(shuf -i 0-$(($containersCount - 1)) -n 1)

# Get the container name at the random index
nodeForChaos=$(echo "$containers" | sed -n "$(($randomIndex + 1))p")
}


sleep 60

echo "Start CHAOS YDB cluster!"

for i in $(seq 1 5)
do
echo "[$(date)]: docker stop/start iteration $i"

get_random_container

sh -c "docker stop ${nodeForChaos} -t 10"
sh -c "docker start ${nodeForChaos}"

sleep 60
done

for i in $(seq 1 3)
do
echo "[$(date)]: docker restart iteration $i"

get_random_container

sh -c "docker restart ${nodeForChaos} -t 0"

sleep 60
done

get_random_container

echo "[$(date)]: docker kill -s SIGKILL ${nodeForChaos}"

sh -c "docker kill -s SIGKILL ${nodeForChaos}"