From 60e2cbbb2d40968ccccebb1f442ee7a6f787c099 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 29 Oct 2025 16:28:20 -0400 Subject: [PATCH 1/9] tooling for repartitioning --- v03_pipeline/ops/__init__.py | 0 .../ops/repartition_grch38_snv_indel.py | 116 ++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 v03_pipeline/ops/__init__.py create mode 100644 v03_pipeline/ops/repartition_grch38_snv_indel.py diff --git a/v03_pipeline/ops/__init__.py b/v03_pipeline/ops/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/ops/repartition_grch38_snv_indel.py b/v03_pipeline/ops/repartition_grch38_snv_indel.py new file mode 100644 index 000000000..b63b6cc49 --- /dev/null +++ b/v03_pipeline/ops/repartition_grch38_snv_indel.py @@ -0,0 +1,116 @@ +#################################################### +# +# This script is provided as a means to repartition an existing `GRCh38/SNV_INDEL/entries` table +# from a project-only partitioning strategy to one that includes project subpartitions. +# Very large genome projects with thousands of families will require additional +# splitting to maintain reasonable loading performance and to keep partition size under +# the recommended ClickHouse maximum. Unfortunately, a ClickHouse table partition definition +# is static upon creation, necessitating the expensive table re-write process demonstrated below. +# +# Post seqr-platform version x.x.x, the subpartitioning strategy is provided by default; +# earlier installations will continue to function as-is. This script is meant to +# be run under human supervision and with caution. 
+# +# At the end of this script, you should run the following SQL to finalize the migration: +# EXCHANGE TABLES seqr.`GRCh38/SNV_INDEL/entries` AND staging_grch38_snvindel_repartition.`GRCh38/SNV_INDEL/repartitioned_entries`; +# DROP DATABASE `staging_grch38_snvindel_repartition`; +# +# Resource Requirements: +# - Free disk space equal to 2.5x the usage of your current `GRCh38/SNV_INDEL/entries` table. +# +#################################################### +import argparse + +from v03_pipeline.lib.misc.clickhouse import ( + logged_query, + normalize_partition, +) + +DATABASE_NAME = 'staging_grch38_snvindel_repartition' + + +def get_partitions_for_project(project_guid: str): + rows = logged_query( + """ + SELECT DISTINCT partition + FROM system.parts + WHERE + database = %(database)s + AND table = %(table)s + AND partition like %(project_guids)s + """, + { + 'database': DATABASE_NAME, + 'table': 'GRCh38/SNV_INDEL/repartitioned_entries', + 'project_guid': project_guid, + }, + ) + return [normalize_partition(row[0]) for row in rows] + + +def main(max_insert_threads: int, project_guids: list[str]): + logged_query( + f""" + CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}; + """, + ) + logged_query( + f""" + CREATE TABLE IF NOT EXISTS {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + AS {DATABASE_NAME}.`GRCh38/SNV_INDEL/entries` PARTITION BY (project_guid, partition_id) + """, + ) + logged_query( + f""" + ALTER TABLE {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` ADD COLUMN `n_partitions` UInt8 MATERIALIZED dictGetOrDefault('GRCh38/SNV_INDEL/project_partitions_dict', 'n_partitions', project_guid, 1) AFTER `sign`; + """, + ) + logged_query( + f""" + ALTER TABLE {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` ADD COLUMN `partition_id` UInt8 MATERIALIZED farmHash64(family_guid) % n_partitions AFTER `n_partitions`; + """, + ) + if not project_guids: + project_guids = logged_query( + """ + SELECT DISTINCT project_guid from 
seqr.`GRCh38/SNV_INDEL/gt_stats` + """, + ) + for project_guid in project_guids: + for partition in get_partitions_for_project( + project_guid, + ): + logged_query( + f""" + ALTER TABLE {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + DROP PARTITION %(partition)s + """, + {'partition': partition}, + ) + logged_query( + f""" + INSERT INTO {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + SELECT * FROM `GRCh38/SNV_INDEL/entries + WHERE project_guid=%(project_guid)s + SETTINGS max_insert_threads=%(max_insert_threads)s + """, # noqa: S608 + {'project_guid': project_guid, 'max_insert_threads': max_insert_threads}, + ) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--max-insert-threads', + type=int, + default=4, + help='Maximum number of insert threads to use (default: 4).', + ) + parser.add_argument( + '--project-guids', + nargs='+', + required=True, + help='Optionally provide an override list of project guids: --project-guids proj1 proj2 proj3', + ) + args = parser.parse_args() + main(args.max_insert_threads, args.project_guids) From 337b3c7c99b0877144c591b1ea2c1a6e9cf77653 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 29 Oct 2025 17:16:36 -0400 Subject: [PATCH 2/9] lint --- v03_pipeline/ops/repartition_grch38_snv_indel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/ops/repartition_grch38_snv_indel.py b/v03_pipeline/ops/repartition_grch38_snv_indel.py index b63b6cc49..b7e482856 100644 --- a/v03_pipeline/ops/repartition_grch38_snv_indel.py +++ b/v03_pipeline/ops/repartition_grch38_snv_indel.py @@ -93,7 +93,7 @@ def main(max_insert_threads: int, project_guids: list[str]): SELECT * FROM `GRCh38/SNV_INDEL/entries WHERE project_guid=%(project_guid)s SETTINGS max_insert_threads=%(max_insert_threads)s - """, # noqa: S608 + """, # noqa: S608 {'project_guid': project_guid, 'max_insert_threads': max_insert_threads}, ) From 
e83c4f799e38b686ebefc23d943da1976dfcd732 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 30 Oct 2025 12:40:14 -0400 Subject: [PATCH 3/9] test scaffold --- ...epartition_clickhouse_grch38_snv_indel.py} | 11 +-- ...tition_clickhouse_grch38_snv_indel_test.py | 86 +++++++++++++++++++ 2 files changed, 92 insertions(+), 5 deletions(-) rename v03_pipeline/ops/{repartition_grch38_snv_indel.py => repartition_clickhouse_grch38_snv_indel.py} (91%) create mode 100644 v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py diff --git a/v03_pipeline/ops/repartition_grch38_snv_indel.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py similarity index 91% rename from v03_pipeline/ops/repartition_grch38_snv_indel.py rename to v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py index b7e482856..7b9c3f98b 100644 --- a/v03_pipeline/ops/repartition_grch38_snv_indel.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py @@ -21,6 +21,7 @@ #################################################### import argparse +from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import ( logged_query, normalize_partition, @@ -57,7 +58,7 @@ def main(max_insert_threads: int, project_guids: list[str]): logged_query( f""" CREATE TABLE IF NOT EXISTS {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` - AS {DATABASE_NAME}.`GRCh38/SNV_INDEL/entries` PARTITION BY (project_guid, partition_id) + AS {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` PARTITION BY (project_guid, partition_id) """, ) logged_query( @@ -72,8 +73,8 @@ def main(max_insert_threads: int, project_guids: list[str]): ) if not project_guids: project_guids = logged_query( - """ - SELECT DISTINCT project_guid from seqr.`GRCh38/SNV_INDEL/gt_stats` + f""" + SELECT DISTINCT project_guid from {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/gt_stats` """, ) for project_guid in project_guids: @@ -90,10 +91,10 @@ def main(max_insert_threads: int, 
project_guids: list[str]): logged_query( f""" INSERT INTO {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` - SELECT * FROM `GRCh38/SNV_INDEL/entries + SELECT * FROM {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries WHERE project_guid=%(project_guid)s SETTINGS max_insert_threads=%(max_insert_threads)s - """, # noqa: S608 + """, {'project_guid': project_guid, 'max_insert_threads': max_insert_threads}, ) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py new file mode 100644 index 000000000..53068f03f --- /dev/null +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -0,0 +1,86 @@ +import unittest + +from repartion_clickhouse_grch38_snv_indel import main + +from v03_pipeline.lib.core.environment import Env +from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client + + +class RepartitionGRCh38SnvIndelTest(unittest.TestCase): + def setUp(self): + client = get_clickhouse_client() + client.execute( + f""" + DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE}; + """, + ) + client.execute( + f""" + CREATE DATABASE {Env.CLICKHOUSE_DATABASE}; + """, + ) + client.execute( + f""" + CREATE TABLE {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` ( + `key` UInt32, + `project_guid` LowCardinality(String), + `family_guid` String, + `is_annotated_in_any_gene` Boolean, + `sign` Int8, + PROJECTION xpos_projection + ( + SELECT * + ORDER BY is_annotated_in_any_gene + ) + ) + ENGINE = CollapsingMergeTree(sign) + PARTITION BY project_guid + ORDER BY (project_guid, family_guid, key) + SETTINGS deduplicate_merge_projection_mode = 'rebuild'; + """, + ) + client.execute( + f""" + INSERT INTO {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` + VALUES + (0, 'project_a', 'family_a1', 0, 1), + (1, 'project_a', 'family_a2', 0, 1), + (2, 'project_a', 'family_a3', 0, 1), + (3, 'project_a', 'family_a4', 0, 1), + (4, 'project_a', 'family_a5', 1, 1), + (4, 
'project_a', 'family_a6', 0, 1), + (0, 'project_b', 'family_b1', 0, 1), + (1, 'project_b', 'family_b2', 0, 1), + (2, 'project_b', 'family_b2', 0, 1), + (3, 'project_b', 'family_b3', 0, 1), + (4, 'project_b', 'family_b3', 0, 1), + (0, 'project_c', 'family_c1', 1, 1), + (3, 'project_c', 'family_c2', 1, 1), + (4, 'project_c', 'family_c3', 1, 1), + (5, 'project_c', 'family_c4', 1, 1) + """, + ) + client.execute( + f""" + CREATE DICTIONARY {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/project_partitions_dict` + ( + `project_guid` String, + `n_partitions` UInt8 + ) + PRIMARY KEY project_guid + SOURCE( + CLICKHOUSE( + USER {Env.CLICKHOUSE_WRITER_USER} PASSWORD {Env.CLICKHOUSE_WRITER_PASSWORD} + DB {Env.CLICKHOUSE_DATABASE} QUERY 'SELECT project_guid, 3 FROM `GRCh38/SNV_INDEL/entries`' + ) + ) + LIFETIME(0) + LAYOUT(FLAT(MAX_ARRAY_SIZE 10000)) + """, + ) + + def test_main_all_projects(self): + main(1, []) + + def test_main_one_project(self): + main(1, ['project_a']) From 0afa8478e4df21cabc3b7e5a6a359c1e6036cf99 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 30 Oct 2025 12:57:03 -0400 Subject: [PATCH 4/9] ops --- .../ops/repartition_clickhouse_grch38_snv_indel_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py index 53068f03f..dbf01a3dd 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -1,6 +1,6 @@ import unittest -from repartion_clickhouse_grch38_snv_indel import main +from v03_pipeline.ops.repartion_clickhouse_grch38_snv_indel import main from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client From 388b6f52da4b0c1ece065de58a221f79132e148a Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 30 Oct 2025 12:57:44 -0400 Subject: 
[PATCH 5/9] ruff --- .../ops/repartition_clickhouse_grch38_snv_indel_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py index dbf01a3dd..33ab44e5f 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -1,9 +1,8 @@ import unittest -from v03_pipeline.ops.repartion_clickhouse_grch38_snv_indel import main - from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client +from v03_pipeline.ops.repartion_clickhouse_grch38_snv_indel import main class RepartitionGRCh38SnvIndelTest(unittest.TestCase): From 4e159197f240b5d52227da26bd33c736763101a9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 30 Oct 2025 14:00:42 -0400 Subject: [PATCH 6/9] fix up --- .../ops/repartition_clickhouse_grch38_snv_indel_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py index 33ab44e5f..ffe50bc52 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -2,7 +2,7 @@ from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client -from v03_pipeline.ops.repartion_clickhouse_grch38_snv_indel import main +from v03_pipeline.ops.repartition_clickhouse_grch38_snv_indel import main class RepartitionGRCh38SnvIndelTest(unittest.TestCase): From cbf46735ada27d4af6fad339935cfb7a3ae66ae1 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 31 Oct 2025 14:56:24 -0400 Subject: [PATCH 7/9] cleanup and tests passing --- ...repartition_clickhouse_grch38_snv_indel.py | 39 ++++++-------- 
...tition_clickhouse_grch38_snv_indel_test.py | 54 +++++++++++++++---- 2 files changed, 59 insertions(+), 34 deletions(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py index 7b9c3f98b..fab4f1439 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py @@ -27,7 +27,7 @@ normalize_partition, ) -DATABASE_NAME = 'staging_grch38_snvindel_repartition' +REPARTITION_DATABASE_NAME = 'staging_grch38_snvindel_repartition' def get_partitions_for_project(project_guid: str): @@ -38,12 +38,12 @@ def get_partitions_for_project(project_guid: str): WHERE database = %(database)s AND table = %(table)s - AND partition like %(project_guids)s + AND partition like %(project_guid)s """, { - 'database': DATABASE_NAME, + 'database': REPARTITION_DATABASE_NAME, 'table': 'GRCh38/SNV_INDEL/repartitioned_entries', - 'project_guid': project_guid, + 'project_guid': f'%{project_guid}%', }, ) return [normalize_partition(row[0]) for row in rows] @@ -52,46 +52,39 @@ def get_partitions_for_project(project_guid: str): def main(max_insert_threads: int, project_guids: list[str]): logged_query( f""" - CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}; + CREATE DATABASE IF NOT EXISTS {REPARTITION_DATABASE_NAME}; """, ) logged_query( f""" - CREATE TABLE IF NOT EXISTS {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + CREATE TABLE IF NOT EXISTS {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` AS {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` PARTITION BY (project_guid, partition_id) """, ) - logged_query( - f""" - ALTER TABLE {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` ADD COLUMN `n_partitions` UInt8 MATERIALIZED dictGetOrDefault('GRCh38/SNV_INDEL/project_partitions_dict', 'n_partitions', project_guid, 1) AFTER `sign`; - """, - ) - logged_query( - f""" - ALTER TABLE 
{DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` ADD COLUMN `partition_id` UInt8 MATERIALIZED farmHash64(family_guid) % n_partitions AFTER `n_partitions`; - """, - ) if not project_guids: - project_guids = logged_query( - f""" - SELECT DISTINCT project_guid from {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/gt_stats` + project_guids = [ + x[0] + for x in logged_query( + f""" + SELECT DISTINCT project_guid from {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` """, - ) + ) + ] for project_guid in project_guids: for partition in get_partitions_for_project( project_guid, ): logged_query( f""" - ALTER TABLE {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + ALTER TABLE {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` DROP PARTITION %(partition)s """, {'partition': partition}, ) logged_query( f""" - INSERT INTO {DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` - SELECT * FROM {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries + INSERT INTO {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + SELECT * FROM {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` WHERE project_guid=%(project_guid)s SETTINGS max_insert_threads=%(max_insert_threads)s """, diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py index ffe50bc52..7723088cc 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -2,7 +2,7 @@ from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client -from v03_pipeline.ops.repartition_clickhouse_grch38_snv_indel import main +from v03_pipeline.ops.repartition_clickhouse_grch38_snv_indel import main, REPARTITION_DATABASE_NAME class RepartitionGRCh38SnvIndelTest(unittest.TestCase): @@ -10,12 +10,16 @@ def setUp(self): client = get_clickhouse_client() client.execute( 
f""" - DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE}; + DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE} + PARALLEL WITH + DROP DATABASE IF EXISTS {REPARTITION_DATABASE_NAME}; """, ) client.execute( f""" - CREATE DATABASE {Env.CLICKHOUSE_DATABASE}; + CREATE DATABASE {Env.CLICKHOUSE_DATABASE} + PARALLEL WITH + CREATE DATABASE {REPARTITION_DATABASE_NAME}; """, ) client.execute( @@ -26,6 +30,8 @@ def setUp(self): `family_guid` String, `is_annotated_in_any_gene` Boolean, `sign` Int8, + `n_partitions` UInt8 MATERIALIZED 2, + `partition_id` UInt8 MATERIALIZED farmHash64(family_guid) % n_partitions, PROJECTION xpos_projection ( SELECT * @@ -45,18 +51,11 @@ def setUp(self): (0, 'project_a', 'family_a1', 0, 1), (1, 'project_a', 'family_a2', 0, 1), (2, 'project_a', 'family_a3', 0, 1), - (3, 'project_a', 'family_a4', 0, 1), - (4, 'project_a', 'family_a5', 1, 1), - (4, 'project_a', 'family_a6', 0, 1), (0, 'project_b', 'family_b1', 0, 1), (1, 'project_b', 'family_b2', 0, 1), (2, 'project_b', 'family_b2', 0, 1), - (3, 'project_b', 'family_b3', 0, 1), - (4, 'project_b', 'family_b3', 0, 1), (0, 'project_c', 'family_c1', 1, 1), (3, 'project_c', 'family_c2', 1, 1), - (4, 'project_c', 'family_c3', 1, 1), - (5, 'project_c', 'family_c4', 1, 1) """, ) client.execute( @@ -69,7 +68,7 @@ def setUp(self): PRIMARY KEY project_guid SOURCE( CLICKHOUSE( - USER {Env.CLICKHOUSE_WRITER_USER} PASSWORD {Env.CLICKHOUSE_WRITER_PASSWORD} + USER '{Env.CLICKHOUSE_WRITER_USER}' PASSWORD '{Env.CLICKHOUSE_WRITER_PASSWORD}' DB {Env.CLICKHOUSE_DATABASE} QUERY 'SELECT project_guid, 3 FROM `GRCh38/SNV_INDEL/entries`' ) ) @@ -79,7 +78,40 @@ def setUp(self): ) def test_main_all_projects(self): + client = get_clickhouse_client() main(1, []) + res = client.execute( + f""" + SELECT *, n_partitions, partition_id FROM {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + """ + ) + self.assertCountEqual( + res, + [ + (3, 'project_c', 'family_c2', True, 1, 2, 1), + (0, 'project_b', 'family_b1', 0, 
1, 2, 1), + (1, 'project_b', 'family_b2', 0, 1, 2, 1), + (2, 'project_b', 'family_b2', 0, 1, 2, 1), + (2, 'project_a', 'family_a3', 0, 1, 2, 0), + (0, 'project_a', 'family_a1', 0, 1, 2, 1), + (1, 'project_a', 'family_a2', 0, 1, 2, 1), + (0, 'project_c', 'family_c1', True, 1, 2, 0), + ], + ) def test_main_one_project(self): + client = get_clickhouse_client() main(1, ['project_a']) + res = client.execute( + f""" + SELECT *, n_partitions, partition_id FROM {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` + """ + ) + self.assertCountEqual( + res, + [ + (2, 'project_a', 'family_a3', 0, 1, 2, 0), + (0, 'project_a', 'family_a1', 0, 1, 2, 1), + (1, 'project_a', 'family_a2', 0, 1, 2, 1), + ], + ) From d7df7e2ed0a86a987718a2fbd4a33320455d665c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 31 Oct 2025 15:06:29 -0400 Subject: [PATCH 8/9] ruff --- .../ops/repartition_clickhouse_grch38_snv_indel_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py index 7723088cc..e09cab5fc 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel_test.py @@ -2,7 +2,10 @@ from v03_pipeline.lib.core.environment import Env from v03_pipeline.lib.misc.clickhouse import get_clickhouse_client -from v03_pipeline.ops.repartition_clickhouse_grch38_snv_indel import main, REPARTITION_DATABASE_NAME +from v03_pipeline.ops.repartition_clickhouse_grch38_snv_indel import ( + REPARTITION_DATABASE_NAME, + main, +) class RepartitionGRCh38SnvIndelTest(unittest.TestCase): @@ -83,7 +86,7 @@ def test_main_all_projects(self): res = client.execute( f""" SELECT *, n_partitions, partition_id FROM {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` - """ + """, ) self.assertCountEqual( res, @@ -105,7 +108,7 @@ def 
test_main_one_project(self): res = client.execute( f""" SELECT *, n_partitions, partition_id FROM {REPARTITION_DATABASE_NAME}.`GRCh38/SNV_INDEL/repartitioned_entries` - """ + """, ) self.assertCountEqual( res, From ac6d938994f775aff9f5e4693e472d969a504623 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 31 Oct 2025 15:35:57 -0400 Subject: [PATCH 9/9] optionally! --- v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py index fab4f1439..ffc0a52f8 100644 --- a/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py +++ b/v03_pipeline/ops/repartition_clickhouse_grch38_snv_indel.py @@ -103,7 +103,7 @@ def main(max_insert_threads: int, project_guids: list[str]): parser.add_argument( '--project-guids', nargs='+', - required=True, + required=False, help='Optionally provide an override list of project guids: --project-guids proj1 proj2 proj3', ) args = parser.parse_args()