@@ -12,6 +12,7 @@
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome
 from v03_pipeline.lib.core.environment import Env
 from v03_pipeline.lib.logger import get_logger
+from v03_pipeline.lib.misc.math import split_ranges
 from v03_pipeline.lib.misc.retry import retry
 from v03_pipeline.lib.paths import (
     new_entries_parquet_path,
@@ -793,12 +794,27 @@ def rebuild_gt_stats(
         table_name_builder.dst_prefix,
         table_name_builder.staging_dst_prefix,
     )
-    logged_query(
+    # NB: encountered OOMs with large projects, necessitating sharding the insertion query.
+    max_key = logged_query(
         f"""
-        INSERT INTO {table_name_builder.staging_dst_table(ClickHouseTable.PROJECT_GT_STATS)}
-        {select_statement}
+        SELECT max(key) FROM {table_name_builder.dst_table(ClickHouseTable.GT_STATS)}
         """,
-    )
+    )[0][0]
+    for range_start, range_end in split_ranges(max_key):
+        logged_query(
+            f"""
+            INSERT INTO {
+                table_name_builder.staging_dst_table(ClickHouseTable.PROJECT_GT_STATS)
+            }
+            {
+                select_statement.replace(
+                    'GROUP BY project_guid',
+                    'WHERE key >= %(range_start)s AND key <= %(range_end)s GROUP BY project_guid',
+                )
+            }
+            """,
+            {'range_start': range_start, 'range_end': range_end},
+        )
     finalize_refresh_flow(table_name_builder, project_guids)
 
 
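For readers of this diff, here is a minimal sketch of what the imported `split_ranges` helper could look like, assuming it yields inclusive `(start, end)` pairs covering `0..max_key` in fixed-size shards. The parameter name, its default, and the implementation details are assumptions for illustration, not taken from `v03_pipeline.lib.misc.math`.

```python
from collections.abc import Iterator


# Hypothetical sketch only; the real v03_pipeline.lib.misc.math.split_ranges
# may differ in signature, shard size, and edge-case handling.
def split_ranges(
    max_key: int,
    shard_size: int = 10_000_000,  # assumed default, not from the diff
) -> Iterator[tuple[int, int]]:
    """Yield inclusive (start, end) key ranges covering 0..max_key."""
    start = 0
    while start <= max_key:
        end = min(start + shard_size - 1, max_key)
        yield start, end
        start = end + 1
```

Each yielded pair bounds one sharded `INSERT ... SELECT` through the `key >= %(range_start)s AND key <= %(range_end)s` predicate, keeping the per-query working set small enough to avoid the OOMs noted in the comment above.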