Skip to content

Commit b8ee4a0

Browse files
committed
Add chunk size argument
1 parent f9a7d3b commit b8ee4a0

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

bolt/common/pcgr.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,12 +524,17 @@ def annotate_record(record, annotations, *, allow_missing=False):
524524

525525
return record
526526

527-
def split_vcf(input_vcf, output_dir):
527+
def split_vcf(input_vcf, output_dir, *, max_variants=None):
528528
"""
529529
Splits a VCF file into multiple chunks, each containing up to max_variants variants.
530530
Each chunk includes the VCF header.
531531
Ensures no overlapping positions between chunks.
532532
"""
533+
if max_variants is None:
534+
max_variants = constants.MAX_SOMATIC_VARIANTS
535+
elif max_variants <= 0:
536+
raise ValueError("max_variants must be a positive integer.")
537+
533538
output_dir = pathlib.Path(output_dir / "vcf_chunks")
534539
output_dir.mkdir(parents=True, exist_ok=True)
535540
chunk_files = []
@@ -547,7 +552,7 @@ def split_vcf(input_vcf, output_dir):
547552
for record in vcf_in:
548553
current_position = record.POS
549554
# Check if we need to start a new chunk
550-
if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
555+
if variant_count >= max_variants and (last_position is None or current_position != last_position):
551556
# Close the current chunk file and start a new one
552557
vcf_out.close()
553558
chunk_number += 1

bolt/workflows/smlv_somatic/annotate.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
@click.option('--threads', required=False, default=4, type=int)
3131

3232
@click.option('--output_dir', required=True, type=click.Path())
33+
@click.option('--pcgr_variant_chunk_size', required=False, type=int, help='Override maximum variants per PCGR chunk.')
3334

3435
def entry(ctx, **kwargs):
3536
'''Annotate variants with information from several sources\f
@@ -99,8 +100,13 @@ def entry(ctx, **kwargs):
99100
print(f"Total number of variants in the input VCF: {total_variants}")
100101

101102
# Run PCGR in chunks if the variant count exceeds the chunk size (default: the maximum allowed for somatic variants)
102-
if total_variants > constants.MAX_SOMATIC_VARIANTS:
103-
vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir)
103+
chunk_size = kwargs.get('pcgr_variant_chunk_size')
104+
if chunk_size is not None and chunk_size <= 0:
105+
raise click.BadParameter('must be a positive integer', param_hint='--pcgr_variant_chunk_size')
106+
chunk_size = chunk_size or constants.MAX_SOMATIC_VARIANTS
107+
108+
if total_variants > chunk_size:
109+
vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir, max_variants=chunk_size)
104110
pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic_chunck(
105111
vcf_chunks,
106112
kwargs['pcgr_data_dir'],

0 commit comments

Comments
 (0)