From 6bada08ca2a84d38ba90c0ffc5b50f44cfbe4ac9 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri
Date: Sun, 15 Sep 2024 15:25:08 -0400
Subject: [PATCH 1/5] Add WDL script for normalizing VCF files by removing HAPCOMP and HAPDOM fields

---
Reviewer notes (this region is not part of the commit):
  * Adds a three-step workflow: strip INFO/HAPCOMP, strip INFO/HAPDOM, then
    run bcftools norm against the reference FASTA.
  * NOTE(review): every command pipes bcftools into bgzip and none sets
    pipefail, so a bcftools failure is presumably masked by bgzip's exit
    status. Confirm, and consider `set -euo pipefail` in a follow-up.

 wdl/malaria/NormalizeVCF.wdl | 94 ++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 wdl/malaria/NormalizeVCF.wdl

diff --git a/wdl/malaria/NormalizeVCF.wdl b/wdl/malaria/NormalizeVCF.wdl
new file mode 100644
index 000000000..a3ce4d6f7
--- /dev/null
+++ b/wdl/malaria/NormalizeVCF.wdl
@@ -0,0 +1,94 @@
+version 1.0
+
+workflow NormalizeVCF {
+    # Define inputs in an input block
+    input {
+        File input_vcf
+        File reference_fa
+    }
+
+    # Step 1: Remove HAPCOMP field
+    call RemoveHAPCOMP {
+        input:
+            input_vcf = input_vcf
+    }
+
+    # Step 2: Remove HAPDOM field
+    call RemoveHAPDOM {
+        input:
+            input_vcf = RemoveHAPCOMP.output_vcf
+    }
+
+    # Step 3: Normalize VCF
+    call NormalizeVCFFile {
+        input:
+            input_vcf = RemoveHAPDOM.output_vcf,
+            reference_fa = reference_fa
+    }
+
+    # Output
+    output {
+        File normalized_vcf = NormalizeVCFFile.output_vcf
+    }
+}
+
+task RemoveHAPCOMP {
+    input {
+        File input_vcf
+    }
+
+    command {
+        bcftools annotate -x 'INFO/HAPCOMP' ~{input_vcf} | bgzip -c > output.no_hapcomp.vcf.gz
+    }
+
+    output {
+        File output_vcf = "output.no_hapcomp.vcf.gz"
+    }
+
+    runtime {
+        docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
+        memory: "4G"
+        cpu: 1
+    }
+}
+
+task RemoveHAPDOM {
+    input {
+        File input_vcf
+    }
+
+    command {
+        bcftools annotate -x 'INFO/HAPDOM' ~{input_vcf} | bgzip -c > output.no_hapdom.vcf.gz
+    }
+
+    output {
+        File output_vcf = "output.no_hapdom.vcf.gz"
+    }
+
+    runtime {
+        docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
+        memory: "4G"
+        cpu: 1
+    }
+}
+
+task NormalizeVCFFile {
+    input {
+        File input_vcf
+        File reference_fa
+    }
+
+    command {
+        bcftools norm -m -any --atom-overlaps . -f ~{reference_fa} ~{input_vcf} | bgzip -c > output.norm.vcf.gz
+    }
+
+    output {
+        File output_vcf = "output.norm.vcf.gz"
+    }
+
+    runtime {
+        docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
+        memory: "4G"
+        cpu: 1
+    }
+}

From 5955ce5093993353628d1ebc95460f1496520fbf Mon Sep 17 00:00:00 2001
From: Shadi Zaheri
Date: Sun, 15 Sep 2024 15:30:41 -0400
Subject: [PATCH 2/5] Disk SSD

---
Reviewer notes (this region is not part of the commit):
  * Threads disk_size_gb / memory_gb / cpu_cores from the workflow into all
    three tasks and drops the step comments from the workflow body.
  * The subject says SSD, but only RemoveHAPDOM gets an SSD disk here;
    RemoveHAPCOMP and NormalizeVCFFile are given HDD.

 wdl/malaria/NormalizeVCF.wdl | 47 +++++++++++++++++++++++++-----------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/wdl/malaria/NormalizeVCF.wdl b/wdl/malaria/NormalizeVCF.wdl
index a3ce4d6f7..09a8a6e32 100644
--- a/wdl/malaria/NormalizeVCF.wdl
+++ b/wdl/malaria/NormalizeVCF.wdl
@@ -1,32 +1,39 @@
 version 1.0
 
 workflow NormalizeVCF {
-    # Define inputs in an input block
     input {
         File input_vcf
         File reference_fa
+        Int disk_size_gb = 20 # Default disk size in GB
+        Int memory_gb = 4 # Default memory size in GB
+        Int cpu_cores = 1 # Default number of CPU cores
     }
 
-    # Step 1: Remove HAPCOMP field
     call RemoveHAPCOMP {
         input:
-            input_vcf = input_vcf
+            input_vcf = input_vcf,
+            disk_size_gb = disk_size_gb,
+            memory_gb = memory_gb,
+            cpu_cores = cpu_cores
     }
 
-    # Step 2: Remove HAPDOM field
     call RemoveHAPDOM {
         input:
-            input_vcf = RemoveHAPCOMP.output_vcf
+            input_vcf = RemoveHAPCOMP.output_vcf,
+            disk_size_gb = disk_size_gb,
+            memory_gb = memory_gb,
+            cpu_cores = cpu_cores
     }
 
-    # Step 3: Normalize VCF
     call NormalizeVCFFile {
         input:
             input_vcf = RemoveHAPDOM.output_vcf,
-            reference_fa = reference_fa
+            reference_fa = reference_fa,
+            disk_size_gb = disk_size_gb,
+            memory_gb = memory_gb,
+            cpu_cores = cpu_cores
     }
 
-    # Output
     output {
         File normalized_vcf = NormalizeVCFFile.output_vcf
     }
@@ -35,6 +42,9 @@ workflow NormalizeVCF {
 task RemoveHAPCOMP {
     input {
         File input_vcf
+        Int disk_size_gb
+        Int memory_gb
+        Int cpu_cores
     }
 
     command {
@@ -47,14 +57,18 @@ task RemoveHAPCOMP {
 
     runtime {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
-        memory: "4G"
-        cpu: 1
+        memory: "~{memory_gb}G"
+        cpu: "~{cpu_cores}"
+        disks: "local-disk ~{disk_size_gb} HDD"
     }
 }
 
 task RemoveHAPDOM {
     input {
         File input_vcf
+        Int disk_size_gb
+        Int memory_gb
+        Int cpu_cores
     }
 
     command {
@@ -67,8 +81,9 @@ task RemoveHAPDOM {
 
     runtime {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
-        memory: "4G"
-        cpu: 1
+        memory: "~{memory_gb}G"
+        cpu: "~{cpu_cores}"
+        disks: "local-disk ~{disk_size_gb} SSD"
     }
 }
 
@@ -76,6 +91,9 @@ task NormalizeVCFFile {
     input {
         File input_vcf
         File reference_fa
+        Int disk_size_gb
+        Int memory_gb
+        Int cpu_cores
     }
 
     command {
@@ -88,7 +106,8 @@ task NormalizeVCFFile {
 
     runtime {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
-        memory: "4G"
-        cpu: 1
+        memory: "~{memory_gb}G"
+        cpu: "~{cpu_cores}"
+        disks: "local-disk ~{disk_size_gb} HDD"
     }
 }

From 9a4627e4a7378d2b63f229b8bd9cc39f1fbf00b7 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri
Date: Sun, 15 Sep 2024 15:48:10 -0400
Subject: [PATCH 3/5] sample id

---
Reviewer notes (this region is not part of the commit):
  * Adds a required sample_name input and uses it to name the normalized
    VCF (~{sample_name}.norm.vcf.gz) instead of output.norm.vcf.gz.
  * Also swaps disk types: RemoveHAPDOM goes SSD -> HDD and
    NormalizeVCFFile goes HDD -> SSD. The subject does not mention this.

 wdl/malaria/NormalizeVCF.wdl | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/wdl/malaria/NormalizeVCF.wdl b/wdl/malaria/NormalizeVCF.wdl
index 09a8a6e32..edbf17d2c 100644
--- a/wdl/malaria/NormalizeVCF.wdl
+++ b/wdl/malaria/NormalizeVCF.wdl
@@ -4,9 +4,10 @@ workflow NormalizeVCF {
     input {
         File input_vcf
         File reference_fa
-        Int disk_size_gb = 20 # Default disk size in GB
-        Int memory_gb = 4 # Default memory size in GB
-        Int cpu_cores = 1 # Default number of CPU cores
+        String sample_name # New input for sample name
+        Int disk_size_gb = 20
+        Int memory_gb = 4
+        Int cpu_cores = 1
     }
 
     call RemoveHAPCOMP {
@@ -29,6 +30,7 @@ workflow NormalizeVCF {
         input:
             input_vcf = RemoveHAPDOM.output_vcf,
             reference_fa = reference_fa,
+            sample_name = sample_name,
             disk_size_gb = disk_size_gb,
             memory_gb = memory_gb,
             cpu_cores = cpu_cores
@@ -83,7 +85,7 @@ task RemoveHAPDOM {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
         memory: "~{memory_gb}G"
         cpu: "~{cpu_cores}"
-        disks: "local-disk ~{disk_size_gb} SSD"
+        disks: "local-disk ~{disk_size_gb} HDD"
     }
 }
 
@@ -91,23 +93,24 @@ task NormalizeVCFFile {
     input {
         File input_vcf
         File reference_fa
+        String sample_name # Input for the sample name
         Int disk_size_gb
         Int memory_gb
         Int cpu_cores
     }
 
     command {
-        bcftools norm -m -any --atom-overlaps . -f ~{reference_fa} ~{input_vcf} | bgzip -c > output.norm.vcf.gz
+        bcftools norm -m -any --atom-overlaps . -f ~{reference_fa} ~{input_vcf} | bgzip -c > ~{sample_name}.norm.vcf.gz
     }
 
     output {
-        File output_vcf = "output.norm.vcf.gz"
+        File output_vcf = "~{sample_name}.norm.vcf.gz"
     }
 
     runtime {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
         memory: "~{memory_gb}G"
         cpu: "~{cpu_cores}"
-        disks: "local-disk ~{disk_size_gb} HDD"
+        disks: "local-disk ~{disk_size_gb} SSD"
     }
 }

From af3c2cad53fd92f58c983e96aec45b382c90a703 Mon Sep 17 00:00:00 2001
From: Shadi Zaheri
Date: Sun, 15 Sep 2024 16:00:00 -0400
Subject: [PATCH 4/5] output indexed files

---
Reviewer notes (this region is not part of the commit):
  * Indexes the normalized VCF with `bcftools index` and exposes the .csi
    file as normalized_vcf_index.
  * Also raises the workflow defaults (disk 20 -> 30, memory 4 -> 8,
    cpu 1 -> 4) and moves RemoveHAPCOMP / RemoveHAPDOM to SSD disks.

 wdl/malaria/NormalizeVCF.wdl | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/wdl/malaria/NormalizeVCF.wdl b/wdl/malaria/NormalizeVCF.wdl
index edbf17d2c..6854813f7 100644
--- a/wdl/malaria/NormalizeVCF.wdl
+++ b/wdl/malaria/NormalizeVCF.wdl
@@ -4,10 +4,10 @@ workflow NormalizeVCF {
     input {
         File input_vcf
         File reference_fa
-        String sample_name # New input for sample name
-        Int disk_size_gb = 20
-        Int memory_gb = 4
-        Int cpu_cores = 1
+        String sample_name
+        Int disk_size_gb = 30
+        Int memory_gb = 8
+        Int cpu_cores = 4
     }
 
     call RemoveHAPCOMP {
@@ -38,6 +38,7 @@ workflow NormalizeVCF {
 
     output {
         File normalized_vcf = NormalizeVCFFile.output_vcf
+        File normalized_vcf_index = NormalizeVCFFile.output_vcf_index
     }
 }
 
@@ -61,7 +62,7 @@ task RemoveHAPCOMP {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
         memory: "~{memory_gb}G"
         cpu: "~{cpu_cores}"
-        disks: "local-disk ~{disk_size_gb} HDD"
+        disks: "local-disk ~{disk_size_gb} SSD"
     }
 }
 
@@ -85,7 +86,7 @@ task RemoveHAPDOM {
         docker: "us.gcr.io/broad-dsp-lrma/bcftools_htslib:v9152024"
         memory: "~{memory_gb}G"
         cpu: "~{cpu_cores}"
-        disks: "local-disk ~{disk_size_gb} HDD"
+        disks: "local-disk ~{disk_size_gb} SSD"
     }
 }
 
@@ -100,11 +101,16 @@ task NormalizeVCFFile {
     }
 
     command {
+        # Normalize the VCF
         bcftools norm -m -any --atom-overlaps . -f ~{reference_fa} ~{input_vcf} | bgzip -c > ~{sample_name}.norm.vcf.gz
+
+        # Index the normalized VCF
+        bcftools index ~{sample_name}.norm.vcf.gz
     }
 
     output {
         File output_vcf = "~{sample_name}.norm.vcf.gz"
+        File output_vcf_index = "~{sample_name}.norm.vcf.gz.csi"
     }
 
     runtime {

From 0bfb471a8aaccf57db7c4e99dfe0eb9160f2b4bd Mon Sep 17 00:00:00 2001
From: Shadi Zaheri
Date: Sun, 15 Sep 2024 19:16:26 -0400
Subject: [PATCH 5/5] add .tbi

---
Reviewer notes (this region is not part of the commit):
  * Switches indexing from `bcftools index` (.csi) to `tabix -p vcf`
    (.tbi) and updates the declared index output to match.
  * Also raises the workflow defaults again (disk 30 -> 50, memory 8 -> 16,
    cpu 4 -> 8) and restores the "# New input for sample name" comment that
    patch 4/5 had removed.

 wdl/malaria/NormalizeVCF.wdl | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/wdl/malaria/NormalizeVCF.wdl b/wdl/malaria/NormalizeVCF.wdl
index 6854813f7..ff7901940 100644
--- a/wdl/malaria/NormalizeVCF.wdl
+++ b/wdl/malaria/NormalizeVCF.wdl
@@ -4,10 +4,10 @@ workflow NormalizeVCF {
     input {
         File input_vcf
         File reference_fa
-        String sample_name
-        Int disk_size_gb = 30
-        Int memory_gb = 8
-        Int cpu_cores = 4
+        String sample_name # New input for sample name
+        Int disk_size_gb = 50
+        Int memory_gb = 16
+        Int cpu_cores = 8
     }
 
     call RemoveHAPCOMP {
@@ -104,13 +104,13 @@ task NormalizeVCFFile {
         # Normalize the VCF
         bcftools norm -m -any --atom-overlaps . -f ~{reference_fa} ~{input_vcf} | bgzip -c > ~{sample_name}.norm.vcf.gz
 
-        # Index the normalized VCF
-        bcftools index ~{sample_name}.norm.vcf.gz
+        # Index the normalized VCF with tabix (.tbi)
+        tabix -p vcf ~{sample_name}.norm.vcf.gz
     }
 
     output {
         File output_vcf = "~{sample_name}.norm.vcf.gz"
-        File output_vcf_index = "~{sample_name}.norm.vcf.gz.csi"
+        File output_vcf_index = "~{sample_name}.norm.vcf.gz.tbi"
     }
 
     runtime {