diff --git a/.dockstore.yml b/.dockstore.yml
index e2cdb4871e1..de0622ed3e9 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -344,7 +344,7 @@ workflows:
       branches:
         - master
         - ah_var_store
-        - master_merge_2025_07_16
+        - ploidy_status_rows
       tags:
         - /.*/
   - name: GvsIngestTieout
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index 57d3146e258..1d924715909 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -131,7 +131,7 @@ task GetToolVersions {
     String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:524.0.0-slim"
     String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2025-06-30-alpine-61f482e29ee5"
     String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
-    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-16-gatkbase-92f3d8e94af8"
+    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2025-07-17-gatkbase-04a2f72cdb4b"
     String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
     String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
     String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
index d3ef61e59b9..2fac466b6e1 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
@@ -170,6 +170,8 @@ public final class CreateVariantIngestFiles extends VariantWalker {
 
     private boolean shouldWriteVCFHeadersLoadedStatusRow = false;
 
+    private boolean shouldWritePloidiesLoadedStatusRow = false;
+
     private final Set<GQStateEnum> gqStatesToIgnore = new HashSet<>();
 
     // getGenotypes() returns list of lists for all samples at variant
@@ -229,6 +231,7 @@ public void onTraversalStart() {
         Boolean refRangesRowsExist = null;
         Boolean vetRowsExist = null;
         Boolean vcfHeaderRowsExist = null;
+        Boolean ploidyRowsExist = null;
 
         // If BQ, check the load status table to see if this sample has already been loaded.
         if (outputType == CommonCode.OutputType.BQ) {
@@ -257,6 +260,20 @@ public void onTraversalStart() {
                 }
                 shouldWriteReferencesLoadedStatusRow = true;
             }
+
+            if (state.arePloidiesLoaded()) {
+                logger.info("Sample ID {}: Reference ranges writing enabled but PLOIDIES_LOADED status row found, skipping.", sampleId);
+            } else {
+                logger.info("Sample ID {}: Reference ranges writing enabled and no PLOIDIES_LOADED status row found, looking for existing rows.", sampleId);
+                ploidyRowsExist = SamplePloidyCreator.doRowsExistFor(outputType, projectID, datasetName, sampleId);
+                if (ploidyRowsExist) {
+                    logger.warn("Reference ranges enabled for sample id = {}, name = {} but preexisting ploidy rows found, skipping ploidy data writes.",
+                            sampleId, sampleName);
+                } else {
+                    logger.info("Sample ID {}: No preexisting ploidy rows found.", sampleId);
+                }
+                shouldWritePloidiesLoadedStatusRow = true;
+            }
         }
 
         if (enableVet) {
@@ -299,14 +316,23 @@ public void onTraversalStart() {
         // some class members that the CreateVariantIngestFilesTest expects to be initialized.
         SAMSequenceDictionary seqDictionary = initializeGQConfigurationAndIntervals();
 
-        if (enableReferenceRanges && refRangesRowsExist == Boolean.FALSE) {
-            logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
-            refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
+        if (enableReferenceRanges && (refRangesRowsExist == Boolean.FALSE || ploidyRowsExist == Boolean.FALSE)) {
+            // The refCreator is required if either reference ranges or ploidy data is being written, but should only
+            // write reference ranges if there are no preexisting reference range rows for this sample.
+            if (refRangesRowsExist == Boolean.FALSE) {
+                logger.info("Writing reference range data for sample id = {}, name = {}", sampleId, sampleName);
+                refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, true, projectID, datasetName, storeCompressedReferences);
+            } else {
+                logger.info("*Not* writing reference range data for sample id = {}, name = {} but instantiating ref creator for ploidy writing", sampleId, sampleName);
+                refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, false, projectID, datasetName, storeCompressedReferences);
+            }
 
-            // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
-            // directly. If we're not ingesting references, we can't compute and ingest ploidy either
-            logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
-            samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
+            if (ploidyRowsExist == Boolean.FALSE) {
+                // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
+                // directly. If we're not ingesting references, we can't compute and ingest ploidy either
+                logger.info("Writing ploidy data for sample id = {}, name = {}", sampleId, sampleName);
+                samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
+            }
         }
 
         if (enableVet && vetRowsExist == Boolean.FALSE) {
@@ -321,11 +347,11 @@ public void onTraversalStart() {
 
         logger.info("enableReferenceRanges = {}, enableVet = {}, enableVCFHeaders = {}", enableReferenceRanges, enableVet, enableVCFHeaders);
-        logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}",
-                shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow);
+        logger.info("shouldWriteReferencesLoadedStatus = {}, shouldWriteVariantsLoadedStatus = {}, shouldWriteVCFHeadersLoadedStatus = {}, shouldWritePloidiesLoadedStatus = {}",
+                shouldWriteReferencesLoadedStatusRow, shouldWriteVariantsLoadedStatusRow, shouldWriteVCFHeadersLoadedStatusRow, shouldWritePloidiesLoadedStatusRow);
 
-        if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null &&
-                !shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow) {
+        if (refCreator == null && vetCreator == null && vcfHeaderLineScratchCreator == null && samplePloidyCreator == null &&
+                !shouldWriteReferencesLoadedStatusRow && !shouldWriteVariantsLoadedStatusRow && !shouldWriteVCFHeadersLoadedStatusRow && !shouldWritePloidiesLoadedStatusRow) {
             logger.info("No data to be written, exiting successfully.");
             System.exit(0);
         }
 
@@ -403,7 +429,7 @@ public Object onTraversalSuccess() {
             refCreator.commitData();
 
             // this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
-            // later separate their creation, throwing the ploidy stuff explicity behind a flag
+            // later separate their creation, throwing the ploidy stuff explicitly behind a flag
            if (samplePloidyCreator != null) {
                 try {
                     samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
@@ -423,6 +449,7 @@ public Object onTraversalSuccess() {
             if (shouldWriteVCFHeadersLoadedStatusRow) loadStatus.writeHeadersLoadedStatus(sampleId);
             if (shouldWriteVariantsLoadedStatusRow) loadStatus.writeVariantsLoadedStatus(sampleId);
             if (shouldWriteReferencesLoadedStatusRow) loadStatus.writeReferencesLoadedStatus(sampleId);
+            if (shouldWritePloidiesLoadedStatusRow) loadStatus.writePloidiesLoadedStatus(sampleId);
         }
 
         return 0;
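
Condensed, the new gating on the reference-range path is a function of the per-sample load-status row and the two row-existence probes: each output is written only when its *_LOADED status row is absent and no preexisting rows were found, and a RefCreator is instantiated whenever either output still needs to be produced. A minimal standalone sketch of that decision, within the enableReferenceRanges path (the class, record, and method names here are illustrative, not part of the tool):

    public class RefPathGatingSketch {
        // Illustrative only; mirrors the gating in onTraversalStart() without the tool's classes.
        record RefPathPlan(boolean writeRefRanges, boolean writePloidy, boolean needRefCreator) {}

        static RefPathPlan planRefPath(boolean referencesLoadedStatus, boolean ploidiesLoadedStatus,
                                       boolean refRangeRowsExist, boolean ploidyRowsExist) {
            // An output is (re)written only if its *_LOADED status row is absent and no preexisting rows were found.
            boolean writeRefRanges = !referencesLoadedStatus && !refRangeRowsExist;
            boolean writePloidy = !ploidiesLoadedStatus && !ploidyRowsExist;
            // A RefCreator is needed for either output, since ploidy is tallied while reference blocks are walked;
            // its writeReferenceRanges constructor flag is true only when reference ranges themselves are written.
            return new RefPathPlan(writeRefRanges, writePloidy, writeRefRanges || writePloidy);
        }

        public static void main(String[] args) {
            // Rerun of a sample whose reference ranges already landed but whose ploidy rows did not:
            System.out.println(planRefPath(true, false, false, false));
            // -> RefPathPlan[writeRefRanges=false, writePloidy=true, needRefCreator=true]
        }
    }
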
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/LoadStatus.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/LoadStatus.java
index 30461dda257..4efdad2b134 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/LoadStatus.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/LoadStatus.java
@@ -24,7 +24,7 @@ public class LoadStatus {
 
     // Preserve legacy 'STARTED' and 'FINISHED' states in the enum so this code doesn't crash if asked to convert one
     // of these states from a stringified representation.
-    private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED }
+    private enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }
 
     public static class LoadState {
         private final Set<LoadStatusValue> statusValues;
@@ -49,6 +49,10 @@ public boolean areVariantsLoaded() {
         public boolean areHeadersLoaded() {
             return statusValues.contains(LoadStatusValue.HEADERS_LOADED);
         }
+
+        public boolean arePloidiesLoaded() {
+            return statusValues.contains(LoadStatusValue.PLOIDIES_LOADED);
+        }
     }
 
     private final String projectID;
@@ -107,6 +111,10 @@ public void writeHeadersLoadedStatus(long sampleId) {
         writeLoadStatus(LoadStatusValue.HEADERS_LOADED, sampleId);
     }
 
+    public void writePloidiesLoadedStatus(long sampleId) {
+        writeLoadStatus(LoadStatusValue.PLOIDIES_LOADED, sampleId);
+    }
+
     protected void writeLoadStatus(LoadStatusValue status, long sampleId) {
         final ExponentialBackOff backoff = new ExponentialBackOff.Builder().
                 setInitialIntervalMillis(2000).
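
Keeping the legacy STARTED and FINISHED entries alongside the new PLOIDIES_LOADED value means status rows written by older code still convert from their stringified form without throwing. A self-contained sketch of that round-trip (the enum below merely mirrors the private LoadStatusValue for illustration):

    public class LoadStatusValueRoundTrip {
        // Mirrors the private production enum for illustration only.
        enum LoadStatusValue { STARTED, FINISHED, REFERENCES_LOADED, VARIANTS_LOADED, HEADERS_LOADED, PLOIDIES_LOADED }

        public static void main(String[] args) {
            // Status strings as they might come back from the load-status table, old and new.
            for (String raw : new String[]{ "STARTED", "FINISHED", "REFERENCES_LOADED", "PLOIDIES_LOADED" }) {
                // valueOf() would throw IllegalArgumentException if a legacy value were ever dropped from the enum.
                System.out.println(raw + " -> " + LoadStatusValue.valueOf(raw));
            }
        }
    }
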
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
index bfd093cb579..9b1c4eb90f0 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
@@ -85,22 +85,20 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
         coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
 
         try {
-            if (writeReferenceRanges) {
-                final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
-                switch (outputType) {
-                    case BQ:
-                        if (projectId == null || datasetName == null) {
-                            throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
-                        }
-                        refRangesWriter = new RefRangesBQWriter(projectId, datasetName,REF_RANGES_FILETYPE_PREFIX + tableNumber);
-                        break;
-                    case TSV:
-                        refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
-                        break;
-                    case AVRO:
-                        refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
-                        break;
-                }
+            final File refOutputFile = new File(outputDirectory, REF_RANGES_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + "." + outputType.toString().toLowerCase());
+            switch (outputType) {
+                case BQ:
+                    if (projectId == null || datasetName == null) {
+                        throw new UserException("Must specify project-id and dataset-name when using BQ output mode.");
+                    }
+                    refRangesWriter = new RefRangesBQWriter(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber);
+                    break;
+                case TSV:
+                    refRangesWriter = new RefRangesTsvWriter(refOutputFile.getCanonicalPath());
+                    break;
+                case AVRO:
+                    refRangesWriter = new RefRangesAvroWriter(refOutputFile.getCanonicalPath());
+                    break;
             }
         } catch (final IOException ioex) {
             throw new UserException("Could not create reference range outputs", ioex);
@@ -130,36 +128,37 @@ public void apply(VariantContext variant, List intervalsToWrite) thro
 
         setCoveredInterval(variantChr, start, end);
 
         // if we are writing ref ranges, and this is a reference block, write it!
-        if (writeReferenceRanges) {
-            if (variant.isReferenceBlock()) {
+        if (variant.isReferenceBlock()) {
-
-                // Record reference ploidy if this is not in a PAR
-                if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
-                    // create the bitset for this ploidy if it isn't there
-                    if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
-                        ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
-                    }
-                    // set the bit for this ploidy so we record having seen it
-                    Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());
-
-                    int ploidy = variant.getMaxPloidy(1);
-
-                    Long currentCount = 0L;
-                    if (ploidyCounts.containsKey(ploidy)) {
-                        currentCount = ploidyCounts.get(ploidy);
-                    }
+            // Record reference ploidy if this is not in a PAR
+            if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
+                // create the bitset for this ploidy if it isn't there
+                if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
+                    ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
+                }
+                // set the bit for this ploidy so we record having seen it
+                Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());
 
-                    // increment counts for this one and put it back
-                    ++currentCount;
-                    ploidyCounts.put(ploidy, currentCount);
+                int ploidy = variant.getMaxPloidy(1);
 
-                    ++totalRefEntries;
+                Long currentCount = 0L;
+                if (ploidyCounts.containsKey(ploidy)) {
+                    currentCount = ploidyCounts.get(ploidy);
                 }
 
+                // increment counts for this one and put it back
+                ++currentCount;
+                ploidyCounts.put(ploidy, currentCount);
+
+                ++totalRefEntries;
+            }
+        }
+
+        if (writeReferenceRanges) {
+            if (variant.isReferenceBlock()) {
                 // break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
                 int localStart = start;
-                while ( localStart <= end ) {
+                while (localStart <= end) {
                     int length = Math.min(end - localStart + 1, IngestConstants.MAX_REFERENCE_BLOCK_BASES);
                     if (storeCompressedReferences) {
                         refRangesWriter.writeCompressed(
@@ -175,12 +174,11 @@ public void apply(VariantContext variant, List intervalsToWrite) thro
                         );
                     }
 
-                    localStart = localStart + length ;
+                    localStart = localStart + length;
                 }
-
-                // Write out no-calls as a single-base GQ0 reference.
-                // UNLESS we are ignoring GQ0, in which case ignore them too.
             } else if (CreateVariantIngestFiles.isNoCall(variant) && (!this.gqStatesToIgnore.contains(GQStateEnum.ZERO))) {
+                // Write out no-calls as a single-base GQ0 reference.
+                // UNLESS we are ignoring GQ0, in which case ignore them too.
                 if (storeCompressedReferences) {
                     refRangesWriter.writeCompressed(
                             SchemaUtils.encodeCompressedRefBlock(variantChr, start, 1,
@@ -201,16 +199,18 @@
     }
 
     public void writeMissingIntervals(GenomeLocSortedSet intervalArgumentGenomeLocSortedSet) throws IOException {
-        GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
-        logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
-        logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
-        // for each block of uncovered locations
-        for (GenomeLoc genomeLoc : uncoveredIntervals) {
-            final String contig = genomeLoc.getContig();
-            // write all positions in this block
-            writeMissingPositions(
-                    SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
-                    SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
+        if (writeReferenceRanges) {
+            GenomeLocSortedSet uncoveredIntervals = intervalArgumentGenomeLocSortedSet.subtractRegions(coverageLocSortedSet);
+            logger.info("MISSING_GREP_HERE:" + uncoveredIntervals.coveredSize());
+            logger.info("MISSING_PERCENTAGE_GREP_HERE:" + (1.0 * uncoveredIntervals.coveredSize()) / intervalArgumentGenomeLocSortedSet.coveredSize());
+            // for each block of uncovered locations
+            for (GenomeLoc genomeLoc : uncoveredIntervals) {
+                final String contig = genomeLoc.getContig();
+                // write all positions in this block
+                writeMissingPositions(
+                        SchemaUtils.encodeLocation(contig, genomeLoc.getStart()),
+                        SchemaUtils.encodeLocation(contig, genomeLoc.getEnd()));
+            }
         }
     }
 
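
With the counting pulled out of the writeReferenceRanges branch, RefCreator now tallies reference ploidy for every non-PAR reference block even when reference-range rows are not being written, keeping a nested contig-to-ploidy-to-count map plus a running total. A standalone sketch of that bookkeeping (class and method names are illustrative, not the production API):

    import java.util.HashMap;
    import java.util.Map;

    public class PloidyTallySketch {
        // contig -> (observed ploidy -> number of reference entries seen at that ploidy)
        private final Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = new HashMap<>();
        private long totalRefEntries = 0;

        // Equivalent to the per-reference-block counting done in RefCreator.apply().
        void recordReferenceBlock(String contig, int ploidy) {
            ploidiesCountPerChromosome
                    .computeIfAbsent(contig, c -> new HashMap<>())
                    .merge(ploidy, 1L, Long::sum);
            ++totalRefEntries;
        }

        public static void main(String[] args) {
            PloidyTallySketch tally = new PloidyTallySketch();
            tally.recordReferenceBlock("chrX", 1);
            tally.recordReferenceBlock("chrX", 1);
            tally.recordReferenceBlock("chr1", 2);
            System.out.println(tally.ploidiesCountPerChromosome); // e.g. {chr1={2=1}, chrX={1=2}} (map order not guaranteed)
            System.out.println(tally.totalRefEntries);            // 3
        }
    }
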
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java
index 54c4bcd7429..93fabd89210 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java
@@ -6,14 +6,12 @@
 import org.broadinstitute.hellbender.exceptions.UserException;
 import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
 import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
-import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils;
 import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter;
 import org.json.JSONObject;
 
 import java.io.IOException;
-import java.util.BitSet;
 import java.util.Map;
-import java.util.Set;
 import java.util.concurrent.ExecutionException;
 
 public class SamplePloidyCreator {
@@ -52,6 +50,10 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName,
         }
     }
 
+    public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, Long sampleId) {
+        if (outputType != CommonCode.OutputType.BQ) return false;
+        return BigQueryUtils.doRowsExistFor(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
+    }
 
     public void apply(Map<String, Map<Integer, Long>> ploidyData, long totalRefEntries) throws IOException {
         for (final Map.Entry<String, Map<Integer, Long>> ploidyLine : ploidyData.entrySet()) {
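
The new doRowsExistFor guard only means anything in BQ output mode; for TSV or Avro output it returns false, so local runs are never skipped. A rough sketch of the same existence probe issued directly against BigQuery, assuming a hypothetical sample_ploidy table and sample_id column (the production code goes through BigQueryUtils.doRowsExistFor with the SAMPLE_PLOIDY_TABLE_NAME and SchemaUtils.SAMPLE_ID_FIELD_NAME constants instead):

    import com.google.cloud.bigquery.BigQuery;
    import com.google.cloud.bigquery.BigQueryOptions;
    import com.google.cloud.bigquery.QueryJobConfiguration;
    import com.google.cloud.bigquery.TableResult;

    public class PloidyRowsExistCheck {
        // Illustrative stand-in for the BigQueryUtils.doRowsExistFor call; table and
        // column names below are placeholders, not the schema constants used by GVS.
        static boolean ploidyRowsExist(String projectId, String datasetName, long sampleId) throws InterruptedException {
            BigQuery bigquery = BigQueryOptions.newBuilder().setProjectId(projectId).build().getService();
            String sql = "SELECT 1 FROM `" + projectId + "." + datasetName + ".sample_ploidy` " +
                         "WHERE sample_id = " + sampleId + " LIMIT 1";
            TableResult result = bigquery.query(QueryJobConfiguration.newBuilder(sql).build());
            // Any returned row means ploidy data for this sample was already ingested.
            return result.getTotalRows() > 0;
        }
    }
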