Skip to content

Commit 0aa5616

Browse files
authored
Merge pull request #20820 from mvdbeek/limit_sam_metadata
[25.0] Skip sam metadata if we have too many references
2 parents 7bcdd37 + 118bf71 commit 0aa5616

File tree

1 file changed: 30 additions and 13 deletions

1 file changed

+30
-13
lines changed

lib/galaxy/datatypes/binary.py

Lines changed: 30 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -552,22 +552,29 @@ class _BamOrSam:
552552
Helper class to set the metadata common to sam and bam files
553553
"""
554554

555+
max_references = 100000
556+
555557
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
556558
try:
557-
bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb")
558-
# TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
559-
dataset.metadata.reference_names = list(bam_file.references)
560-
dataset.metadata.reference_lengths = list(bam_file.lengths)
561-
dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined]
562-
dataset.metadata.read_groups = [
563-
read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group
564-
]
565-
dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None)
566-
dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None)
559+
with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
560+
# TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
561+
if bam_file.nreferences <= self.max_references:
562+
dataset.metadata.reference_names = list(bam_file.references)
563+
dataset.metadata.reference_lengths = list(bam_file.lengths)
564+
dataset.metadata.bam_header = dict(bam_file.header.items()) # type: ignore [attr-defined]
565+
dataset.metadata.read_groups = [
566+
read_group["ID"]
567+
for read_group in dataset.metadata.bam_header.get("RG", [])
568+
if "ID" in read_group
569+
]
570+
else:
571+
dataset.metadata.metadata_incomplete = True
572+
dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None) # type: ignore [attr-defined]
573+
dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None) # type: ignore [attr-defined]
567574
except Exception:
568575
# Per Dan, don't log here because doing so will cause datasets that
569576
# fail metadata to end in the error state
570-
pass
577+
dataset.metadata.metadata_incomplete = True
571578

572579

573580
class BamNative(CompressedArchive, _BamOrSam):
@@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam):
656663
optional=True,
657664
no_value={},
658665
)
666+
MetadataElement(
667+
name="metadata_incomplete",
668+
default=False,
669+
desc="Indicates if metadata is incomplete",
670+
param=MetadataParameter,
671+
readonly=True,
672+
visible=False,
673+
optional=True,
674+
no_value=False,
675+
)
659676

660677
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
661678
_BamOrSam().set_meta(dataset, overwrite=overwrite, **kwd)
@@ -1054,7 +1071,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
10541071
"""
10551072
# The best way to ensure that BAM files are coordinate-sorted and indexable
10561073
# is to actually index them.
1057-
with pysam.AlignmentFile(filename=file_name) as f:
1074+
with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
10581075
# The only sure thing we know here is that the sort order can't be coordinate
10591076
return f.header.get("HD", {}).get("SO") == "coordinate" # type: ignore[attr-defined]
10601077

@@ -1074,7 +1091,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
10741091
"""
10751092
# The best way to ensure that BAM files are coordinate-sorted and indexable
10761093
# is to actually index them.
1077-
with pysam.AlignmentFile(filename=file_name) as f:
1094+
with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
10781095
return f.header.get("HD", {}).get("SO") != "queryname" # type: ignore[attr-defined]
10791096

10801097

0 commit comments

Comments
 (0)