@@ -552,22 +552,29 @@ class _BamOrSam:
552552 Helper class to set the metadata common to sam and bam files
553553 """
554554
555+ max_references = 100000
556+
def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
    """Populate the SAM/BAM metadata of ``dataset`` from its alignment header.

    The file returned by ``dataset.get_file_name()`` is opened with
    ``pysam.AlignmentFile`` and the following metadata attributes are set:

    * ``reference_names``, ``reference_lengths``, ``bam_header`` and
      ``read_groups`` -- only when the file has at most
      ``self.max_references`` references;
    * ``sort_order`` and ``bam_version`` -- always, taken from the ``SO``
      and ``VN`` fields of the ``HD`` header record (``None`` when absent);
    * ``metadata_incomplete`` -- ``True`` when the per-reference metadata
      was skipped or when reading the file failed, ``False`` otherwise.

    :param dataset: dataset whose ``metadata`` attributes are written.
    :param overwrite: accepted for interface compatibility; not used here.
    :param kwd: accepted for interface compatibility; not used here.
    :returns: ``None``. Exceptions raised while reading are swallowed.
    """
    try:
        # NOTE(review): check_sq=False is presumably there so that files
        # without @SQ header lines can still be opened -- confirm against
        # the pysam documentation.
        with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
            # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
            too_many_references = bam_file.nreferences > self.max_references
            if not too_many_references:
                dataset.metadata.reference_names = list(bam_file.references)
                dataset.metadata.reference_lengths = list(bam_file.lengths)
                dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
                dataset.metadata.read_groups = [
                    read_group["ID"]
                    for read_group in dataset.metadata.bam_header.get("RG", [])
                    if "ID" in read_group
                ]
            # Look the HD record up once instead of once per field.
            hd_record = bam_file.header.get("HD", {})  # type: ignore [attr-defined]
            dataset.metadata.sort_order = hd_record.get("SO", None)
            dataset.metadata.bam_version = hd_record.get("VN", None)
            # Assign the flag explicitly so a successful re-run clears a
            # stale True left behind by an earlier failed or partial run.
            dataset.metadata.metadata_incomplete = too_many_references
    except Exception:
        # Per Dan, don't log here because doing so will cause datasets that
        # fail metadata to end in the error state
        dataset.metadata.metadata_incomplete = True
571578
572579
573580class BamNative (CompressedArchive , _BamOrSam ):
@@ -656,6 +663,16 @@ class BamNative(CompressedArchive, _BamOrSam):
656663 optional = True ,
657664 no_value = {},
658665 )
666+ MetadataElement (
667+ name = "metadata_incomplete" ,
668+ default = False ,
669+ desc = "Indicates if metadata is incomplete" ,
670+ param = MetadataParameter ,
671+ readonly = True ,
672+ visible = False ,
673+ optional = True ,
674+ no_value = False ,
675+ )
659676
660677 def set_meta (self , dataset : DatasetProtocol , overwrite : bool = True , ** kwd ) -> None :
661678 _BamOrSam ().set_meta (dataset , overwrite = overwrite , ** kwd )
@@ -1054,7 +1071,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
10541071 """
10551072 # The best way to ensure that BAM files are coordinate-sorted and indexable
10561073 # is to actually index them.
1057- with pysam .AlignmentFile (filename = file_name ) as f :
1074+ with pysam .AlignmentFile (filename = file_name , check_sq = False ) as f :
10581075 # The only sure thing we know here is that the sort order can't be coordinate
10591076 return f .header .get ("HD" , {}).get ("SO" ) == "coordinate" # type: ignore[attr-defined]
10601077
@@ -1074,7 +1091,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
10741091 """
10751092 # The best way to ensure that BAM files are coordinate-sorted and indexable
10761093 # is to actually index them.
1077- with pysam .AlignmentFile (filename = file_name ) as f :
1094+ with pysam .AlignmentFile (filename = file_name , check_sq = False ) as f :
10781095 return f .header .get ("HD" , {}).get ("SO" ) != "queryname" # type: ignore[attr-defined]
10791096
10801097
0 commit comments