Skip to content

Commit a914023

Browse files
committed
Fix sam_hdr_dup to cope with long refs.
The h->sdict hash is used to track references that are > 4Gb in size. The dup code didn't copy this. This manifested itself as CRAM SQ headers being truncated (read SAM hdr, dup, write as CRAM hdr). To fix this a function was written that creates or updates the sdict from the hrecs parsed header structs. It's possible this should be called directly from the sam_hdr_create function (part of the SAM format parser) instead of manually keeping track of sdict itself, however doing so would require initialising the new header structs so I haven't done this. This is a general utility, so perhaps should be made a public part of the header API. However IMO the new header API should hide this nuance away and just return the correct data, also ensuring that header updates work correctly and honour the text form. Since c83c9e2 the header API also was using the 32-bit capped target_len in preference to the parsed text from SQ LN fields when they differed. I am assuming this was a decision in what takes priority in BAM where the sequence names and lengths exist in both text and binary form. This commit reverses this and makes the text form always take priority. As this is at least required in some scenarios (long references) it seems easier to simply make it apply in all scenarios.
1 parent 7e3234e commit a914023

File tree

3 files changed

+65
-28
lines changed

3 files changed

+65
-28
lines changed

header.c

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,6 @@ typedef khash_t(rm) rmhash_t;
4747
static int sam_hdr_link_pg(sam_hdr_t *bh);
4848

4949
static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap);
50-
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...);
5150

5251

5352
#define MAX_ERROR_QUOTE 320 // Prevent over-long error messages
@@ -182,14 +181,10 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
182181
if (hrecs->ref[nref].ty == NULL) {
183182
// Attach header line to existing stub entry.
184183
hrecs->ref[nref].ty = h_type;
185-
// Check lengths match; correct if not.
186-
if (len != hrecs->ref[nref].len) {
187-
char tmp[32];
188-
snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
189-
hrecs->ref[nref].len);
190-
if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
191-
return -1;
192-
}
184+
// The old hrecs length may be incorrect as it is initially
185+
// populated from h->target_len which has a max 32-bit size.
186+
// We always take our latest data as canonical.
187+
hrecs->ref[nref].len = len;
193188
if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0)
194189
return -1;
195190

@@ -2485,25 +2480,6 @@ int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) {
24852480
return 0;
24862481
}
24872482

2488-
/*
2489-
* Adds or updates tag key,value pairs in a header line.
2490-
* Eg for adding M5 tags to @SQ lines or updating sort order for the
2491-
* @HD line.
2492-
*
2493-
* Specify multiple key,value pairs ending in NULL.
2494-
*
2495-
* Returns 0 on success
2496-
* -1 on failure
2497-
*/
2498-
static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) {
2499-
va_list args;
2500-
int res;
2501-
va_start(args, type);
2502-
res = sam_hrecs_vupdate(hrecs, type, args);
2503-
va_end(args);
2504-
return res;
2505-
}
2506-
25072483
/*
25082484
* Looks for a specific key in a single sam header line identified by *type.
25092485
* If prev is non-NULL it also fills this out with the previous tag, to

sam.c

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,53 @@ void sam_hdr_destroy(sam_hdr_t *bh)
131131
free(bh);
132132
}
133133

134+
/*
135+
* Create or update the header sdict field from the new
136+
* parsed hrecs data. This is important for sam_hdr_dup or
137+
* when reading / creating a new header.
138+
*
139+
* Returns 0 on success,
140+
* NULL on failure
141+
*/
142+
static int sam_hdr_update_sdict(sam_hdr_t *h) {
143+
int i;
144+
khash_t(s2i) *long_refs = h->sdict;
145+
146+
if (!h->hrecs)
147+
return 0;
148+
149+
for (i = 0; i < h->n_targets; i++) {
150+
hts_pos_t len = sam_hdr_tid2len(h, i);
151+
const char *name = sam_hdr_tid2name(h, i);
152+
khint_t k;
153+
if (len < UINT32_MAX) {
154+
if (!long_refs)
155+
continue;
156+
157+
// Check if we have an old length, if so remove it.
158+
k = kh_get(s2i, long_refs, name);
159+
if (k < kh_end(long_refs))
160+
kh_del(s2i, long_refs, k);
161+
}
162+
163+
if (!long_refs) {
164+
if (!(h->sdict = long_refs = kh_init(s2i)))
165+
goto err;
166+
}
167+
168+
// Add / update length
169+
int absent;
170+
k = kh_put(s2i, long_refs, name, &absent);
171+
if (absent < 0)
172+
goto err;
173+
kh_val(long_refs, k) = len;
174+
}
175+
return 0;
176+
177+
err:
178+
return -1;
179+
}
180+
134181
sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
135182
{
136183
if (h0 == NULL) return NULL;
@@ -179,6 +226,14 @@ sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0)
179226
h->text[h->l_text] = '\0';
180227
}
181228

229+
if (h0->sdict) {
230+
// We have a reason to use new API, so build it so we
231+
// can replicate sdict.
232+
if (sam_hdr_fill_hrecs(h) < 0 ||
233+
sam_hdr_update_sdict(h) < 0)
234+
goto fail;
235+
}
236+
182237
return h;
183238

184239
fail:

test/test.pl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -645,6 +645,12 @@ sub test_view
645645
testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz";
646646
testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_";
647647

648+
# CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for
649+
# checking SQ headers only.)
650+
# testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam";
651+
# testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram";
652+
# testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_";
653+
648654
# Build index and compare with on-the-fly one made earlier.
649655
test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1;
650656

0 commit comments

Comments
 (0)