Skip to content

Commit 7e3234e

Browse files
committed
Fixed CRAM decode on ultra-long cigar.
We can now have many cigar ops (eg ref-skips) that combine to more than 512Mb. This isn't a full fix for long references as CRAM sequence positions are still 32-bit. However this is a starting point down that road and there are still sizes between 28-bit and 32-bit where the CIGAR failed to operate correctly.
1 parent d256bed commit 7e3234e

File tree

2 files changed

+65
-49
lines changed

2 files changed

+65
-49
lines changed

cram/cram_decode.c

Lines changed: 61 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,25 @@ static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_
10871087
return -1;
10881088
}
10891089

1090+
static int add_cigar(cram_slice *s, uint64_t cig_len, int cig_op) {
1091+
int nlen = 1 + cig_len/(1<<28);
1092+
if (s->ncigar + nlen >= s->cigar_alloc) {
1093+
s->cigar_alloc = (s->ncigar + nlen) < 1024 ? 1024 : (s->ncigar + nlen)*2;
1094+
uint32_t *c2 = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar));
1095+
if (!c2)
1096+
return -1;
1097+
s->cigar = c2;
1098+
}
1099+
1100+
while (cig_len) {
1101+
int sub_len = cig_len < (1<<28) ? cig_len : (1<<28)-1;
1102+
s->cigar[s->ncigar++] = (sub_len<<4) + cig_op;
1103+
cig_len -= sub_len;
1104+
}
1105+
1106+
return 0;
1107+
}
1108+
10901109
/*
10911110
* Internal part of cram_decode_slice().
10921111
* Generates the sequence, quality and cigar components.
@@ -1097,13 +1116,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
10971116
int has_MD, int has_NM) {
10981117
int prev_pos = 0, f, r = 0, out_sz = 1;
10991118
int seq_pos = 1;
1100-
int cig_len = 0;
1119+
uint64_t cig_len = 0;
11011120
int64_t ref_pos = cr->apos;
11021121
int32_t fn, i32;
11031122
enum cigar_op cig_op = BAM_CMATCH;
1104-
uint32_t *cigar = s->cigar;
1105-
uint32_t ncigar = s->ncigar;
1106-
uint32_t cigar_alloc = s->cigar_alloc;
11071123
uint32_t nm = 0;
11081124
int32_t md_dist = 0;
11091125
int orig_aux = 0;
@@ -1134,7 +1150,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
11341150
}
11351151

11361152
ref_pos--; // count from 0
1137-
cr->cigar = ncigar;
1153+
cr->cigar = s->ncigar;
11381154

11391155
if (!(ds & (CRAM_FC | CRAM_FP)))
11401156
goto skip_cigar;
@@ -1143,13 +1159,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
11431159
int32_t pos = 0;
11441160
char op;
11451161

1146-
if (ncigar+2 >= cigar_alloc) {
1147-
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1148-
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1149-
return -1;
1150-
s->cigar = cigar;
1151-
}
1152-
11531162
if (ds & CRAM_FC) {
11541163
if (!c->comp_hdr->codecs[DS_FC]) return -1;
11551164
r |= c->comp_hdr->codecs[DS_FC]->decode(s,
@@ -1231,13 +1240,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
12311240
}
12321241
#ifdef USE_X
12331242
if (cig_len && cig_op != BAM_CBASE_MATCH) {
1234-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1243+
if (add_cigar(s, cig_len, cig_op) < 0)
1244+
goto err;
12351245
cig_len = 0;
12361246
}
12371247
cig_op = BAM_CBASE_MATCH;
12381248
#else
12391249
if (cig_len && cig_op != BAM_CMATCH) {
1240-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1250+
if (add_cigar(s, cig_len, cig_op) < 0)
1251+
goto err;
12411252
cig_len = 0;
12421253
}
12431254
cig_op = BAM_CMATCH;
@@ -1258,7 +1269,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
12581269
int have_sc = 0;
12591270

12601271
if (cig_len) {
1261-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1272+
if (add_cigar(s, cig_len, cig_op) < 0)
1273+
goto err;
12621274
cig_len = 0;
12631275
}
12641276
switch (CRAM_MAJOR_VERS(fd->version)) {
@@ -1297,7 +1309,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
12971309
}
12981310
if (have_sc) {
12991311
if (r) return r;
1300-
cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP;
1312+
if (add_cigar(s, out_sz2, BAM_CSOFT_CLIP) < 0)
1313+
goto err;
13011314
cig_op = BAM_CSOFT_CLIP;
13021315
seq_pos += out_sz2;
13031316
}
@@ -1308,7 +1321,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
13081321
unsigned char base;
13091322
#ifdef USE_X
13101323
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1311-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1324+
if (add_cigar(s, cig_len, cig_op) < 0)
1325+
goto err;
13121326
cig_len = 0;
13131327
}
13141328
if (ds & CRAM_BS) {
@@ -1323,7 +1337,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
13231337
#else
13241338
int ref_base;
13251339
if (cig_len && cig_op != BAM_CMATCH) {
1326-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1340+
if (add_cigar(s, cig_len, cig_op) < 0)
1341+
goto err;
13271342
cig_len = 0;
13281343
}
13291344
if (ds & CRAM_BS) {
@@ -1365,7 +1380,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
13651380

13661381
case 'D': { // Deletion; DL
13671382
if (cig_len && cig_op != BAM_CDEL) {
1368-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1383+
if (add_cigar(s, cig_len, cig_op) < 0)
1384+
goto err;
13691385
cig_len = 0;
13701386
}
13711387
if (ds & CRAM_DL) {
@@ -1419,7 +1435,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
14191435
int32_t out_sz2 = 1;
14201436

14211437
if (cig_len && cig_op != BAM_CINS) {
1422-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1438+
if (add_cigar(s, cig_len, cig_op) < 0)
1439+
goto err;
14231440
cig_len = 0;
14241441
}
14251442

@@ -1441,7 +1458,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
14411458

14421459
case 'i': { // Insertion (single base); BA
14431460
if (cig_len && cig_op != BAM_CINS) {
1444-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1461+
if (add_cigar(s, cig_len, cig_op) < 0)
1462+
goto err;
14451463
cig_len = 0;
14461464
}
14471465
if (ds & CRAM_BA) {
@@ -1463,7 +1481,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
14631481
int32_t len = 1;
14641482

14651483
if (cig_len && cig_op != BAM_CMATCH) {
1466-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1484+
if (add_cigar(s, cig_len, cig_op) < 0)
1485+
goto err;
14671486
cig_len = 0;
14681487
}
14691488

@@ -1513,7 +1532,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
15131532
int32_t len = 1;
15141533

15151534
if (cig_len && cig_op != BAM_CMATCH) {
1516-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1535+
if (add_cigar(s, cig_len, cig_op) < 0)
1536+
goto err;
15171537
cig_len = 0;
15181538
}
15191539

@@ -1537,12 +1557,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
15371557
case 'B': { // Read base; BA, QS
15381558
#ifdef USE_X
15391559
if (cig_len && cig_op != BAM_CBASE_MISMATCH) {
1540-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1560+
if (add_cigar(s, cig_len, cig_op) < 0)
1561+
goto err;
15411562
cig_len = 0;
15421563
}
15431564
#else
15441565
if (cig_len && cig_op != BAM_CMATCH) {
1545-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1566+
if (add_cigar(s, cig_len, cig_op) < 0)
1567+
goto err;
15461568
cig_len = 0;
15471569
}
15481570
#endif
@@ -1601,7 +1623,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
16011623

16021624
case 'H': { // hard clip; HC
16031625
if (cig_len && cig_op != BAM_CHARD_CLIP) {
1604-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1626+
if (add_cigar(s, cig_len, cig_op) < 0)
1627+
goto err;
16051628
cig_len = 0;
16061629
}
16071630
if (ds & CRAM_HC) {
@@ -1618,7 +1641,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
16181641

16191642
case 'P': { // padding; PD
16201643
if (cig_len && cig_op != BAM_CPAD) {
1621-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1644+
if (add_cigar(s, cig_len, cig_op) < 0)
1645+
goto err;
16221646
cig_len = 0;
16231647
}
16241648
if (ds & CRAM_PD) {
@@ -1635,7 +1659,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
16351659

16361660
case 'N': { // Ref skip; RS
16371661
if (cig_len && cig_op != BAM_CREF_SKIP) {
1638-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1662+
if (add_cigar(s, cig_len, cig_op) < 0)
1663+
goto err;
16391664
cig_len = 0;
16401665
}
16411666
if (ds & CRAM_RS) {
@@ -1722,21 +1747,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
17221747
ref_pos += cr->len - seq_pos + 1;
17231748
}
17241749

1725-
if (ncigar+1 >= cigar_alloc) {
1726-
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1727-
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1728-
return -1;
1729-
s->cigar = cigar;
1730-
}
17311750
#ifdef USE_X
17321751
if (cig_len && cig_op != BAM_CBASE_MATCH) {
1733-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1752+
if (add_cigar(s, cig_len, cig_op) < 0)
1753+
goto err;
17341754
cig_len = 0;
17351755
}
17361756
cig_op = BAM_CBASE_MATCH;
17371757
#else
17381758
if (cig_len && cig_op != BAM_CMATCH) {
1739-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1759+
if (add_cigar(s, cig_len, cig_op) < 0)
1760+
goto err;
17401761
cig_len = 0;
17411762
}
17421763
cig_op = BAM_CMATCH;
@@ -1752,17 +1773,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
17521773
}
17531774

17541775
if (cig_len) {
1755-
if (ncigar >= cigar_alloc) {
1756-
cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024;
1757-
if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar))))
1758-
return -1;
1759-
s->cigar = cigar;
1760-
}
1761-
1762-
cigar[ncigar++] = (cig_len<<4) + cig_op;
1776+
if (add_cigar(s, cig_len, cig_op) < 0)
1777+
goto err;
17631778
}
17641779

1765-
cr->ncigar = ncigar - cr->cigar;
1780+
cr->ncigar = s->ncigar - cr->cigar;
17661781
cr->aend = ref_pos;
17671782

17681783
//printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos);
@@ -1787,10 +1802,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
17871802
}
17881803
}
17891804

1790-
s->cigar = cigar;
1791-
s->cigar_alloc = cigar_alloc;
1792-
s->ncigar = ncigar;
1793-
17941805
if (cr->cram_flags & CRAM_FLAG_NO_SEQ)
17951806
cr->len = 0;
17961807

@@ -1834,6 +1845,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s,
18341845
hts_log_error("CRAM CIGAR extends beyond slice reference extents");
18351846
return -1;
18361847

1848+
err:
18371849
block_err:
18381850
return -1;
18391851
}

cram/cram_encode.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2773,6 +2773,10 @@ static int process_one_read(cram_fd *fd, cram_container *c,
27732773

27742774
c->num_bases += cr->len;
27752775
cr->apos = bam_pos(b)+1;
2776+
if (cr->apos >= UINT32_MAX) {
2777+
hts_log_error("Sequence position out of valid range");
2778+
goto err;
2779+
}
27762780
if (c->pos_sorted) {
27772781
if (cr->apos < s->last_apos) {
27782782
c->pos_sorted = 0;

0 commit comments

Comments
 (0)