Skip to content

Commit a08e5f2

Browse files
committed
Merge Work around colons in reference names (PR #708)
2 parents 6e6cdfc + d26300e commit a08e5f2

File tree

13 files changed

+619
-146
lines changed

13 files changed

+619
-146
lines changed

Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ BUILT_TEST_PROGRAMS = \
8282
test/test-vcf-sweep \
8383
test/test-bcf-sr \
8484
test/fuzz/hts_open_fuzzer.o \
85-
test/test-bcf-translate
85+
test/test-bcf-translate \
86+
test/test-parse-reg
8687

8788
BUILT_THRASH_PROGRAMS = \
8889
test/thrash_threads1 \
@@ -377,6 +378,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
377378
test/fieldarith test/fieldarith.sam
378379
test/hfile
379380
test/test_bgzf test/bgziptest.txt
381+
test/test-parse-reg -t test/colons.bam
380382
cd test/tabix && ./test-tabix.sh tabix.tst
381383
cd test/mpileup && ./test-pileup.sh mpileup.tst
382384
REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq
@@ -413,6 +415,9 @@ test/test_realn: test/test_realn.o libhts.a
413415
test/test-regidx: test/test-regidx.o libhts.a
414416
$(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread
415417

418+
test/test-parse-reg: test/test-parse-reg.o libhts.a
419+
$(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread
420+
416421
test/test_view: test/test_view.o libhts.a
417422
$(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread
418423

@@ -439,6 +444,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h)
439444
test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h)
440445
test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h)
441446
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
447+
test/test-parse-reg.o: test/test-parse-reg.c $(htslib_hts_h) $(htslib_sam_h)
442448
test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
443449
test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h)
444450
test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h)

NEWS

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ Noteworthy changes in release a.b
4040
to create hts_reglist_t region lists from `chr:<from>-<to>` type region
4141
specifiers.
4242

43+
* An improved region parsing function hts_parse_region() is added. This
44+
is designed to help deal with ambiguities that arise when reference names
45+
contain colons followed by numbers. It does this by checking possible
46+
names against the known list of references in the input file. When even
47+
this method fails (for example when names "chr1" and "chr1:100-200" both
48+
exist) the reference name part can be quoted using curly braces. Thus
49+
"{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
50+
51+
Specialisations of this function sam_parse_region() and fai_parse_region()
52+
are also added for SAM/BAM/CRAM files and fasta/fastq files respectively.
53+
HTSlib iterator functions that accept a region specifier have been
54+
upgraded to use the new interface.
55+
4356
* hts_idx_load() and hts_open() have been changed to support indexes
4457
which are not stored in the same location as the indexed file. This works
4558
by appending a delimiter '##idx##' and the index file location to the

faidx.c

Lines changed: 42 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */
4343
#include "hts_internal.h"
4444

4545
typedef struct {
46+
int id; // faidx_t->name[id] is for this struct.
4647
uint32_t line_len, line_blen;
4748
uint64_t len;
4849
uint64_t seq_offset;
@@ -62,6 +63,13 @@ struct __faidx_t {
6263
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
6364
#endif
6465

66+
static int fai_name2id(void *v, const char *ref)
67+
{
68+
faidx_t *fai = (faidx_t *)v;
69+
khint_t k = kh_get(s, fai->hash, ref);
70+
return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
71+
}
72+
6573
static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
6674
{
6775
if (!name) {
@@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len,
8997
}
9098
idx->name = tmp;
9199
}
100+
v->id = idx->n;
92101
idx->name[idx->n++] = name_key;
93102
v->len = len;
94103
v->line_len = line_len;
@@ -684,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
684693

685694

686695
static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
687-
uint64_t offset, long beg, long end, int *len) {
696+
uint64_t offset, int64_t beg, int64_t end, int *len) {
688697
char *s;
689698
size_t l;
690699
int c = 0;
691-
int ret = bgzf_useek(fai->bgzf,
692-
offset
693-
+ beg / val->line_blen * val->line_len
694-
+ beg % val->line_blen, SEEK_SET);
700+
int ret;
701+
702+
if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
703+
hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
704+
*len = -1;
705+
return NULL;
706+
}
707+
708+
ret = bgzf_useek(fai->bgzf,
709+
offset
710+
+ beg / val->line_blen * val->line_len
711+
+ beg % val->line_blen, SEEK_SET);
695712

696713
if (ret < 0) {
697714
*len = -1;
@@ -721,85 +738,32 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
721738
return s;
722739
}
723740

724-
725-
static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) {
726-
char *s, *ep;
727-
size_t i, l, k, name_end;
741+
static int fai_get_val(const faidx_t *fai, const char *str,
742+
int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) {
728743
khiter_t iter;
729744
khash_t(s) *h;
730-
long beg, end;
745+
int id;
746+
int64_t beg, end;
731747

732-
beg = end = -1;
733-
h = fai->hash;
734-
name_end = l = strlen(str);
735-
s = (char*)malloc(l+1);
736-
if (!s) {
737-
*len = -1;
748+
if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
749+
hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
750+
*len = -2;
738751
return 1;
739752
}
740753

741-
// remove space
742-
for (i = k = 0; i < l; ++i)
743-
if (!isspace_c(str[i])) s[k++] = str[i];
744-
s[k] = 0;
745-
name_end = l = k;
746-
// determine the sequence name
747-
for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
748-
if (i > 0) name_end = i - 1;
749-
if (name_end < l) { // check if this is really the end
750-
int n_hyphen = 0;
751-
for (i = name_end + 1; i < l; ++i) {
752-
if (s[i] == '-') ++n_hyphen;
753-
else if (!isdigit_c(s[i]) && s[i] != ',') break;
754-
}
755-
if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
756-
s[name_end] = 0;
757-
iter = kh_get(s, h, s);
758-
if (iter == kh_end(h)) { // cannot find the sequence name
759-
iter = kh_get(s, h, str); // try str as the name
760-
if (iter != kh_end(h)) {
761-
s[name_end] = ':';
762-
name_end = l;
763-
}
764-
}
765-
} else iter = kh_get(s, h, str);
766-
if(iter == kh_end(h)) {
767-
hts_log_warning("Reference %s not found in file, returning empty sequence", str);
768-
free(s);
754+
h = fai->hash;
755+
iter = kh_get(s, h, faidx_iseq(fai, id));
756+
if (!iter) {
757+
// should have already been caught above
758+
abort();
769759
*len = -2;
770760
return 1;
771761
}
772762
*val = kh_value(h, iter);
773-
// parse the interval
774-
if (name_end < l) {
775-
int save_errno = errno;
776-
errno = 0;
777-
for (i = k = name_end + 1; i < l; ++i)
778-
if (s[i] != ',') s[k++] = s[i];
779-
s[k] = 0;
780-
if (s[name_end + 1] == '-') {
781-
beg = 0;
782-
i = name_end + 2;
783-
} else {
784-
beg = strtol(s + name_end + 1, &ep, 10);
785-
for (i = ep - s; i < k;) if (s[i++] == '-') break;
786-
}
787-
end = i < k? strtol(s + i, &ep, 10) : val->len;
788-
if (beg > 0) --beg;
789-
// Check for out of range numbers. Only going to be a problem on
790-
// 32-bit platforms with >2Gb sequence length.
791-
if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) {
792-
hts_log_error("Positions in range %s are too large for this platform", s);
793-
free(s);
794-
*len = -3;
795-
return 1;
796-
}
797-
errno = save_errno;
798-
} else beg = 0, end = val->len;
763+
799764
if (beg >= val->len) beg = val->len;
800765
if (end >= val->len) end = val->len;
801766
if (beg > end) beg = end;
802-
free(s);
803767

804768
*fbeg = beg;
805769
*fend = end;
@@ -811,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *
811775
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
812776
{
813777
faidx1_t val;
814-
long beg, end;
778+
int64_t beg, end;
815779

816780
if (fai_get_val(fai, str, len, &val, &beg, &end)) {
817781
return NULL;
@@ -824,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)
824788

825789
char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
826790
faidx1_t val;
827-
long beg, end;
791+
int64_t beg, end;
828792

829793
if (fai_get_val(fai, str, len, &val, &beg, &end)) {
830794
return NULL;
@@ -924,3 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq)
924888
return 1;
925889
}
926890

891+
const char *fai_parse_region(const faidx_t *fai, const char *s,
892+
int *tid, int64_t *beg, int64_t *end, int flags)
893+
{
894+
return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
895+
}

0 commit comments

Comments
 (0)