Skip to content
Merged
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,8 @@ BUILT_TEST_PROGRAMS = \
test/test-vcf-sweep \
test/test-bcf-sr \
test/fuzz/hts_open_fuzzer.o \
test/test-bcf-translate
test/test-bcf-translate \
test/test-parse-reg

BUILT_THRASH_PROGRAMS = \
test/thrash_threads1 \
Expand Down Expand Up @@ -377,6 +378,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
test/fieldarith test/fieldarith.sam
test/hfile
test/test_bgzf test/bgziptest.txt
test/test-parse-reg -t test/colons.bam
cd test/tabix && ./test-tabix.sh tabix.tst
cd test/mpileup && ./test-pileup.sh mpileup.tst
REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq
Expand Down Expand Up @@ -413,6 +415,9 @@ test/test_realn: test/test_realn.o libhts.a
test/test-regidx: test/test-regidx.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread

test/test-parse-reg: test/test-parse-reg.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread

test/test_view: test/test_view.o libhts.a
$(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread

Expand All @@ -439,6 +444,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h)
test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h)
test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h)
test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h)
test/test-parse-reg.o: test/test-parse-reg.c $(htslib_hts_h) $(htslib_sam_h)
test/test-realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h)
test/test-regidx.o: test/test-regidx.c config.h $(htslib_regidx_h) $(hts_internal_h)
test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h)
Expand Down
115 changes: 42 additions & 73 deletions faidx.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */
#include "hts_internal.h"

typedef struct {
int id; // faidx_t->name[id] is for this struct.
uint32_t line_len, line_blen;
uint64_t len;
uint64_t seq_offset;
Expand All @@ -62,6 +63,13 @@ struct __faidx_t {
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif

static int fai_name2id(void *v, const char *ref)
{
faidx_t *fai = (faidx_t *)v;
khint_t k = kh_get(s, fai->hash, ref);
return k == kh_end(fai->hash) ? -1 : kh_val(fai->hash, k).id;
}

static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len, uint32_t line_len, uint32_t line_blen, uint64_t seq_offset, uint64_t qual_offset)
{
if (!name) {
Expand Down Expand Up @@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len,
}
idx->name = tmp;
}
v->id = idx->n;
idx->name[idx->n++] = name_key;
v->len = len;
v->line_len = line_len;
Expand Down Expand Up @@ -684,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {


static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
uint64_t offset, long beg, long end, int *len) {
uint64_t offset, int64_t beg, int64_t end, int *len) {
char *s;
size_t l;
int c = 0;
int ret = bgzf_useek(fai->bgzf,
offset
+ beg / val->line_blen * val->line_len
+ beg % val->line_blen, SEEK_SET);
int ret;

if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) {
hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end);
*len = -1;
return NULL;
}

ret = bgzf_useek(fai->bgzf,
offset
+ beg / val->line_blen * val->line_len
+ beg % val->line_blen, SEEK_SET);

if (ret < 0) {
*len = -1;
Expand Down Expand Up @@ -721,85 +738,32 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
return s;
}


static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *val, long *fbeg, long *fend) {
char *s, *ep;
size_t i, l, k, name_end;
static int fai_get_val(const faidx_t *fai, const char *str,
int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) {
khiter_t iter;
khash_t(s) *h;
long beg, end;
int id;
int64_t beg, end;

beg = end = -1;
h = fai->hash;
name_end = l = strlen(str);
s = (char*)malloc(l+1);
if (!s) {
*len = -1;
if (!fai_parse_region(fai, str, &id, &beg, &end, 0)) {
hts_log_warning("Reference %s not found in FASTA file, returning empty sequence", str);
*len = -2;
return 1;
}

// remove space
for (i = k = 0; i < l; ++i)
if (!isspace_c(str[i])) s[k++] = str[i];
s[k] = 0;
name_end = l = k;
// determine the sequence name
for (i = l; i > 0; --i) if (s[i - 1] == ':') break; // look for colon from the end
if (i > 0) name_end = i - 1;
if (name_end < l) { // check if this is really the end
int n_hyphen = 0;
for (i = name_end + 1; i < l; ++i) {
if (s[i] == '-') ++n_hyphen;
else if (!isdigit_c(s[i]) && s[i] != ',') break;
}
if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name
s[name_end] = 0;
iter = kh_get(s, h, s);
if (iter == kh_end(h)) { // cannot find the sequence name
iter = kh_get(s, h, str); // try str as the name
if (iter != kh_end(h)) {
s[name_end] = ':';
name_end = l;
}
}
} else iter = kh_get(s, h, str);
if(iter == kh_end(h)) {
hts_log_warning("Reference %s not found in file, returning empty sequence", str);
free(s);
h = fai->hash;
iter = kh_get(s, h, faidx_iseq(fai, id));
if (!iter) {
// should have already been caught above
abort();
*len = -2;
return 1;
}
*val = kh_value(h, iter);
// parse the interval
if (name_end < l) {
int save_errno = errno;
errno = 0;
for (i = k = name_end + 1; i < l; ++i)
if (s[i] != ',') s[k++] = s[i];
s[k] = 0;
if (s[name_end + 1] == '-') {
beg = 0;
i = name_end + 2;
} else {
beg = strtol(s + name_end + 1, &ep, 10);
for (i = ep - s; i < k;) if (s[i++] == '-') break;
}
end = i < k? strtol(s + i, &ep, 10) : val->len;
if (beg > 0) --beg;
// Check for out of range numbers. Only going to be a problem on
// 32-bit platforms with >2Gb sequence length.
if (errno == ERANGE && (uint64_t) val->len > LONG_MAX) {
hts_log_error("Positions in range %s are too large for this platform", s);
free(s);
*len = -3;
return 1;
}
errno = save_errno;
} else beg = 0, end = val->len;

if (beg >= val->len) beg = val->len;
if (end >= val->len) end = val->len;
if (beg > end) beg = end;
free(s);

*fbeg = beg;
*fend = end;
Expand All @@ -811,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *
char *fai_fetch(const faidx_t *fai, const char *str, int *len)
{
faidx1_t val;
long beg, end;
int64_t beg, end;

if (fai_get_val(fai, str, len, &val, &beg, &end)) {
return NULL;
Expand All @@ -824,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)

char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) {
faidx1_t val;
long beg, end;
int64_t beg, end;

if (fai_get_val(fai, str, len, &val, &beg, &end)) {
return NULL;
Expand Down Expand Up @@ -924,3 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq)
return 1;
}

const char *fai_parse_region(const faidx_t *fai, const char *s,
int *tid, int64_t *beg, int64_t *end, int flags)
{
return hts_parse_region(s, tid, beg, end, (hts_name2id_f)fai_name2id, (void *)fai, flags);
}
Loading