Skip to content

Commit 79f5ef1

Browse files
committed
Major rewrite of hts_parse_region.
The old function (hts_parse_reg) has been put back, meaning the new API can now have the getid func pointer as a mandatory requirement. Added tests. Tweaked sam_parse_region to have the flags parameter. This is still required as bcftools and samtools use different parameters (parsing style is tool based rather than file format based).
1 parent 79cd218 commit 79f5ef1

File tree

8 files changed

+393
-60
lines changed

8 files changed

+393
-60
lines changed

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ BUILT_TEST_PROGRAMS = \
7878
test/test-vcf-api \
7979
test/test-vcf-sweep \
8080
test/test-bcf-sr \
81-
test/test-bcf-translate
81+
test/test-bcf-translate \
82+
test/test-parse-reg
8283

8384
BUILT_THRASH_PROGRAMS = \
8485
test/thrash_threads1 \
@@ -358,6 +359,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS)
358359
test/fieldarith test/fieldarith.sam
359360
test/hfile
360361
test/test_bgzf test/bgziptest.txt
362+
test/test-parse-reg -t test/colons.bam
361363
cd test/tabix && ./test-tabix.sh tabix.tst
362364
REF_PATH=: test/sam test/ce.fa test/faidx.fa
363365
test/test-regidx
@@ -384,6 +386,9 @@ test/test_realn: test/test_realn.o libhts.a
384386
test/test-regidx: test/test-regidx.o libhts.a
385387
$(CC) $(LDFLAGS) -o $@ test/test-regidx.o libhts.a $(LIBS) -lpthread
386388

389+
test/test-parse-reg: test/test-parse-reg.o libhts.a
390+
$(CC) $(LDFLAGS) -o $@ test/test-parse-reg.o libhts.a $(LIBS) -lpthread
391+
387392
test/test_view: test/test_view.o libhts.a
388393
$(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread
389394

hts.c

Lines changed: 166 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2490,6 +2490,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
24902490
if (esign == '-') e = -e;
24912491
}
24922492

2493+
switch (*s) {
2494+
case 'k': case 'K': e += 3; s++; break;
2495+
case 'm': case 'M': e += 6; s++; break;
2496+
case 'g': case 'G': e += 9; s++; break;
2497+
}
2498+
24932499
e -= decimals;
24942500
while (e > 0) n *= 10, e--;
24952501
while (e < 0) lost += n % 10, n /= 10, e++;
@@ -2501,45 +2507,90 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
25012507
if (strend) {
25022508
*strend = (char *)s;
25032509
} else if (*s) {
2504-
hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
2510+
if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ','))
2511+
hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s);
25052512
}
25062513

25072514
return (sign == '+')? n : -n;
25082515
}
25092516

2517+
static void *hts_memrchr(const void *s, int c, size_t n) {
2518+
size_t i;
2519+
unsigned char *u = (unsigned char *)s;
2520+
for (i = n; i > 0; i--) {
2521+
if (u[i-1] == c)
2522+
return u+i-1;
2523+
}
2524+
2525+
return NULL;
2526+
}
2527+
25102528
/*
25112529
* A variant of hts_parse_reg which is reference-id aware. It uses
25122530
* the iterator name2id callbacks to validate the region tokenisation works.
25132531
*
25142532
* This is necessary due to GRCh38 HLA additions which have reference names
25152533
* like "HLA-DRB1*12:17".
25162534
*
2517-
* getid is optional and may be passed in as NULL. If given it is used to
2518-
* validate the reference name exists and is unambiguously parseable. If not
2519-
* given the best guess will be made but no has guarantees in validity.
2535+
* All parameters are mandatory.
2536+
*
2537+
* To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
2538+
* are reference names, we may quote using curly braces.
2539+
* Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
2540+
*
2541+
* Flags are used to control how parsing works, and can be one of the below.
2542+
*
2543+
* HTS_PARSE_LIST:
2544+
* If present, the region is assmed to be a comma separated list and
2545+
* position parsing will not contain commas (this implicitly
2546+
* clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
2547+
* On success the return pointer will be the start of the next region, ie
2548+
* the character after the comma. (If *ret != '\0' then the caller can
2549+
* assume another region is present in the list.)
2550+
*
2551+
* If not set then positions may contain commas. In this case the return
2552+
* value should point to the end of the string, or NULL on failure.
25202553
*
2521-
* To work around these issues quoting is also permitted via {ref}:start-end.
2522-
* In this case, the return value will point to '}' and not the end of the
2523-
* reference (but this is a useful indication that it started with '{').
2554+
* HTS_PARSE_ONE_COORD:
2555+
* If present, X:100 is treated as the single base pair region X:100-100.
2556+
* In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
2557+
* (This is the standard bcftools region convention.)
2558+
*
2559+
* When not set X:100 is considered to be X:100-<end> where <end> is
2560+
* the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are
2561+
* invalid.
2562+
* (This is the standard samtools region convention.)
2563+
*
2564+
* Note the supplied string expects 1 based inclusive coordinates, but the
2565+
* returned coordinates start from 0 and are half open, so pos0 is valid
2566+
* for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
25242567
*
25252568
* On success the end of the reference is returned (colon or end of string)
25262569
* beg/end will be set, plus tid if getid has been supplied.
25272570
* On failure NULL is returned.
25282571
*/
2529-
const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
2530-
hts_name2id_f getid, void *hdr)
2572+
const char *hts_parse_region(const char *s, int *tid, int *beg, int *end,
2573+
hts_name2id_f getid, void *hdr, int flags)
25312574
{
2532-
// FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped
2533-
// reads, and strictly have NULL as failure test?
2534-
int tid_, s_len = strlen(s); // int is sufficient given beg/end types
2535-
if (!tid) tid = &tid_; // simplifies code below
2575+
if (!s || !tid || !beg || !end || !getid)
2576+
return NULL;
25362577

2537-
const char *colon = NULL;
2578+
int s_len = strlen(s); // int is sufficient given beg/end types
2579+
kstring_t ks = { 0, 0, NULL };
2580+
2581+
const char *colon = NULL, *comma = NULL;
25382582
int quoted = 0;
25392583

2584+
if (flags & HTS_PARSE_LIST)
2585+
flags &= ~HTS_PARSE_THOUSANDS_SEP;
2586+
else
2587+
flags |= HTS_PARSE_THOUSANDS_SEP;
2588+
2589+
const char *s_end = s + s_len;
2590+
25402591
// Braced quoting of references is permitted to resolve ambiguities.
25412592
if (*s == '{') {
2542-
const char *close = strrchr(s, '}');
2593+
const char *close = memchr(s, '}', s_len);
25432594
if (!close) {
25442595
hts_log_error("Mismatching braces in \"%s\"", s);
25452596
return NULL;
@@ -2549,36 +2600,56 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25492600
if (close[1] == ':')
25502601
colon = close+1;
25512602
quoted = 1; // number of trailing characters to trim
2603+
2604+
// Truncate to this item only, if appropriate.
2605+
if (flags & HTS_PARSE_LIST) {
2606+
comma = strchr(close, ',');
2607+
if (comma) {
2608+
s_len = comma-s;
2609+
s_end = comma+1;
2610+
}
2611+
}
25522612
} else {
2553-
colon = strrchr(s, ':');
2613+
// Truncate to this item only, if appropriate.
2614+
if (flags & HTS_PARSE_LIST) {
2615+
comma = strchr(s, ',');
2616+
if (comma) {
2617+
s_len = comma-s;
2618+
s_end = comma+1;
2619+
}
2620+
}
2621+
2622+
colon = hts_memrchr(s, ':', s_len);
25542623
}
25552624

2625+
// No colon is simplest case; just check and return.
25562626
if (colon == NULL) {
25572627
*beg = 0; *end = INT_MAX;
2558-
if (getid) {
2559-
kstring_t ks = { 0, 0, NULL };
2560-
kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
2561-
if (!ks.s) {
2562-
*tid = -1;
2563-
return NULL;
2564-
}
2565-
2566-
*tid = getid(hdr, ks.s);
2567-
free(ks.s);
2568-
} else {
2569-
*tid = 0;
2628+
kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
2629+
if (!ks.s) {
2630+
*tid = -1;
2631+
return NULL;
25702632
}
2571-
return *tid >= 0 ? s + s_len : NULL;
2633+
2634+
*tid = getid(hdr, ks.s);
2635+
free(ks.s);
2636+
2637+
return *tid >= 0 ? s_end : NULL;
25722638
}
25732639

25742640
// Has a colon, but check whole name first.
2575-
if (!quoted && getid) {
2641+
if (!quoted) {
25762642
*beg = 0; *end = INT_MAX;
2577-
if ((*tid = getid(hdr, s)) >= 0) {
2643+
kputsn(s, s_len, &ks); // convert to nul terminated string
2644+
if (!ks.s) {
2645+
*tid = -1;
2646+
return NULL;
2647+
}
2648+
if ((*tid = getid(hdr, ks.s)) >= 0) {
25782649
// Entire name matches, but also check this isn't
25792650
// ambiguous. eg we have ref chr1 and ref chr1:100-200
25802651
// both present.
2581-
kstring_t ks = { 0, 0, NULL };
2652+
ks.l = 0;
25822653
kputsn(s, colon-s, &ks); // convert to nul terminated string
25832654
if (!ks.s) {
25842655
*tid = -1;
@@ -2594,39 +2665,81 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25942665
}
25952666
free(ks.s);
25962667

2597-
return s + s_len;
2668+
return s_end;
25982669
}
25992670
}
26002671

2601-
char *hyphen;
2602-
*beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2603-
if (*beg < 0) *beg = 0;
2604-
2605-
if (*hyphen == '\0') *end = INT_MAX;
2606-
else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2607-
else return NULL;
2672+
// Quoted, or unquoted and whole string isn't a name.
2673+
// Check the pre-colon part is valid.
2674+
ks.l = 0;
2675+
kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
2676+
if (!ks.s) {
2677+
*tid = -1;
2678+
return NULL;
2679+
}
2680+
*tid = getid(hdr, ks.s);
2681+
free(ks.s);
2682+
if (*tid < 0)
2683+
return NULL;
26082684

2609-
if (*beg >= *end) return NULL;
2610-
if (getid) {
2611-
kstring_t ks = { 0, 0, NULL };
2612-
kputsn(s, colon-s-quoted, &ks); // convert to nul terminated string
2613-
if (!ks.s) {
2614-
*tid = -1;
2685+
// Finally parse the post-colon coordinates
2686+
char *hyphen;
2687+
*beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
2688+
if (*beg < 0) {
2689+
if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
2690+
// interpret chr:-100 as chr:1-100
2691+
*end = *beg==-1 ? INT_MAX : -(*beg+1);
2692+
*beg = 0;
2693+
return s_end;
2694+
} else if (*hyphen == '-') {
2695+
*beg = 0;
2696+
} else {
2697+
hts_log_error("Unexpected string \"%s\" after region", hyphen);
26152698
return NULL;
26162699
}
2617-
*tid = getid(hdr, ks.s);
2618-
free(ks.s);
2619-
if (*tid < 0)
2700+
}
2701+
2702+
if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
2703+
*end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT_MAX;
2704+
} else if (*hyphen == '-') {
2705+
*end = hts_parse_decimal(hyphen+1, &hyphen, flags);
2706+
if (*hyphen != '\0' && *hyphen != ',') {
2707+
hts_log_error("Unexpected string \"%s\" after region", hyphen);
26202708
return NULL;
2709+
}
26212710
} else {
2622-
*tid = 0;
2711+
hts_log_error("Unexpected string \"%s\" after region", hyphen);
2712+
return NULL;
26232713
}
2624-
return colon;
2714+
2715+
if (*end == 0)
2716+
*end = INT_MAX; // interpret chr:100- as chr:100-<end>
2717+
2718+
if (*beg >= *end) return NULL;
2719+
2720+
return s_end;
26252721
}
26262722

2723+
// Next release we should mark this as deprecated?
2724+
// Use hts_parse_region above instead.
26272725
const char *hts_parse_reg(const char *s, int *beg, int *end)
26282726
{
2629-
return hts_parse_reg2(s, NULL, beg, end, NULL, NULL);
2727+
char *hyphen;
2728+
const char *colon = strrchr(s, ':');
2729+
if (colon == NULL) {
2730+
*beg = 0; *end = INT_MAX;
2731+
return s + strlen(s);
2732+
}
2733+
2734+
*beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1;
2735+
if (*beg < 0) *beg = 0;
2736+
2737+
if (*hyphen == '\0') *end = INT_MAX;
2738+
else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP);
2739+
else return NULL;
2740+
2741+
if (*beg >= *end) return NULL;
2742+
return colon;
26302743
}
26312744

26322745
hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
@@ -2638,7 +2751,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
26382751
else if (strcmp(reg, "*") == 0)
26392752
return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);
26402753

2641-
if (!hts_parse_reg2(reg, &tid, &beg, &end, getid, hdr))
2754+
if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
26422755
return NULL;
26432756

26442757
return itr_query(idx, tid, beg, end, readrec);

htslib/hts.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,8 @@ int hts_idx_set_meta(hts_idx_t *idx, uint32_t l_meta, uint8_t *meta, int is_copy
641641

642642

643643
#define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers
644+
#define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end
645+
#define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP)
644646

645647
/// Parse a numeric string
646648
/** The number may be expressed in scientific notation, and optionally may
@@ -666,20 +668,31 @@ long long hts_parse_decimal(const char *str, char **strend, int flags);
666668

667669
typedef int (*hts_name2id_f)(void*, const char*);
668670
typedef const char *(*hts_id2name_f)(void*, int);
671+
669672
const char *hts_parse_reg(const char *str, int *beg, int *end);
670673

674+
//typedef struct {
675+
// const char *chr_beg, *chr_end;
676+
// int tid;
677+
// int start, end; // half open, zero based
678+
//} hts_parsed_region;
679+
//const char *hts_parse_region_(const char *str, hts_parsed_region *reg,
680+
// hts_name2id_f getid, void *hdr, int flags);
681+
682+
671683
/// Parse a "CHR:START-END"-style region string
672684
/** @param str String to be parsed
673685
@param tid Set on return (if not NULL) to be reference index (-1 if invalid)
674686
@param beg Set on return to the 0-based start of the region
675687
@param end Set on return to the 1-based end of the region
676688
@param getid Function pointer. Called if not NULL to set tid.
677689
@param hdr Caller data passed to getid.
690+
@param flags Bitwise HTS_PARSE_* flags listed above.
678691
@return Pointer to the colon or '\0' after the reference sequence name,
679692
or NULL if @a str could not be parsed.
680693
*/
681-
const char *hts_parse_reg2(const char *str, int *tid, int *beg, int *end,
682-
hts_name2id_f getid, void *hdr);
694+
const char *hts_parse_region(const char *str, int *tid, int *beg, int *end,
695+
hts_name2id_f getid, void *hdr, int flags);
683696

684697
hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec);
685698
void hts_itr_destroy(hts_itr_t *iter);

htslib/sam.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ typedef struct {
275275
int bam_hdr_write(BGZF *fp, const bam_hdr_t *h) HTS_RESULT_USED;
276276
void bam_hdr_destroy(bam_hdr_t *h);
277277
int bam_name2id(bam_hdr_t *h, const char *ref);
278-
const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end);
278+
const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags);
279279
bam_hdr_t* bam_hdr_dup(const bam_hdr_t *h0);
280280

281281
bam1_t *bam_init1(void);

sam.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,8 @@ int bam_name2id(bam_hdr_t *h, const char *ref)
285285
return k == kh_end(d)? -1 : kh_val(d, k);
286286
}
287287

288-
const char *sam_parse_reg(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end) {
289-
// FIXME: do we need to also use cram_name2id?
290-
return hts_parse_reg2(s, tid, beg, end, (hts_name2id_f)bam_name2id, h);
288+
const char *sam_parse_region(bam_hdr_t *h, const char *s, int *tid, int *beg, int *end, int flags) {
289+
return hts_parse_region(s, tid, beg, end, (hts_name2id_f)bam_name2id, h, flags);
291290
}
292291

293292
/*************************

test/colons.bam

268 Bytes
Binary file not shown.

test/colons.bam.bai

424 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)