@@ -2490,6 +2490,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
2490
2490
if (esign == '-' ) e = - e ;
2491
2491
}
2492
2492
2493
+ switch (* s ) {
2494
+ case 'k' : case 'K' : e += 3 ; s ++ ; break ;
2495
+ case 'm' : case 'M' : e += 6 ; s ++ ; break ;
2496
+ case 'g' : case 'G' : e += 9 ; s ++ ; break ;
2497
+ }
2498
+
2493
2499
e -= decimals ;
2494
2500
while (e > 0 ) n *= 10 , e -- ;
2495
2501
while (e < 0 ) lost += n % 10 , n /= 10 , e ++ ;
@@ -2501,45 +2507,90 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
2501
2507
if (strend ) {
2502
2508
* strend = (char * )s ;
2503
2509
} else if (* s ) {
2504
- hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s - str ), str , s );
2510
+ if ((flags & HTS_PARSE_THOUSANDS_SEP ) || (!(flags & HTS_PARSE_THOUSANDS_SEP ) && * s != ',' ))
2511
+ hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s - str ), str , s );
2505
2512
}
2506
2513
2507
2514
return (sign == '+' )? n : - n ;
2508
2515
}
2509
2516
2517
+ static void * hts_memrchr (const void * s , int c , size_t n ) {
2518
+ size_t i ;
2519
+ unsigned char * u = (unsigned char * )s ;
2520
+ for (i = n ; i > 0 ; i -- ) {
2521
+ if (u [i - 1 ] == c )
2522
+ return u + i - 1 ;
2523
+ }
2524
+
2525
+ return NULL ;
2526
+ }
2527
+
2510
2528
/*
2511
2529
* A variant of hts_parse_reg which is reference-id aware. It uses
2512
2530
* the iterator name2id callbacks to validate the region tokenisation works.
2513
2531
*
2514
2532
* This is necessary due to GRCh38 HLA additions which have reference names
2515
2533
* like "HLA-DRB1*12:17".
2516
2534
*
2517
- * getid is optional and may be passed in as NULL. If given it is used to
2518
- * validate the reference name exists and is unambiguously parseable. If not
2519
- * given the best guess will be made but no has guarantees in validity.
2535
+ * All parameters are mandatory.
2536
+ *
2537
+ * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200"
2538
+ * are reference names, we may quote using curly braces.
2539
+ * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example.
2540
+ *
2541
+ * Flags are used to control how parsing works, and can be one of the below.
2542
+ *
2543
+ * HTS_PARSE_LIST:
2544
+ * If present, the region is assmed to be a comma separated list and
2545
+ * position parsing will not contain commas (this implicitly
2546
+ * clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal).
2547
+ * On success the return pointer will be the start of the next region, ie
2548
+ * the character after the comma. (If *ret != '\0' then the caller can
2549
+ * assume another region is present in the list.)
2550
+ *
2551
+ * If not set then positions may contain commas. In this case the return
2552
+ * value should point to the end of the string, or NULL on failure.
2520
2553
*
2521
- * To work around these issues quoting is also permitted via {ref}:start-end.
2522
- * In this case, the return value will point to '}' and not the end of the
2523
- * reference (but this is a useful indication that it started with '{').
2554
+ * HTS_PARSE_ONE_COORD:
2555
+ * If present, X:100 is treated as the single base pair region X:100-100.
2556
+ * In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>.
2557
+ * (This is the standard bcftools region convention.)
2558
+ *
2559
+ * When not set X:100 is considered to be X:100-<end> where <end> is
2560
+ * the end of chromosome X (set to INT_MAX here). X:100- and X:-100 are
2561
+ * invalid.
2562
+ * (This is the standard samtools region convention.)
2563
+ *
2564
+ * Note the supplied string expects 1 based inclusive coordinates, but the
2565
+ * returned coordinates start from 0 and are half open, so pos0 is valid
2566
+ * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}"
2524
2567
*
2525
2568
* On success the end of the reference is returned (colon or end of string)
2526
2569
* beg/end will be set, plus tid if getid has been supplied.
2527
2570
* On failure NULL is returned.
2528
2571
*/
2529
- const char * hts_parse_reg2 (const char * s , int * tid , int * beg , int * end ,
2530
- hts_name2id_f getid , void * hdr )
2572
+ const char * hts_parse_region (const char * s , int * tid , int * beg , int * end ,
2573
+ hts_name2id_f getid , void * hdr , int flags )
2531
2574
{
2532
- // FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped
2533
- // reads, and strictly have NULL as failure test?
2534
- int tid_ , s_len = strlen (s ); // int is sufficient given beg/end types
2535
- if (!tid ) tid = & tid_ ; // simplifies code below
2575
+ if (!s || !tid || !beg || !end || !getid )
2576
+ return NULL ;
2536
2577
2537
- const char * colon = NULL ;
2578
+ int s_len = strlen (s ); // int is sufficient given beg/end types
2579
+ kstring_t ks = { 0 , 0 , NULL };
2580
+
2581
+ const char * colon = NULL , * comma = NULL ;
2538
2582
int quoted = 0 ;
2539
2583
2584
+ if (flags & HTS_PARSE_LIST )
2585
+ flags &= ~HTS_PARSE_THOUSANDS_SEP ;
2586
+ else
2587
+ flags |= HTS_PARSE_THOUSANDS_SEP ;
2588
+
2589
+ const char * s_end = s + s_len ;
2590
+
2540
2591
// Braced quoting of references is permitted to resolve ambiguities.
2541
2592
if (* s == '{' ) {
2542
- const char * close = strrchr (s , '}' );
2593
+ const char * close = memchr (s , '}' , s_len );
2543
2594
if (!close ) {
2544
2595
hts_log_error ("Mismatching braces in \"%s\"" , s );
2545
2596
return NULL ;
@@ -2549,36 +2600,56 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
2549
2600
if (close [1 ] == ':' )
2550
2601
colon = close + 1 ;
2551
2602
quoted = 1 ; // number of trailing characters to trim
2603
+
2604
+ // Truncate to this item only, if appropriate.
2605
+ if (flags & HTS_PARSE_LIST ) {
2606
+ comma = strchr (close , ',' );
2607
+ if (comma ) {
2608
+ s_len = comma - s ;
2609
+ s_end = comma + 1 ;
2610
+ }
2611
+ }
2552
2612
} else {
2553
- colon = strrchr (s , ':' );
2613
+ // Truncate to this item only, if appropriate.
2614
+ if (flags & HTS_PARSE_LIST ) {
2615
+ comma = strchr (s , ',' );
2616
+ if (comma ) {
2617
+ s_len = comma - s ;
2618
+ s_end = comma + 1 ;
2619
+ }
2620
+ }
2621
+
2622
+ colon = hts_memrchr (s , ':' , s_len );
2554
2623
}
2555
2624
2625
+ // No colon is simplest case; just check and return.
2556
2626
if (colon == NULL ) {
2557
2627
* beg = 0 ; * end = INT_MAX ;
2558
- if (getid ) {
2559
- kstring_t ks = { 0 , 0 , NULL };
2560
- kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string
2561
- if (!ks .s ) {
2562
- * tid = -1 ;
2563
- return NULL ;
2564
- }
2565
-
2566
- * tid = getid (hdr , ks .s );
2567
- free (ks .s );
2568
- } else {
2569
- * tid = 0 ;
2628
+ kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string
2629
+ if (!ks .s ) {
2630
+ * tid = -1 ;
2631
+ return NULL ;
2570
2632
}
2571
- return * tid >= 0 ? s + s_len : NULL ;
2633
+
2634
+ * tid = getid (hdr , ks .s );
2635
+ free (ks .s );
2636
+
2637
+ return * tid >= 0 ? s_end : NULL ;
2572
2638
}
2573
2639
2574
2640
// Has a colon, but check whole name first.
2575
- if (!quoted && getid ) {
2641
+ if (!quoted ) {
2576
2642
* beg = 0 ; * end = INT_MAX ;
2577
- if ((* tid = getid (hdr , s )) >= 0 ) {
2643
+ kputsn (s , s_len , & ks ); // convert to nul terminated string
2644
+ if (!ks .s ) {
2645
+ * tid = -1 ;
2646
+ return NULL ;
2647
+ }
2648
+ if ((* tid = getid (hdr , ks .s )) >= 0 ) {
2578
2649
// Entire name matches, but also check this isn't
2579
2650
// ambiguous. eg we have ref chr1 and ref chr1:100-200
2580
2651
// both present.
2581
- kstring_t ks = { 0 , 0 , NULL } ;
2652
+ ks . l = 0 ;
2582
2653
kputsn (s , colon - s , & ks ); // convert to nul terminated string
2583
2654
if (!ks .s ) {
2584
2655
* tid = -1 ;
@@ -2594,39 +2665,81 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
2594
2665
}
2595
2666
free (ks .s );
2596
2667
2597
- return s + s_len ;
2668
+ return s_end ;
2598
2669
}
2599
2670
}
2600
2671
2601
- char * hyphen ;
2602
- * beg = hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) - 1 ;
2603
- if (* beg < 0 ) * beg = 0 ;
2604
-
2605
- if (* hyphen == '\0' ) * end = INT_MAX ;
2606
- else if (* hyphen == '-' ) * end = hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2607
- else return NULL ;
2672
+ // Quoted, or unquoted and whole string isn't a name.
2673
+ // Check the pre-colon part is valid.
2674
+ ks .l = 0 ;
2675
+ kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string
2676
+ if (!ks .s ) {
2677
+ * tid = -1 ;
2678
+ return NULL ;
2679
+ }
2680
+ * tid = getid (hdr , ks .s );
2681
+ free (ks .s );
2682
+ if (* tid < 0 )
2683
+ return NULL ;
2608
2684
2609
- if (* beg >= * end ) return NULL ;
2610
- if (getid ) {
2611
- kstring_t ks = { 0 , 0 , NULL };
2612
- kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string
2613
- if (!ks .s ) {
2614
- * tid = -1 ;
2685
+ // Finally parse the post-colon coordinates
2686
+ char * hyphen ;
2687
+ * beg = hts_parse_decimal (colon + 1 , & hyphen , flags ) - 1 ;
2688
+ if (* beg < 0 ) {
2689
+ if (isdigit (* hyphen ) || * hyphen == '\0' || * hyphen == ',' ) {
2690
+ // interpret chr:-100 as chr:1-100
2691
+ * end = * beg == -1 ? INT_MAX : - (* beg + 1 );
2692
+ * beg = 0 ;
2693
+ return s_end ;
2694
+ } else if (* hyphen == '-' ) {
2695
+ * beg = 0 ;
2696
+ } else {
2697
+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
2615
2698
return NULL ;
2616
2699
}
2617
- * tid = getid (hdr , ks .s );
2618
- free (ks .s );
2619
- if (* tid < 0 )
2700
+ }
2701
+
2702
+ if (* hyphen == '\0' || ((flags & HTS_PARSE_LIST ) && * hyphen == ',' )) {
2703
+ * end = flags & HTS_PARSE_ONE_COORD ? * beg + 1 : INT_MAX ;
2704
+ } else if (* hyphen == '-' ) {
2705
+ * end = hts_parse_decimal (hyphen + 1 , & hyphen , flags );
2706
+ if (* hyphen != '\0' && * hyphen != ',' ) {
2707
+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
2620
2708
return NULL ;
2709
+ }
2621
2710
} else {
2622
- * tid = 0 ;
2711
+ hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
2712
+ return NULL ;
2623
2713
}
2624
- return colon ;
2714
+
2715
+ if (* end == 0 )
2716
+ * end = INT_MAX ; // interpret chr:100- as chr:100-<end>
2717
+
2718
+ if (* beg >= * end ) return NULL ;
2719
+
2720
+ return s_end ;
2625
2721
}
2626
2722
2723
+ // Next release we should mark this as deprecated?
2724
+ // Use hts_parse_region above instead.
2627
2725
const char * hts_parse_reg (const char * s , int * beg , int * end )
2628
2726
{
2629
- return hts_parse_reg2 (s , NULL , beg , end , NULL , NULL );
2727
+ char * hyphen ;
2728
+ const char * colon = strrchr (s , ':' );
2729
+ if (colon == NULL ) {
2730
+ * beg = 0 ; * end = INT_MAX ;
2731
+ return s + strlen (s );
2732
+ }
2733
+
2734
+ * beg = hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) - 1 ;
2735
+ if (* beg < 0 ) * beg = 0 ;
2736
+
2737
+ if (* hyphen == '\0' ) * end = INT_MAX ;
2738
+ else if (* hyphen == '-' ) * end = hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2739
+ else return NULL ;
2740
+
2741
+ if (* beg >= * end ) return NULL ;
2742
+ return colon ;
2630
2743
}
2631
2744
2632
2745
hts_itr_t * hts_itr_querys (const hts_idx_t * idx , const char * reg , hts_name2id_f getid , void * hdr , hts_itr_query_func * itr_query , hts_readrec_func * readrec )
@@ -2638,7 +2751,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
2638
2751
else if (strcmp (reg , "*" ) == 0 )
2639
2752
return itr_query (idx , HTS_IDX_NOCOOR , 0 , 0 , readrec );
2640
2753
2641
- if (!hts_parse_reg2 (reg , & tid , & beg , & end , getid , hdr ))
2754
+ if (!hts_parse_region (reg , & tid , & beg , & end , getid , hdr , HTS_PARSE_THOUSANDS_SEP ))
2642
2755
return NULL ;
2643
2756
2644
2757
return itr_query (idx , tid , beg , end , readrec );
0 commit comments