@@ -43,6 +43,7 @@ DEALINGS IN THE SOFTWARE. */
43
43
#include "hts_internal.h"
44
44
45
45
typedef struct {
46
+ int id ; // faidx_t->name[id] is for this struct.
46
47
uint32_t line_len , line_blen ;
47
48
uint64_t len ;
48
49
uint64_t seq_offset ;
@@ -62,6 +63,13 @@ struct __faidx_t {
62
63
#define kroundup32 (x ) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
63
64
#endif
64
65
66
+ static int fai_name2id (void * v , const char * ref )
67
+ {
68
+ faidx_t * fai = (faidx_t * )v ;
69
+ khint_t k = kh_get (s , fai -> hash , ref );
70
+ return k == kh_end (fai -> hash ) ? -1 : kh_val (fai -> hash , k ).id ;
71
+ }
72
+
65
73
static inline int fai_insert_index (faidx_t * idx , const char * name , uint64_t len , uint32_t line_len , uint32_t line_blen , uint64_t seq_offset , uint64_t qual_offset )
66
74
{
67
75
if (!name ) {
@@ -89,6 +97,7 @@ static inline int fai_insert_index(faidx_t *idx, const char *name, uint64_t len,
89
97
}
90
98
idx -> name = tmp ;
91
99
}
100
+ v -> id = idx -> n ;
92
101
idx -> name [idx -> n ++ ] = name_key ;
93
102
v -> len = len ;
94
103
v -> line_len = line_len ;
@@ -684,14 +693,22 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) {
684
693
685
694
686
695
static char * fai_retrieve (const faidx_t * fai , const faidx1_t * val ,
687
- uint64_t offset , long beg , long end , int * len ) {
696
+ uint64_t offset , int64_t beg , int64_t end , int * len ) {
688
697
char * s ;
689
698
size_t l ;
690
699
int c = 0 ;
691
- int ret = bgzf_useek (fai -> bgzf ,
692
- offset
693
- + beg / val -> line_blen * val -> line_len
694
- + beg % val -> line_blen , SEEK_SET );
700
+ int ret ;
701
+
702
+ if ((uint64_t ) end - (uint64_t ) beg >= SIZE_MAX - 2 ) {
703
+ hts_log_error ("Range %" PRId64 "..%" PRId64 " too big" , beg , end );
704
+ * len = -1 ;
705
+ return NULL ;
706
+ }
707
+
708
+ ret = bgzf_useek (fai -> bgzf ,
709
+ offset
710
+ + beg / val -> line_blen * val -> line_len
711
+ + beg % val -> line_blen , SEEK_SET );
695
712
696
713
if (ret < 0 ) {
697
714
* len = -1 ;
@@ -721,85 +738,32 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val,
721
738
return s ;
722
739
}
723
740
724
-
725
- static int fai_get_val (const faidx_t * fai , const char * str , int * len , faidx1_t * val , long * fbeg , long * fend ) {
726
- char * s , * ep ;
727
- size_t i , l , k , name_end ;
741
+ static int fai_get_val (const faidx_t * fai , const char * str ,
742
+ int * len , faidx1_t * val , int64_t * fbeg , int64_t * fend ) {
728
743
khiter_t iter ;
729
744
khash_t (s ) * h ;
730
- long beg , end ;
745
+ int id ;
746
+ int64_t beg , end ;
731
747
732
- beg = end = -1 ;
733
- h = fai -> hash ;
734
- name_end = l = strlen (str );
735
- s = (char * )malloc (l + 1 );
736
- if (!s ) {
737
- * len = -1 ;
748
+ if (!fai_parse_region (fai , str , & id , & beg , & end , 0 )) {
749
+ hts_log_warning ("Reference %s not found in FASTA file, returning empty sequence" , str );
750
+ * len = -2 ;
738
751
return 1 ;
739
752
}
740
753
741
- // remove space
742
- for (i = k = 0 ; i < l ; ++ i )
743
- if (!isspace_c (str [i ])) s [k ++ ] = str [i ];
744
- s [k ] = 0 ;
745
- name_end = l = k ;
746
- // determine the sequence name
747
- for (i = l ; i > 0 ; -- i ) if (s [i - 1 ] == ':' ) break ; // look for colon from the end
748
- if (i > 0 ) name_end = i - 1 ;
749
- if (name_end < l ) { // check if this is really the end
750
- int n_hyphen = 0 ;
751
- for (i = name_end + 1 ; i < l ; ++ i ) {
752
- if (s [i ] == '-' ) ++ n_hyphen ;
753
- else if (!isdigit_c (s [i ]) && s [i ] != ',' ) break ;
754
- }
755
- if (i < l || n_hyphen > 1 ) name_end = l ; // malformated region string; then take str as the name
756
- s [name_end ] = 0 ;
757
- iter = kh_get (s , h , s );
758
- if (iter == kh_end (h )) { // cannot find the sequence name
759
- iter = kh_get (s , h , str ); // try str as the name
760
- if (iter != kh_end (h )) {
761
- s [name_end ] = ':' ;
762
- name_end = l ;
763
- }
764
- }
765
- } else iter = kh_get (s , h , str );
766
- if (iter == kh_end (h )) {
767
- hts_log_warning ("Reference %s not found in file, returning empty sequence" , str );
768
- free (s );
754
+ h = fai -> hash ;
755
+ iter = kh_get (s , h , faidx_iseq (fai , id ));
756
+ if (!iter ) {
757
+ // should have already been caught above
758
+ abort ();
769
759
* len = -2 ;
770
760
return 1 ;
771
761
}
772
762
* val = kh_value (h , iter );
773
- // parse the interval
774
- if (name_end < l ) {
775
- int save_errno = errno ;
776
- errno = 0 ;
777
- for (i = k = name_end + 1 ; i < l ; ++ i )
778
- if (s [i ] != ',' ) s [k ++ ] = s [i ];
779
- s [k ] = 0 ;
780
- if (s [name_end + 1 ] == '-' ) {
781
- beg = 0 ;
782
- i = name_end + 2 ;
783
- } else {
784
- beg = strtol (s + name_end + 1 , & ep , 10 );
785
- for (i = ep - s ; i < k ;) if (s [i ++ ] == '-' ) break ;
786
- }
787
- end = i < k ? strtol (s + i , & ep , 10 ) : val -> len ;
788
- if (beg > 0 ) -- beg ;
789
- // Check for out of range numbers. Only going to be a problem on
790
- // 32-bit platforms with >2Gb sequence length.
791
- if (errno == ERANGE && (uint64_t ) val -> len > LONG_MAX ) {
792
- hts_log_error ("Positions in range %s are too large for this platform" , s );
793
- free (s );
794
- * len = -3 ;
795
- return 1 ;
796
- }
797
- errno = save_errno ;
798
- } else beg = 0 , end = val -> len ;
763
+
799
764
if (beg >= val -> len ) beg = val -> len ;
800
765
if (end >= val -> len ) end = val -> len ;
801
766
if (beg > end ) beg = end ;
802
- free (s );
803
767
804
768
* fbeg = beg ;
805
769
* fend = end ;
@@ -811,7 +775,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, int *len, faidx1_t *
811
775
char * fai_fetch (const faidx_t * fai , const char * str , int * len )
812
776
{
813
777
faidx1_t val ;
814
- long beg , end ;
778
+ int64_t beg , end ;
815
779
816
780
if (fai_get_val (fai , str , len , & val , & beg , & end )) {
817
781
return NULL ;
@@ -824,7 +788,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len)
824
788
825
789
char * fai_fetchqual (const faidx_t * fai , const char * str , int * len ) {
826
790
faidx1_t val ;
827
- long beg , end ;
791
+ int64_t beg , end ;
828
792
829
793
if (fai_get_val (fai , str , len , & val , & beg , & end )) {
830
794
return NULL ;
@@ -924,3 +888,8 @@ int faidx_has_seq(const faidx_t *fai, const char *seq)
924
888
return 1 ;
925
889
}
926
890
891
+ const char * fai_parse_region (const faidx_t * fai , const char * s ,
892
+ int * tid , int64_t * beg , int64_t * end , int flags )
893
+ {
894
+ return hts_parse_region (s , tid , beg , end , (hts_name2id_f )fai_name2id , (void * )fai , flags );
895
+ }
0 commit comments