@@ -1814,6 +1814,135 @@ static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
1814
1814
(* reports )++ ;
1815
1815
}
1816
1816
1817
+ /**
1818
+ * updatephasing - updates 1st phasing based on other phasing status
1819
+ * @param p - pointer to phase value array
1820
+ * @param end - end of array
1821
+ * @param q - pointer to consumed data
1822
+ * @param set - whether to set (while read) or reset (for write)
1823
+ * @param samples - no. of samples in array
1824
+ * @param ploidy - no. of phasing values per sample
1825
+ * @param type - value type (one of BCF_BT_...)
1826
+ * Returns 0 on success and 1 on failure
1827
+ */
1828
+ static int updatephasing (uint8_t * p , uint8_t * end , uint8_t * * q , int set , int samples , int ploidy , int type )
1829
+ {
1830
+ int j , k , upd = set ? 1 : -1 ;
1831
+ size_t bytes ;
1832
+ for (j = 0 ; j < samples ; ++ j ) { //for each sample
1833
+ int anyunphased = 0 ;
1834
+ uint8_t * ptr1 = p ;
1835
+ int32_t al1 = 0 , val ;
1836
+ for (k = 0 ; k < ploidy ; ++ k ) { //in each ploidy
1837
+ switch (type ) {
1838
+ case BCF_BT_INT8 :
1839
+ val = * (uint8_t * )p ;
1840
+ break ;
1841
+ case BCF_BT_INT16 :
1842
+ val = * (uint16_t * )p ;
1843
+ break ;
1844
+ case BCF_BT_INT32 :
1845
+ val = * (uint32_t * )p ;
1846
+ break ;
1847
+ //wont have anything bigger than 32bit for GT
1848
+ default : //invalid
1849
+ return 1 ;
1850
+ break ;
1851
+ }
1852
+ if (!k ) al1 = val ;
1853
+ else if (!(val & 1 )) {
1854
+ anyunphased = 1 ;
1855
+ }
1856
+ //get to next phasing or skip the rest for this sample
1857
+ bytes = (anyunphased ? ploidy - k : 1 ) << bcf_type_shift [type ];
1858
+ if (end - p < bytes )
1859
+ return 1 ;
1860
+ p += bytes ;
1861
+ if (anyunphased ) {
1862
+ break ; //no further check required
1863
+ }
1864
+ }
1865
+ if (!anyunphased && al1 > 1 ) { //no other unphased
1866
+ /*set phased on read or make unphased on write as upto 4.3 1st
1867
+ phasing is not described explicitly and has to be inferred*/
1868
+ switch (type ) {
1869
+ case BCF_BT_INT8 :
1870
+ * (uint8_t * )ptr1 += upd ;
1871
+ break ;
1872
+ case BCF_BT_INT16 :
1873
+ * (uint16_t * )ptr1 += upd ;
1874
+ break ;
1875
+ case BCF_BT_INT32 :
1876
+ * (uint32_t * )ptr1 += upd ;
1877
+ break ;
1878
+ }
1879
+ }
1880
+ }
1881
+ * q = p ;
1882
+ return 0 ;
1883
+ }
1884
+
1885
+ /**
1886
+ * update44phasing - converts GT to/from v4.4 way representation
1887
+ * @param h - bcf header, to get version
1888
+ * @param v - pointer to bcf data
1889
+ * @param setreset - whether to set or reset
1890
+ * Returns 0 on success and -1 on failure
1891
+ * For data read, to be converted to v44, setreset to be 1. For data write, to
1892
+ * be converted to v < v44, setreset to be 0.
1893
+ * If the version in header is >= 4.4, no change is made. Otherwise 1st phasing
1894
+ * is set if there are no other unphased ones.
1895
+ */
1896
+ HTSLIB_EXPORT int update44phasing (bcf_hdr_t * h , bcf1_t * b , int setreset )
1897
+ {
1898
+ int i , idgt = -1 , ver = VCF_DEF , num , type ;
1899
+ uint8_t * ptr = NULL , * end = NULL ;
1900
+ if (!b ) return 1 ;
1901
+
1902
+ ver = bcf_get_version (h , "" );
1903
+ idgt = h ? bcf_hdr_id2int (h , BCF_DT_ID , "GT" ) : -1 ;
1904
+ if (ver >= VCF44 || idgt == -1 ) return 0 ; //no change required
1905
+
1906
+ if (b -> unpacked & BCF_UN_FMT ) { //unpacked, get from decoded data
1907
+ for (i = 0 ; i < b -> n_fmt ; i ++ )
1908
+ {
1909
+ if ( b -> d .fmt [i ].id == idgt ) {
1910
+ ptr = b -> d .fmt [i ].p ;
1911
+ end = ptr + b -> d .fmt [i ].p_len ;
1912
+ num = b -> d .fmt [i ].n ;
1913
+ type = b -> d .fmt [i ].type ;
1914
+ break ;
1915
+ }
1916
+ }
1917
+ } else { //get from indiv.s binary stream
1918
+ ptr = (uint8_t * ) b -> indiv .s ;
1919
+ end = ptr + b -> indiv .l ;
1920
+ int found = 0 ;
1921
+ for (i = 0 ; i < b -> n_fmt ; ++ i ) {
1922
+ int32_t key = -1 ;
1923
+ if (bcf_dec_typed_int1_safe (ptr , end , & ptr , & key ) != 0 ) return 1 ;
1924
+ if (bcf_dec_size_safe (ptr , end , & ptr , & num , & type ) != 0 ) return 1 ;
1925
+ if (type > BCF_BT_CHAR ) return 1 ; //invalid type
1926
+ if (idgt == key ) {
1927
+ found = 1 ;
1928
+ break ;
1929
+ } else { //skip and check next
1930
+ size_t bytes = ((size_t ) num << bcf_type_shift [type ]) * b -> n_sample ;
1931
+ if (end - ptr < bytes ) return 1 ;
1932
+ ptr += bytes ;
1933
+ }
1934
+ }
1935
+ if (!found ) {
1936
+ ptr = end = NULL ;
1937
+ }
1938
+ }
1939
+ if (ptr ) {
1940
+ //with GT and v < v44, need phase conversion
1941
+ if (updatephasing (ptr , end , & ptr , setreset , b -> n_sample , num , type )) return 1 ;
1942
+ }
1943
+ return 0 ;
1944
+ }
1945
+
1817
1946
static int bcf_record_check (const bcf_hdr_t * hdr , bcf1_t * rec ) {
1818
1947
uint8_t * ptr , * end ;
1819
1948
size_t bytes ;
@@ -1832,6 +1961,10 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1832
1961
(1 << BCF_BT_FLOAT ) |
1833
1962
(1 << BCF_BT_CHAR ));
1834
1963
int32_t max_id = hdr ? hdr -> n [BCF_DT_ID ] : 0 ;
1964
+ int idgt = -1 , ver = VCF_DEF ;
1965
+
1966
+ ver = bcf_get_version (hdr , NULL );
1967
+ idgt = hdr ? bcf_hdr_id2int (hdr , BCF_DT_ID , "GT" ) : -1 ;
1835
1968
1836
1969
// Check for valid contig ID
1837
1970
if (rec -> rid < 0
@@ -1941,9 +2074,16 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
1941
2074
bcf_record_check_err (hdr , rec , "type" , & reports , type );
1942
2075
err |= BCF_ERR_TAG_INVALID ;
1943
2076
}
1944
- bytes = ((size_t ) num << bcf_type_shift [type ]) * rec -> n_sample ;
1945
- if (end - ptr < bytes ) goto bad_indiv ;
1946
- ptr += bytes ;
2077
+ if (ver < VCF44 && idgt != -1 && idgt == key ) {
2078
+ //with GT and v < v44, need phase conversion
2079
+ if (updatephasing (ptr , end , & ptr , 1 , rec -> n_sample , num , type )) {
2080
+ err |= BCF_ERR_TAG_INVALID ;
2081
+ }
2082
+ } else {
2083
+ bytes = ((size_t ) num << bcf_type_shift [type ]) * rec -> n_sample ;
2084
+ if (end - ptr < bytes ) goto bad_indiv ;
2085
+ ptr += bytes ;
2086
+ }
1947
2087
}
1948
2088
1949
2089
if (!err && rec -> rlen < 0 ) {
@@ -2032,6 +2172,8 @@ int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_po
2032
2172
if (ret == 0 ) ret = bcf_record_check (NULL , v );
2033
2173
if (ret >= 0 )
2034
2174
* tid = v -> rid , * beg = v -> pos , * end = v -> pos + v -> rlen ;
2175
+ /*bcf data read may need conversion to vcf44 phasing format, as header is
2176
+ not availble here, it has to be done after this returns the data*/
2035
2177
return ret ;
2036
2178
}
2037
2179
@@ -2300,6 +2442,11 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
2300
2442
return -1 ;
2301
2443
}
2302
2444
2445
+ if (update44phasing (h , v , 0 )) { //reset phasing update made after read
2446
+ hts_log_error ("Failed to set prorper phasing at %s:%" PRIhts_pos "" , bcf_seqname_safe (h ,v ), v -> pos + 1 );
2447
+ return -1 ;
2448
+ }
2449
+
2303
2450
BGZF * fp = hfp -> fp .bgzf ;
2304
2451
uint8_t x [32 ];
2305
2452
u32_to_le (v -> shared .l + 24 , x ); // to include six 32-bit integers
@@ -3225,7 +3372,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
3225
3372
is_phased = (* t == '|' );
3226
3373
if (* t != '|' && * t != '/' ) break ;
3227
3374
}
3228
- if (ver >= VCF44 && !phasingprfx ) {
3375
+ if (!phasingprfx ) { //get GT in v44 way when no prefixed phasing
3229
3376
/* no explicit phasing for 1st allele, set based on
3230
3377
other alleles and ploidy */
3231
3378
if (ploidy == 1 ) { //implicitly phased
0 commit comments