Skip to content

Commit 455a96e

Browse files
committed
converting bcf 1st phase data as in v44
1 parent 4a11e1f commit 455a96e

File tree

4 files changed

+174
-7
lines changed

4 files changed

+174
-7
lines changed

htslib/vcf.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,6 +1518,21 @@ HTSLIB_EXPORT
15181518
int bcf_format_gt_v2(const bcf_hdr_t *hdr, bcf_fmt_t *fmt, int isample,
15191519
kstring_t *str) HTS_RESULT_USED;
15201520

1521+
1522+
/**
1523+
* update44phasing - converts GT to/from v4.4 way representation
1524+
* @param h - bcf header, to get version
1525+
* @param v - pointer to bcf data
1526+
* @param setreset - whether to set or reset
1527+
* Returns 0 on success and -1 on failure
1528+
* For data read, to be converted to v44, setreset to be 1. For data write, to
1529+
* be converted to v < v44, setreset to be 0.
1530+
* If the version in header is >= 4.4, no change is made. Otherwise 1st phasing
1531+
* is set if there are no other unphased ones.
1532+
*/
1533+
HTSLIB_EXPORT int update44phasing(bcf_hdr_t *h, bcf1_t *b, int setreset)
1534+
HTS_RESULT_USED;
1535+
15211536
static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
15221537
{
15231538
return bcf_format_gt_v2(NULL, fmt, isample, str);

synced_bcf_reader.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -708,6 +708,11 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
708708
ret = bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1]);
709709
if ( ret < -1 ) files->errnum = bcf_read_error;
710710
if ( ret < 0 ) break; // no more lines or an error
711+
//set phasing of 1st value as in vcf v44
712+
if ((ret = update44phasing(reader->header, reader->buffer[reader->nbuffer+1], 1))) {
713+
files->errnum = bcf_read_error;
714+
break;
715+
}
711716
bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
712717
}
713718

test/test.pl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -369,9 +369,9 @@ sub failed
369369
if ( defined $reason ) { print STDERR "\t$reason\n"; }
370370
print STDERR ".. failed ...\n\n";
371371
STDERR->flush();
372-
if ($$opts{fail_fast}) {
373-
die "\n";
374-
}
372+
#if ($$opts{fail_fast}) {
373+
# die "\n";
374+
#}
375375
}
376376
sub passed
377377
{

vcf.c

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,6 +1814,135 @@ static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec,
18141814
(*reports)++;
18151815
}
18161816

1817+
/**
1818+
* updatephasing - updates 1st phasing based on other phasing status
1819+
* @param p - pointer to phase value array
1820+
* @param end - end of array
1821+
* @param q - pointer to consumed data
1822+
* @param set - whether to set (while read) or reset (for write)
1823+
* @param samples - no. of samples in array
1824+
* @param ploidy - no. of phasing values per sample
1825+
* @param type - value type (one of BCF_BT_...)
1826+
* Returns 0 on success and 1 on failure
1827+
*/
1828+
static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int set, int samples, int ploidy, int type)
1829+
{
1830+
int j, k, upd = set ? 1 : -1;
1831+
size_t bytes;
1832+
for (j = 0; j < samples; ++j) { //for each sample
1833+
int anyunphased = 0;
1834+
uint8_t *ptr1 = p;
1835+
int32_t al1 = 0, val;
1836+
for (k = 0; k < ploidy; ++k) { //in each ploidy
1837+
switch(type) {
1838+
case BCF_BT_INT8:
1839+
val = *(uint8_t*)p;
1840+
break;
1841+
case BCF_BT_INT16:
1842+
val = *(uint16_t*)p;
1843+
break;
1844+
case BCF_BT_INT32:
1845+
val = *(uint32_t*)p;
1846+
break;
1847+
//wont have anything bigger than 32bit for GT
1848+
default: //invalid
1849+
return 1;
1850+
break;
1851+
}
1852+
if (!k) al1 = val;
1853+
else if (!(val & 1)) {
1854+
anyunphased = 1;
1855+
}
1856+
//get to next phasing or skip the rest for this sample
1857+
bytes = (anyunphased ? ploidy - k : 1) << bcf_type_shift[type];
1858+
if (end - p < bytes)
1859+
return 1;
1860+
p += bytes;
1861+
if (anyunphased) {
1862+
break; //no further check required
1863+
}
1864+
}
1865+
if (!anyunphased && al1 > 1) { //no other unphased
1866+
/*set phased on read or make unphased on write as upto 4.3 1st
1867+
phasing is not described explicitly and has to be inferred*/
1868+
switch(type) {
1869+
case BCF_BT_INT8:
1870+
*(uint8_t*)ptr1 += upd;
1871+
break;
1872+
case BCF_BT_INT16:
1873+
*(uint16_t*)ptr1 += upd;
1874+
break;
1875+
case BCF_BT_INT32:
1876+
*(uint32_t*)ptr1 += upd;
1877+
break;
1878+
}
1879+
}
1880+
}
1881+
*q = p;
1882+
return 0;
1883+
}
1884+
1885+
/**
1886+
* update44phasing - converts GT to/from v4.4 way representation
1887+
* @param h - bcf header, to get version
1888+
* @param v - pointer to bcf data
1889+
* @param setreset - whether to set or reset
1890+
* Returns 0 on success and -1 on failure
1891+
* For data read, to be converted to v44, setreset to be 1. For data write, to
1892+
* be converted to v < v44, setreset to be 0.
1893+
* If the version in header is >= 4.4, no change is made. Otherwise 1st phasing
1894+
* is set if there are no other unphased ones.
1895+
*/
1896+
HTSLIB_EXPORT int update44phasing(bcf_hdr_t *h, bcf1_t *b, int setreset)
1897+
{
1898+
int i, idgt = -1, ver = VCF_DEF, num, type;
1899+
uint8_t *ptr = NULL, *end = NULL;
1900+
if (!b) return 1;
1901+
1902+
ver = bcf_get_version(h, "");
1903+
idgt = h ? bcf_hdr_id2int(h, BCF_DT_ID, "GT") : -1;
1904+
if (ver >= VCF44 || idgt == -1) return 0; //no change required
1905+
1906+
if (b->unpacked & BCF_UN_FMT) { //unpacked, get from decoded data
1907+
for (i=0; i<b->n_fmt; i++)
1908+
{
1909+
if ( b->d.fmt[i].id == idgt ) {
1910+
ptr = b->d.fmt[i].p;
1911+
end = ptr + b->d.fmt[i].p_len;
1912+
num = b->d.fmt[i].n;
1913+
type = b->d.fmt[i].type;
1914+
break;
1915+
}
1916+
}
1917+
} else { //get from indiv.s binary stream
1918+
ptr = (uint8_t *) b->indiv.s;
1919+
end = ptr + b->indiv.l;
1920+
int found = 0;
1921+
for (i = 0; i < b->n_fmt; ++i) {
1922+
int32_t key = -1;
1923+
if (bcf_dec_typed_int1_safe(ptr, end, &ptr, &key) != 0) return 1;
1924+
if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) return 1;
1925+
if (type > BCF_BT_CHAR) return 1; //invalid type
1926+
if (idgt == key) {
1927+
found = 1;
1928+
break;
1929+
} else { //skip and check next
1930+
size_t bytes = ((size_t) num << bcf_type_shift[type]) * b->n_sample;
1931+
if (end - ptr < bytes) return 1;
1932+
ptr += bytes;
1933+
}
1934+
}
1935+
if (!found) {
1936+
ptr = end = NULL;
1937+
}
1938+
}
1939+
if (ptr) {
1940+
//with GT and v < v44, need phase conversion
1941+
if (updatephasing(ptr, end, &ptr, setreset, b->n_sample, num, type)) return 1;
1942+
}
1943+
return 0;
1944+
}
1945+
18171946
static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
18181947
uint8_t *ptr, *end;
18191948
size_t bytes;
@@ -1832,6 +1961,10 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
18321961
(1 << BCF_BT_FLOAT) |
18331962
(1 << BCF_BT_CHAR));
18341963
int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0;
1964+
int idgt = -1, ver = VCF_DEF;
1965+
1966+
ver = bcf_get_version(hdr, NULL);
1967+
idgt = hdr ? bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1;
18351968

18361969
// Check for valid contig ID
18371970
if (rec->rid < 0
@@ -1941,9 +2074,16 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) {
19412074
bcf_record_check_err(hdr, rec, "type", &reports, type);
19422075
err |= BCF_ERR_TAG_INVALID;
19432076
}
1944-
bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
1945-
if (end - ptr < bytes) goto bad_indiv;
1946-
ptr += bytes;
2077+
if (ver < VCF44 && idgt != -1 && idgt == key) {
2078+
//with GT and v < v44, need phase conversion
2079+
if (updatephasing(ptr, end, &ptr, 1, rec->n_sample, num, type)) {
2080+
err |= BCF_ERR_TAG_INVALID;
2081+
}
2082+
} else {
2083+
bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample;
2084+
if (end - ptr < bytes) goto bad_indiv;
2085+
ptr += bytes;
2086+
}
19472087
}
19482088

19492089
if (!err && rec->rlen < 0) {
@@ -2032,6 +2172,8 @@ int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_po
20322172
if (ret == 0) ret = bcf_record_check(NULL, v);
20332173
if (ret >= 0)
20342174
*tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen;
2175+
/*bcf data read may need conversion to vcf44 phasing format, as header is
2176+
not availble here, it has to be done after this returns the data*/
20352177
return ret;
20362178
}
20372179

@@ -2300,6 +2442,11 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v)
23002442
return -1;
23012443
}
23022444

2445+
if (update44phasing(h, v, 0)) { //reset phasing update made after read
2446+
hts_log_error("Failed to set prorper phasing at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
2447+
return -1;
2448+
}
2449+
23032450
BGZF *fp = hfp->fp.bgzf;
23042451
uint8_t x[32];
23052452
u32_to_le(v->shared.l + 24, x); // to include six 32-bit integers
@@ -3225,7 +3372,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
32253372
is_phased = (*t == '|');
32263373
if (*t != '|' && *t != '/') break;
32273374
}
3228-
if (ver >= VCF44 && !phasingprfx) {
3375+
if (!phasingprfx) { //get GT in v44 way when no prefixed phasing
32293376
/* no explicit phasing for 1st allele, set based on
32303377
other alleles and ploidy */
32313378
if (ploidy == 1) { //implicitly phased

0 commit comments

Comments
 (0)