Skip to content

Commit ac70212

Browse files
committed
Revert most of the vcf_parse_info improvements.
The two we keep are the internal while loop to find the next ; or = instead of iterating back in the outer for loop, and memcmp instead of strcmp for "END". The strchr changes do help glibc on excessively long INFO tokens, seen in the GIAB truth set and GNOMAD files, but they have no impact on most mainstream VCF outputs. Furthermore, other C libraries, such as MUSL, are considerably slowed down by the use of strchr. Hence this isn't a particularly robust or warranted change.
1 parent 2318975 commit ac70212

File tree

1 file changed

+22
-43
lines changed

1 file changed

+22
-43
lines changed

vcf.c

Lines changed: 22 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3336,50 +3336,26 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
33363336

33373337
v->n_info = 0;
33383338
if (*(q-1) == ';') *(q-1) = 0;
3339-
3340-
// Parsing consists of processing key=value; or key; so we only need
3341-
// to track the next = and ; symbols, plus end of string.
3342-
// We could process 1 char at a time, but this is inefficient compared
3343-
// to strchr which can process word by word. Hence even doing two strchrs
3344-
// is quicker than byte by byte processing.
3345-
char *next_equals = strchr(p, '=');
3346-
char *next_semicolon = strchr(p, ';');
3347-
r = p;
3348-
while (*r) {
3349-
// Look for key=value or just key.
3350-
char *val, *end, *from;
3351-
key = r;
3352-
if (next_equals && (!next_semicolon || next_equals < next_semicolon)) {
3353-
// key=value;
3354-
*next_equals = 0;
3355-
from = val = next_equals+1;
3356-
// Prefetching d->keys[hash] helps here provided we avoid
3357-
// computing hash twice (needs API change), but not universally.
3358-
// It may be significant for other applications though so it's
3359-
// something to consider for the future.
3360-
} else {
3361-
// key;
3362-
val = NULL;
3363-
from = key;
3364-
}
3365-
3366-
// Update location of next ; and if used =
3367-
if (next_semicolon) {
3368-
end = next_semicolon;
3369-
r = end+1;
3370-
next_semicolon = strchr(end+1, ';');
3371-
if (val)
3372-
next_equals = strchr(end, '=');
3373-
} else {
3374-
// find nul location, starting from key or val.
3375-
r = end = from + strlen(from);
3339+
for (r = key = p;; ++r) {
3340+
int c;
3341+
char *val, *end;
3342+
while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
3343+
if (v->n_info == UINT16_MAX) {
3344+
hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
3345+
bcf_seqname_safe(h,v), v->pos+1);
3346+
v->errcode |= BCF_ERR_LIMITS;
3347+
goto fail;
33763348
}
3349+
val = end = NULL;
3350+
c = *r; *r = 0;
3351+
if (c == '=') {
3352+
val = r + 1;
33773353

3378-
*end = 0;
3379-
3380-
// We've now got key and val (maybe NULL), so process it
3354+
for (end = val; *end != ';' && *end != 0; ++end);
3355+
c = *end; *end = 0;
3356+
} else end = r;
3357+
if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO
33813358
k = kh_get(vdict, d, key);
3382-
// 15 is default (unknown) type. See bcf_idinfo_def at top
33833359
if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
33843360
{
33853361
hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
@@ -3481,8 +3457,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
34813457
} else {
34823458
bcf_enc_vint(str, n_val, a_val, -1);
34833459
}
3484-
if (n_val==1 && (val1!=bcf_int32_missing || is_int64) &&
3485-
memcmp(key, "END", 4) == 0)
3460+
if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
3461+
&& memcmp(key, "END", 4) == 0)
34863462
{
34873463
if ( val1 <= v->pos )
34883464
{
@@ -3508,6 +3484,9 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
35083484
bcf_enc_vfloat(str, n_val, val_f);
35093485
}
35103486
}
3487+
if (c == 0) break;
3488+
r = end;
3489+
key = r + 1;
35113490
}
35123491

35133492
free(a_val);

0 commit comments

Comments
 (0)