Skip to content

Commit d26300e

Browse files
committed
Update hts_reglist_create() to use hts_parse_region()
The hash table can now use the tid (cast to khint32_t, which is unsigned) as its key instead of the region name, avoiding some string copying.
1 parent 73eee10 commit d26300e

File tree

1 file changed

+35
-47
lines changed

1 file changed

+35
-47
lines changed

region.c

Lines changed: 35 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ typedef struct reglist
3131
{
3232
uint32_t n, m;
3333
uint64_t *a;
34+
int tid;
3435
} reglist_t;
3536

36-
KHASH_MAP_INIT_STR(reg, reglist_t)
37+
KHASH_MAP_INIT_INT(reg, reglist_t)
3738
typedef kh_reg_t reghash_t;
3839

3940
static int compare_uint64 (const void * a, const void * b)
@@ -52,7 +53,7 @@ static void reg_print(reghash_t *h) {
5253
reglist_t *p;
5354
khint_t k;
5455
uint32_t i;
55-
const char *reg;
56+
khint32_t key;
5657
uint32_t beg, end;
5758

5859
if (!h) {
@@ -61,16 +62,16 @@ static void reg_print(reghash_t *h) {
6162
}
6263
for (k = kh_begin(h); k < kh_end(h); k++) {
6364
if (kh_exist(h,k)) {
64-
reg = kh_key(h,k);
65-
fprintf(stderr, "Region: '%s'\n", reg);
65+
key = kh_key(h,k);
66+
fprintf(stderr, "Region: key %u tid %d\n", key, p->tid);
6667
if ((p = &kh_val(h,k)) != NULL && p->n > 0) {
6768
for (i=0; i<p->n; i++) {
6869
beg = (uint32_t)(p->a[i]>>32);
6970
end = (uint32_t)(p->a[i]);
7071
fprintf(stderr, "\tinterval[%d]: %d-%d\n", i, beg, end);
7172
}
7273
} else {
73-
fprintf(stderr, "Region '%s' has no intervals!\n", reg);
74+
fprintf(stderr, "Region key %u has no intervals!\n", key);
7475
}
7576
}
7677
}
@@ -109,7 +110,7 @@ static int reg_compact(reghash_t *h) {
109110
return count;
110111
}
111112

112-
static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int end) {
113+
static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) {
113114

114115
khint_t k;
115116
reglist_t *p;
@@ -118,17 +119,15 @@ static int reg_insert(reghash_t *h, char *reg, unsigned int beg, unsigned int en
118119
return -1;
119120

120121
// Put reg in the hash table if not already there
121-
k = kh_get(reg, h, reg); //looks strange, but only the second reg is the actual region name.
122+
k = kh_get(reg, h, tid);
122123
if (k == kh_end(h)) { // absent from the hash table
123124
int ret;
124-
char *s = strdup(reg);
125-
if (NULL == s) return -1;
126-
k = kh_put(reg, h, s, &ret);
125+
k = kh_put(reg, h, tid, &ret);
127126
if (-1 == ret) {
128-
free(s);
129127
return -1;
130128
}
131129
memset(&kh_val(h, k), 0, sizeof(reglist_t));
130+
kh_val(h, k).tid = tid;
132131
}
133132
p = &kh_val(h, k);
134133

@@ -156,7 +155,6 @@ static void reg_destroy(reghash_t *h) {
156155
for (k = 0; k < kh_end(h); ++k) {
157156
if (kh_exist(h, k)) {
158157
free(kh_val(h, k).a);
159-
free((char*)kh_key(h, k));
160158
}
161159
}
162160
kh_destroy(reg, h);
@@ -175,11 +173,10 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr
175173
hts_reglist_t *h_reglist = NULL;
176174

177175
khint_t k;
178-
int i, l_count = 0;
176+
int i, l_count = 0, tid;
179177
uint32_t j;
180-
char reg[1024];
181178
const char *q;
182-
int beg, end;
179+
int64_t beg, end;
183180

184181
/* First, transform the char array into a hash table */
185182
h = kh_init(reg);
@@ -189,65 +186,56 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr
189186
}
190187

191188
for (i=0; i<argc; i++) {
192-
q = hts_parse_reg(argv[i], &beg, &end);
193-
if (q) {
194-
if (q - argv[i] > sizeof(reg) - 1) {
195-
hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1);
196-
continue;
197-
}
198-
memcpy(reg, argv[i], q - argv[i]);
199-
reg[q - argv[i]] = 0;
189+
if (!strcmp(argv[i], ".")) {
190+
q = argv[i] + 1;
191+
tid = HTS_IDX_START; beg = 0; end = INT64_MAX;
192+
} else if (!strcmp(argv[i], "*")) {
193+
q = argv[i] + 1;
194+
tid = HTS_IDX_NOCOOR; beg = 0; end = INT64_MAX;
200195
} else {
201-
// not parsable as a region, but possibly a sequence named "foo:a"
202-
if (strlen(argv[i]) > sizeof(reg) - 1) {
203-
hts_log_error("Region name '%s' is too long (bigger than %d)", argv[i], (int) sizeof(reg) - 1);
204-
continue;
205-
}
206-
strcpy(reg, argv[i]);
207-
beg = 0; end = INT_MAX;
196+
q = hts_parse_region(argv[i], &tid, &beg, &end, getid, hdr,
197+
HTS_PARSE_THOUSANDS_SEP);
208198
}
199+
if (!q) {
200+
// not parsable as a region
201+
hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", argv[i]);
202+
continue;
203+
}
204+
205+
if (beg > INT_MAX) beg = INT_MAX; // Remove when fully 64-bit compliant
206+
if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant
209207

210-
if (reg_insert(h, reg, beg, end) != 0) {
208+
if (reg_insert(h, tid, beg, end) != 0) {
211209
hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h);
212210
goto fail;
213211
}
214212
}
215213

216214
*r_count = reg_compact(h);
217215
if (!*r_count)
218-
return NULL;
216+
goto fail;
219217

220218
/* Transform the hash table into a list */
221219
h_reglist = (hts_reglist_t *)calloc(*r_count, sizeof(hts_reglist_t));
222220
if (!h_reglist)
223-
return NULL;
221+
goto fail;
224222

225223
for (k = kh_begin(h); k < kh_end(h) && l_count < *r_count; k++) {
226224
if (!kh_exist(h,k) || !(p = &kh_val(h,k)))
227225
continue;
228226

229-
char *reg_name = (char *)kh_key(h,k);
230-
if (!strcmp(reg_name, ".")) {
231-
h_reglist[l_count].tid = HTS_IDX_START;
232-
} else if (!strcmp(reg_name, "*")) {
233-
h_reglist[l_count].tid = HTS_IDX_NOCOOR;
234-
} else {
235-
h_reglist[l_count].tid = getid(hdr, reg_name);
236-
if (h_reglist[l_count].tid < 0)
237-
hts_log_warning("Region '%s' specifies an unknown reference name. Continue anyway", reg_name);
238-
}
239-
240-
h_reglist[l_count].intervals = (hts_pair32_t *)calloc(p->n, sizeof(hts_pair32_t));
227+
h_reglist[l_count].tid = p->tid;
228+
h_reglist[l_count].intervals = calloc(p->n, sizeof(h_reglist[l_count].intervals[0]));
241229
if(!(h_reglist[l_count].intervals)) {
242-
hts_log_error("Could not allocate memory for intervals for region='%s'", kh_key(h,k));
230+
hts_log_error("Could not allocate memory for intervals");
243231
goto fail;
244232
}
245233
h_reglist[l_count].count = p->n;
246234
h_reglist[l_count].max_end = 0;
247235

248236
for (j = 0; j < p->n; j++) {
249237
h_reglist[l_count].intervals[j].beg = (uint32_t)(p->a[j]>>32);
250-
h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j]);
238+
h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j] & 0xffffffffU);
251239

252240
if (h_reglist[l_count].intervals[j].end > h_reglist[l_count].max_end)
253241
h_reglist[l_count].max_end = h_reglist[l_count].intervals[j].end;

0 commit comments

Comments
 (0)