Skip to content

Commit 1c9902b

Browse files
waynewayne
authored andcommitted
Added the MTF generation of the BWT for testcase of self - delimiting
codes benchmark.
1 parent da70b32 commit 1c9902b

File tree

18 files changed

+1082
-13
lines changed

18 files changed

+1082
-13
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
include ../../Make.helper
2+
SRC_DIR = src
3+
BIN_DIR = bin
4+
LIBS = -lsdsl
5+
RES_FILE = results/result.csv #result file of benchmark
6+
VAT_FILE = results/vat.csv #vector assignment table (vector name -> sdsl type)
7+
TC_FILE = results/tc.csv #test case table (contains only test case names)
8+
9+
#utility
10+
empty:=
11+
space:= $(empty) $(empty)
12+
comma:= ,
13+
14+
#load test cases
15+
TC_IDS := $(call config_ids,test_case.config)
16+
TC_SRC := $(foreach TC_ID,$(TC_IDS),\
17+
$(call config_select,test_case.config,$(TC_ID),2))
18+
TC_FILES := $(foreach TC_ID,$(TC_IDS),\
19+
$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
20+
../tmp/BWT_MTF.$(TC_ID),\
21+
$(call config_select,test_case.config,$(TC_ID),2)))
22+
23+
all: $(RES_FILE)
24+
25+
# Run the benchmark (if its result file is out of date) and then build the
# report in visualize/.
# $(MAKE) -C is used instead of "cd visualize;make": with the latter a failed
# cd would still run make - in this directory - and $(MAKE) also hands the
# command line flags of the parent make down to the sub-make.
timing: $(RES_FILE)
	@$(MAKE) -C visualize
27+
28+
#compilation of the BWT-MTF transform generator
# C_OPTIONS is loaded here in the same way as in the sdcbenchmark rule. In this
# Makefile it is otherwise only assigned inside that rule's recipe, which runs
# after the generator is needed, so the generator would be compiled without the
# configured options (NOTE(review): confirm Make.helper does not define
# C_OPTIONS). compile_options.config is a prerequisite so that changed options
# trigger a rebuild.
$(BIN_DIR)/gen_bwt_mtf: $(SRC_DIR)/gen_bwt_mtf.cpp compile_options.config
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -L$(LIB_DIR) "$(SRC_DIR)/gen_bwt_mtf.cpp"\
	-I$(INC_DIR) -o "$(BIN_DIR)/gen_bwt_mtf" $(LIBS) -ldivsufsort -ldivsufsort64
32+
33+
#generation of MTF of BWT
34+
../tmp/BWT_MTF.%: $(TC_SRC) $(BIN_DIR)/gen_bwt_mtf
35+
$(eval TC_ID:=$*)
36+
$(eval TC_PATH:=$(call config_select,test_case.config,$(TC_ID),2))
37+
$(eval NUM_BYTE:=$(call config_select,test_case.config,$(TC_ID),5))
38+
@$(BIN_DIR)/gen_bwt_mtf $(TC_PATH) ../tmp/BWT_MTF.$(TC_ID) ../tmp $(NUM_BYTE)
39+
40+
#compilation and creation of vector assignment table
41+
$(BIN_DIR)/sdcbenchmark: $(SRC_DIR)/sdc_benchmark.cpp vectors.config compile_options.config
42+
$(eval VTYPES := $(subst $(space),$(comma),$(strip $(call config_column,vectors.config,2))))
43+
$(eval VNAMES := $(subst $(space),\"$(comma)\",$(strip $(call config_column,vectors.config,3))))
44+
$(eval VNAMES := $(addprefix {\",$(VNAMES)))
45+
$(eval VNAMES := $(addsuffix \"},$(VNAMES)))
46+
$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
47+
@echo "Compiling build for vectors $(VNAMES)"
48+
@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -DVTYPES="$(VTYPES)" -DVNAMES="$(VNAMES)" -L$(LIB_DIR)\
49+
"$(SRC_DIR)/sdc_benchmark.cpp" -I$(INC_DIR) -o "$(BIN_DIR)/sdcbenchmark" $(LIBS)
50+
$(eval V_IDS := $(call config_ids,vectors.config))
51+
$(eval V_ASSIGNMENTTABLE := $(subst $(space),\n,$(strip $(foreach V_ID,$(V_IDS),\
52+
$(call config_select,vectors.config,$(V_ID),3);$(call config_select,vectors.config,$(V_ID),2)))))
53+
@echo "Writing Vector Assignment Table"
54+
@echo "vector;sdsltype" > $(VAT_FILE)
55+
@echo "$(V_ASSIGNMENTTABLE)" >> $(VAT_FILE)
56+
57+
#execution and creation of test case table
58+
$(RES_FILE): test_case.config $(TC_FILES) $(BIN_DIR)/sdcbenchmark
59+
$(eval ARGS := $(foreach TC_ID,$(TC_IDS),\
60+
$(call config_select,test_case.config,$(TC_ID),3) $(space) \
61+
$(if $(findstring BWT_MTF,$(call config_select,test_case.config,$(TC_ID),6)),\
62+
../tmp/BWT_MTF.$(TC_ID),\
63+
$(call config_select,test_case.config,$(TC_ID),2)) $(space) \
64+
$(call config_select,test_case.config,$(TC_ID),5) ) )
65+
@echo "Executing Benchmark"
66+
@$(BIN_DIR)/sdcbenchmark $(ARGS) | tee $(RES_FILE)
67+
$(eval TC_TABLE := $(subst $(space),\n,$(strip $(call config_column,test_case.config,3))))
68+
@echo "Writing Test Case file"
69+
@echo "testcase\\nOverall" > $(TC_FILE)
70+
@echo "$(TC_TABLE)" >> $(TC_FILE)
71+
72+
include ../Make.download
73+
74+
clean-build:
75+
@echo "Remove executables"
76+
rm -f $(BIN_DIR)/*
77+
78+
clean-result:
79+
@echo "Remove results"
80+
rm -f results/*
81+
82+
# Remove executables and results here, then clean the report directory.
# $(MAKE) -C is used instead of "cd visualize;make cleanall": with the latter a
# failed cd would run "make cleanall" in this directory again and recurse.
cleanall: clean-build clean-result
	@$(MAKE) -C visualize cleanall
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Benchmarking self-delimiting codes
2+
3+
## Methodology
4+
5+
Explored dimensions:
6+
7+
* self-delimiting code implementations
8+
* test cases
9+
* methods (`encoding`, `decoding`)
10+
11+
## Directory structure
12+
13+
* [bin](./bin): Contains the executables of the project.
14+
* [results](./results): Contains the results of the experiments.
15+
* [src](./src): Contains the source code of the benchmark.
16+
* [visualize](./visualize): Contains LaTeX files and a makefile for generating a report.
17+
18+
## Prerequisites
19+
20+
* To run the test on larger test cases (>= 200 MB), you should have at least 2 GB
21+
of free memory (some vectors have very poor compression).
22+
* For the visualization you need the following software:
23+
- [pdflatex][LT] to generate the pdf reports.
24+
- [pgfplots][PGFP] version 1.10 installed in [LT] to generate plots in pdf reports.
25+
26+
## Usage
27+
28+
* `make timing` compiles the programs, downloads or generates
29+
the test instances, builds the compression vectors,
30+
runs the performance tests and generates a report located at
31+
`visualize/self_delimiting_codes.pdf`. The raw numbers of the encoding / decoding
32+
rates and compression can be found in the file `results/result.csv`.
33+
The used test cases can be found in file `results/tc.csv`.
34+
The tested vectors can be found in file `results/vat.csv`.
35+
The default benchmark took about 6 hours on my machine (Asus P50IJ
36+
Pentium(R) Dual-Core CPU T4500 @ 2.30GHz 2GB).
37+
* All created binaries and test results can be deleted
38+
by calling `make cleanall`.
39+
40+
## Customization of the benchmark
41+
42+
The project contains several configuration files:
43+
44+
* [vectors.config][VCONFIG]: Specify different compression vectors and their used coders.
45+
* [test_case.config][TCCONFIG]: Specify test instances by ID, path, LaTeX-name
46+
for the report, and download URL.
47+
* [compile_options.config][CCONFIG]: Specify compile options by option string.
48+
49+
Note that the benchmark will execute every combination of vectors and test cases.
50+
51+
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
52+
[PGFP]: http://www.ctan.org/pkg/pgfplots "pgfplots"
53+
[VCONFIG]: ./vectors.config "vectors.config"
54+
[TCCONFIG]: ./test_case.config "test_case.config"
55+
[CCONFIG]: ./compile_options.config "compile_options.config"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Compile options
2+
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
#include <iostream>
2+
#include <fstream>
3+
#include <sdsl/suffix_arrays.hpp>
4+
#include <string>
5+
#include <vector>
6+
7+
using namespace sdsl;
8+
9+
//routine to save a vector in different formats, see lower implementations
10+
template<class INT_VECTOR, uint8_t num_byte>
11+
void saveVector(const INT_VECTOR &v, const char *dest);
12+
13+
//main function to generate MTF of BWT of an integer vector.
14+
// CSA_WT: used wavelet - tree - based suffix array implementation
15+
// INT_VECTOR: used integer vector for extracting BWT
16+
// num_byte: value indicating how result has to be opened / saved
17+
// srcfile: file from which to generate
18+
// destfile: file where to save result
19+
// tmpdir: directory used for temporary results
20+
// conf_bwt_key: key what is able to fetch bwt after suffix array construction
21+
template<class CSA_WT, class INT_VECTOR, uint8_t num_byte>
22+
void gen_bwt_mtf(const char *srcfile, const char *destfile, const char *tmpdir,
23+
const char *conf_bwt_key) {
24+
//utility for CSA generation
25+
cache_config cc(false, tmpdir, "gen_bwt_mtf_");
26+
INT_VECTOR bwt;
27+
28+
//create suffix array
29+
CSA_WT wt;
30+
construct(wt, srcfile, cc, num_byte);
31+
32+
//compute alphabet table from suffix array
33+
std::vector<uint64_t> alph_tbl( wt.sigma );
34+
for (uint64_t i = 0; i < wt.sigma; i++) {
35+
alph_tbl.push_back( wt.comp2char[i] );
36+
}
37+
38+
//fetch bwt
39+
load_from_file(bwt, cache_file_name(conf_bwt_key, cc));
40+
41+
//create mtf
42+
for (uint64_t i = 0; i < bwt.size(); i++) {
43+
uint64_t c = bwt[i];
44+
//find c in alphabet table and move it to front
45+
uint64_t j = 0;
46+
do {
47+
uint64_t tmp = alph_tbl[j];
48+
alph_tbl[j++] = c;
49+
c = tmp;
50+
} while (c != alph_tbl.front());
51+
//and write it's index to mtf transform of bwt
52+
bwt[i] = j-1;
53+
}
54+
55+
//save everything
56+
saveVector<INT_VECTOR, num_byte>( bwt, destfile );
57+
58+
//and free resources
59+
util::delete_all_files(cc.file_map);
60+
}
61+
62+
//functions for saving an integer vector in different formats
//generic version (raw output)
//  Writes every element of v as exactly num_byte bytes, least significant byte
//  first, to the file dest.
//  The elements are serialized one by one instead of dumping the backing
//  storage, so the output does not depend on how INT_VECTOR lays its elements
//  out in memory (a raw dump is only correct if each element occupies exactly
//  num_byte bytes). The stream is opened in binary mode so that no byte is
//  altered by newline translation.
//  v: vector to save; has to provide size() and operator[]
//  dest: path of the file to write (truncated if it exists)
template<class INT_VECTOR, uint8_t num_byte>
void saveVector(const INT_VECTOR &v, const char *dest) {
    std::ofstream out(dest, std::ios::binary);
    for (uint64_t i = 0; i < v.size(); i++) {
        const uint64_t value = v[i];
        for (uint8_t b = 0; b < num_byte; b++) {
            //bytes beyond the 64 bit of value are zero padding
            const unsigned char byte = (b < 8)
                ? static_cast<unsigned char>((value >> (8 * b)) & 0xFF)
                : static_cast<unsigned char>(0);
            out.put(static_cast<char>(byte));
        }
    }
}
69+
//serialization of integer vector
70+
template<>
71+
void saveVector<int_vector<>, 0>(const int_vector<> &v, const char *dest) {
72+
store_to_file(v, dest);
73+
}
74+
//decimal digits
75+
template<>
76+
void saveVector<int_vector<>, 'd'>(const int_vector<> &v, const char *dest) {
77+
std::ofstream out(dest);
78+
if (v.size()) out << v[0];
79+
for (uint64_t i = 1; i < v.size(); i++) {
80+
out << " " << v[i];
81+
}
82+
}
83+
84+
//main function
85+
int main(int argc, char* argv[]) {
86+
if (argc != 5) {
87+
std::cout<<"Usage: input_file output_file temp_dir num_byte" << std::endl;
88+
return 1;
89+
}
90+
std::cout << "Calculate MTF Transform of BWT of " << argv[1]
91+
<< " and store it to " << argv[2] << std::endl;
92+
93+
typedef csa_wt<> csa_wt_byte;
94+
typedef csa_wt<wt_int<>, 64, 64, sa_order_sa_sampling<>, int_vector<>, int_alphabet<>> csa_wt_int;
95+
96+
switch (argv[4][0]) {
97+
case 'd': //decimal digits
98+
gen_bwt_mtf<csa_wt_int, int_vector<>, 'd'>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT);
99+
return 0;
100+
case '0': //serialized integer vector
101+
gen_bwt_mtf<csa_wt_int, int_vector<>, 0>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT);
102+
return 0;
103+
case '1': //byte integer vector
104+
gen_bwt_mtf<csa_wt_byte, int_vector<8>, 1>(argv[1], argv[2], argv[3], conf::KEY_BWT);
105+
return 0;
106+
case '2': //2 byte integer vector
107+
gen_bwt_mtf<csa_wt_int, int_vector<>, 2>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT);
108+
return 0;
109+
case '4': //4 byte integer vector
110+
gen_bwt_mtf<csa_wt_int, int_vector<>, 4>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT);
111+
return 0;
112+
case '8': //8 byte integer vector
113+
gen_bwt_mtf<csa_wt_int, int_vector<>, 8>(argv[1], argv[2], argv[3], conf::KEY_BWT_INT);
114+
return 0;
115+
default:
116+
std::cout << "Illegal num_byte, allowed are 'd', 0, 1, 2, 4, 8" << std::endl;
117+
return 1;
118+
}
119+
}

0 commit comments

Comments
 (0)