Skip to content

Commit 4c825d6

Browse files
authored
[Store] Import Offset Allocator (#641)
Import a new offset allocator to improve the performance and allow merging the released memory.
1 parent 7e1ef1a commit 4c825d6

File tree

9 files changed

+2287
-2
lines changed

9 files changed

+2287
-2
lines changed

mooncake-store/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@ include_directories(
2020

2121
# Add subdirectories
2222
add_subdirectory(src)
23-
add_subdirectory(tests)
23+
add_subdirectory(tests)
24+
add_subdirectory(benchmarks)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Add allocator benchmark executable
2+
# The allocator_bench target is built from the single source file
# allocator_bench.cpp located next to this CMakeLists.txt.
add_executable(allocator_bench allocator_bench.cpp)
3+
# PRIVATE: mooncake_store is used to build and link the benchmark itself and
# is not propagated as a usage requirement of allocator_bench.
target_link_libraries(allocator_bench PRIVATE mooncake_store)
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#include <iostream>
2+
#include <random>
3+
#include <vector>
4+
#include <chrono>
5+
#include <algorithm>
6+
#include <numeric>
7+
#include <iomanip>
8+
9+
#include "offset_allocator/offset_allocator.hpp"
10+
11+
using namespace mooncake::offset_allocator;
12+
13+
class OffsetAllocatorBenchHelper {
14+
public:
15+
OffsetAllocatorBenchHelper(uint64_t baseAddress, uint32_t poolSize, uint32_t maxAllocs)
16+
: pool_size_(poolSize),
17+
allocated_size_(0),
18+
allocator_(OffsetAllocator::create(baseAddress, poolSize, maxAllocs)),
19+
rd_(),
20+
gen_(rd_()) {}
21+
22+
void allocate(uint32_t size) {
23+
while (true) {
24+
auto handle = allocator_->allocate(size);
25+
if (handle.has_value()) {
26+
allocated_.push_back(std::move(*handle));
27+
allocated_sizes_.push_back(size);
28+
allocated_size_ += size;
29+
break;
30+
}
31+
if (allocated_.size() == 0) {
32+
break;
33+
}
34+
std::uniform_int_distribution<uint32_t> dist(0,
35+
allocated_.size() - 1);
36+
auto index = dist(gen_);
37+
std::swap(allocated_[index], allocated_.back());
38+
std::swap(allocated_sizes_[index], allocated_sizes_.back());
39+
allocated_size_ -= allocated_sizes_.back();
40+
allocated_.pop_back();
41+
allocated_sizes_.pop_back();
42+
}
43+
}
44+
45+
double get_allocated_ratio() const {
46+
return static_cast<double>(allocated_size_) / pool_size_;
47+
}
48+
49+
private:
50+
uint64_t pool_size_;
51+
uint64_t allocated_size_;
52+
std::shared_ptr<OffsetAllocator> allocator_;
53+
std::vector<OffsetAllocationHandle> allocated_;
54+
std::vector<uint32_t> allocated_sizes_;
55+
std::random_device rd_;
56+
std::mt19937 gen_;
57+
};
58+
59+
// Benchmarks repeated allocations of a single fixed size.
//
// For every size in a fixed list (powers of four from 32 up to 2^25, then the
// same values minus 17, plus 17, times 0.9 and times 1.1) a fresh BenchHelper
// is created, filled with pool_size / alloc_size warm-up allocations, and then
// timed over one million further allocations. For each size one line with the
// minimum and average utilization ratio and the integer ns per allocation is
// printed to std::cout.
//
// BenchHelper must be constructible from (base address, pool size, max
// allocations) and provide allocate(size) and get_allocated_ratio().
template <typename BenchHelper>
void uniform_size_allocation_benchmark() {
    std::cout << std::endl << "=== Uniform Size Allocation Benchmark ===" << std::endl;
    const size_t max_pool_size = 2ull * 1024 * 1024 * 1024;

    // Build the size list group by group; every group walks the same base
    // sequence 32, 128, 512, ... and applies one adjustment to it.
    std::vector<uint32_t> allocation_sizes;
    const auto append_group = [&allocation_sizes](auto adjust) {
        for (uint32_t base = 32; base < (1 << 26); base *= 4) {
            allocation_sizes.push_back(adjust(base));
        }
    };
    append_group([](uint32_t base) -> uint32_t { return base; });
    append_group([](uint32_t base) -> uint32_t { return base - 17; });
    append_group([](uint32_t base) -> uint32_t { return base + 17; });
    // The scaled groups truncate the floating-point product towards zero.
    append_group([](uint32_t base) -> uint32_t { return base * 0.9; });
    append_group([](uint32_t base) -> uint32_t { return base * 1.1; });

    for (const uint32_t alloc_size : allocation_sizes) {
        // Small sizes get a pool 16 times smaller so that the warm-up phase
        // does not make the benchmark run too slowly.
        size_t pool_size = max_pool_size;
        if (alloc_size < 1024) {
            pool_size = max_pool_size / 16;
        }
        const size_t max_allocs = pool_size / alloc_size + 10;
        BenchHelper bench_helper(0x1000, pool_size, max_allocs);

        // Warm-up: request as many objects as nominally fit into the pool.
        const int warmup_num = pool_size / alloc_size;
        for (int done = 0; done < warmup_num; ++done) {
            bench_helper.allocate(alloc_size);
        }

        // Timed section.
        const auto start_time = std::chrono::high_resolution_clock::now();
        double min_util_ratio = 1.0;
        double total_util_ratio = 0.0;
        const int benchmark_num = 1000000;
        for (int iter = 0; iter < benchmark_num; ++iter) {
            bench_helper.allocate(alloc_size);
            const double util_ratio = bench_helper.get_allocated_ratio();
            min_util_ratio = std::min(min_util_ratio, util_ratio);
            total_util_ratio += util_ratio;
        }
        const auto end_time = std::chrono::high_resolution_clock::now();
        const auto duration =
            std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time);
        // End of timed section.

        const double avg_util_ratio = total_util_ratio / benchmark_num;
        // The per-allocation time is an integer division, so fractions of a
        // nanosecond are dropped.
        std::cout << "Alloc size: " << alloc_size
                  << ", min util ratio: " << min_util_ratio
                  << ", avg util ratio: " << avg_util_ratio
                  << ", time: " << duration.count() / benchmark_num << " ns" << std::endl;
    }
}
115+
116+
// Benchmarks allocations whose sizes are drawn uniformly from
// [min_alloc_size, max_alloc_size] = [1024, 2^26].
//
// A single BenchHelper over a 2 GiB-sized pool is warmed up until the sum of
// the requested sizes reaches the pool size, then one million further
// allocations are timed while the utilization ratio after each one is
// recorded. Utilization statistics and the average time per allocation are
// printed to std::cout.
//
// BenchHelper must be constructible from (base address, pool size, max
// allocations) and provide allocate(size) and get_allocated_ratio().
template <typename BenchHelper>
void random_size_allocation_benchmark() {
    std::cout << std::endl << "=== Random Size Allocation Benchmark ===" << std::endl;
    const size_t pool_size = 2ull * 1024 * 1024 * 1024;
    const size_t max_alloc_size = 1ull << 26;
    const size_t min_alloc_size = 1024;

    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_int_distribution<uint32_t> dist(min_alloc_size, max_alloc_size);

    // Warm-up. The running total counts requested sizes, regardless of what
    // the helper had to evict to satisfy a request.
    const size_t max_allocs = pool_size / min_alloc_size + 10;
    BenchHelper bench_helper(0x1000, pool_size, max_allocs);
    size_t warmup_size = 0;
    while (warmup_size < pool_size) {
        const size_t alloc_size = dist(gen);
        bench_helper.allocate(alloc_size);
        warmup_size += alloc_size;
    }

    const int benchmark_num = 1000000;
    std::vector<double> util_ratios;
    util_ratios.reserve(benchmark_num);

    // Timed section.
    const auto start_time = std::chrono::high_resolution_clock::now();
    for (int iter = 0; iter < benchmark_num; ++iter) {
        const size_t alloc_size = dist(gen);
        bench_helper.allocate(alloc_size);
        util_ratios.push_back(bench_helper.get_allocated_ratio());
    }
    const auto end_time = std::chrono::high_resolution_clock::now();

    // Average wall-clock time per allocation, in (fractional) nanoseconds.
    const auto elapsed_ns =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count();
    const double avg_time_ns = elapsed_ns / static_cast<double>(benchmark_num);

    std::sort(util_ratios.begin(), util_ratios.end());

    // Picks the sample located at the given fraction of the ascending order.
    const auto ratio_at = [&util_ratios](double fraction) {
        return util_ratios[util_ratios.size() * fraction];
    };

    const double min_util = util_ratios.front();
    const double max_util = util_ratios.back();
    const double p50 = ratio_at(0.50);
    // p90 / p99 are read from the LOW end of the sorted ratios (10% and 1%
    // positions): they are the utilization that 90% / 99% of the samples
    // reach or exceed.
    const double p90 = ratio_at(0.10);
    const double p99 = ratio_at(0.01);

    double util_sum = 0.0;
    for (const double ratio : util_ratios) {
        util_sum += ratio;
    }
    const double mean_util = util_sum / util_ratios.size();

    std::cout << std::fixed << std::setprecision(6);
    std::cout << "util ratio (min / p99 / p90 / p50 / max / avg): " << min_util
              << " / " << p99 << " / " << p90 << " / " << p50 << " / "
              << max_util << " / " << mean_util << std::endl;
    std::cout << "avg alloc time: " << avg_time_ns << " ns/op" << std::endl;
}
174+
175+
int main() {
176+
std::cout << "=== OffsetAllocator Benchmark ===" << std::endl;
177+
uniform_size_allocation_benchmark<OffsetAllocatorBenchHelper>();
178+
random_size_allocation_benchmark<OffsetAllocatorBenchHelper>();
179+
}
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
# Allocator Memory Utilization Benchmark
2+
3+
## Execution
4+
5+
```bash
6+
./mooncake-store/benchmarks/allocator_bench
7+
```
8+
9+
## Result
10+
11+
- alloc size: The size of each object
12+
- utilization ratio: The total allocated size / total space
13+
- time: time in nanoseconds for each object allocation
14+
- OffsetAllocator optimization: whether the allocated size is rounded up to a bin size
15+
16+
### Uniform size, size equals power of 2
17+
18+
**OffsetAllocator (After Optimization)**
19+
20+
```
21+
Alloc size: 32, min util ratio: 1, avg util ratio: 1, time: 544 ns
22+
Alloc size: 128, min util ratio: 1, avg util ratio: 1, time: 417 ns
23+
Alloc size: 512, min util ratio: 1, avg util ratio: 1, time: 174 ns
24+
Alloc size: 2048, min util ratio: 1, avg util ratio: 1, time: 406 ns
25+
Alloc size: 8192, min util ratio: 1, avg util ratio: 1, time: 180 ns
26+
Alloc size: 32768, min util ratio: 1, avg util ratio: 1, time: 133 ns
27+
Alloc size: 131072, min util ratio: 1, avg util ratio: 1, time: 109 ns
28+
Alloc size: 524288, min util ratio: 1, avg util ratio: 1, time: 100 ns
29+
Alloc size: 2097152, min util ratio: 1, avg util ratio: 1, time: 99 ns
30+
Alloc size: 8388608, min util ratio: 1, avg util ratio: 1, time: 99 ns
31+
Alloc size: 33554432, min util ratio: 1, avg util ratio: 1, time: 98 ns
32+
```
33+
34+
**OffsetAllocator (Before Optimization)**
35+
36+
```
37+
Alloc size: 32, min util ratio: 1, avg util ratio: 1, time: 539 ns
38+
Alloc size: 128, min util ratio: 1, avg util ratio: 1, time: 419 ns
39+
Alloc size: 512, min util ratio: 1, avg util ratio: 1, time: 217 ns
40+
Alloc size: 2048, min util ratio: 1, avg util ratio: 1, time: 408 ns
41+
Alloc size: 8192, min util ratio: 1, avg util ratio: 1, time: 175 ns
42+
Alloc size: 32768, min util ratio: 1, avg util ratio: 1, time: 130 ns
43+
Alloc size: 131072, min util ratio: 1, avg util ratio: 1, time: 107 ns
44+
Alloc size: 524288, min util ratio: 1, avg util ratio: 1, time: 99 ns
45+
Alloc size: 2097152, min util ratio: 1, avg util ratio: 1, time: 100 ns
46+
Alloc size: 8388608, min util ratio: 1, avg util ratio: 1, time: 98 ns
47+
Alloc size: 33554432, min util ratio: 1, avg util ratio: 1, time: 98 ns
48+
```
49+
50+
### Uniform size, size equals power of 2 +/- 17
51+
52+
**OffsetAllocator (After Optimization)**
53+
54+
```
55+
Alloc size: 15, min util ratio: 1, avg util ratio: 1, time: 568 ns
56+
Alloc size: 111, min util ratio: 0.991071, avg util ratio: 0.991071, time: 441 ns
57+
Alloc size: 495, min util ratio: 0.966797, avg util ratio: 0.966797, time: 178 ns
58+
Alloc size: 2031, min util ratio: 0.991699, avg util ratio: 0.991699, time: 418 ns
59+
Alloc size: 8175, min util ratio: 0.997925, avg util ratio: 0.997925, time: 170 ns
60+
Alloc size: 32751, min util ratio: 0.999481, avg util ratio: 0.999481, time: 133 ns
61+
Alloc size: 131055, min util ratio: 0.99987, avg util ratio: 0.99987, time: 109 ns
62+
Alloc size: 524271, min util ratio: 0.999968, avg util ratio: 0.999968, time: 100 ns
63+
Alloc size: 2097135, min util ratio: 0.999992, avg util ratio: 0.999992, time: 99 ns
64+
Alloc size: 8388591, min util ratio: 0.999998, avg util ratio: 0.999998, time: 98 ns
65+
Alloc size: 33554415, min util ratio: 0.999999, avg util ratio: 0.999999, time: 99 ns
66+
Alloc size: 49, min util ratio: 0.942308, avg util ratio: 0.942308, time: 508 ns
67+
Alloc size: 145, min util ratio: 0.906249, avg util ratio: 0.906249, time: 372 ns
68+
Alloc size: 529, min util ratio: 0.918399, avg util ratio: 0.918399, time: 172 ns
69+
Alloc size: 2065, min util ratio: 0.896267, avg util ratio: 0.896267, time: 403 ns
70+
Alloc size: 8209, min util ratio: 0.89073, avg util ratio: 0.89073, time: 174 ns
71+
Alloc size: 32785, min util ratio: 0.889347, avg util ratio: 0.889347, time: 131 ns
72+
Alloc size: 131089, min util ratio: 0.88897, avg util ratio: 0.88897, time: 105 ns
73+
Alloc size: 524305, min util ratio: 0.888701, avg util ratio: 0.888701, time: 102 ns
74+
Alloc size: 2097169, min util ratio: 0.888679, avg util ratio: 0.888679, time: 100 ns
75+
Alloc size: 8388625, min util ratio: 0.886721, avg util ratio: 0.886721, time: 100 ns
76+
Alloc size: 33554449, min util ratio: 0.875, avg util ratio: 0.875, time: 100 ns
77+
```
78+
79+
**OffsetAllocator (Before Optimization)**
80+
81+
```
82+
Alloc size: 15, min util ratio: 1, avg util ratio: 1, time: 566 ns
83+
Alloc size: 111, min util ratio: 0.669866, avg util ratio: 0.710845, time: 703 ns
84+
Alloc size: 495, min util ratio: 0.665779, avg util ratio: 0.676874, time: 238 ns
85+
Alloc size: 2031, min util ratio: 0.668333, avg util ratio: 0.705411, time: 637 ns
86+
Alloc size: 8175, min util ratio: 0.666175, avg util ratio: 0.676474, time: 242 ns
87+
Alloc size: 32751, min util ratio: 0.664435, avg util ratio: 0.669078, time: 168 ns
88+
Alloc size: 131055, min util ratio: 0.66062, avg util ratio: 0.667341, time: 124 ns
89+
Alloc size: 524271, min util ratio: 0.653055, avg util ratio: 0.666993, time: 118 ns
90+
Alloc size: 2097135, min util ratio: 0.64062, avg util ratio: 0.666873, time: 116 ns
91+
Alloc size: 8388591, min util ratio: 0.605468, avg util ratio: 0.667812, time: 115 ns
92+
Alloc size: 33554415, min util ratio: 0.5625, avg util ratio: 0.670944, time: 116 ns
93+
Alloc size: 49, min util ratio: 0.692229, avg util ratio: 0.753062, time: 1122 ns
94+
Alloc size: 145, min util ratio: 0.667789, avg util ratio: 0.700907, time: 572 ns
95+
Alloc size: 529, min util ratio: 0.66577, avg util ratio: 0.676238, time: 238 ns
96+
Alloc size: 2065, min util ratio: 0.667926, avg util ratio: 0.704884, time: 632 ns
97+
Alloc size: 8209, min util ratio: 0.665708, avg util ratio: 0.676372, time: 239 ns
98+
Alloc size: 32785, min util ratio: 0.664224, avg util ratio: 0.669058, time: 168 ns
99+
Alloc size: 131089, min util ratio: 0.659631, avg util ratio: 0.667287, time: 129 ns
100+
Alloc size: 524305, min util ratio: 0.652609, avg util ratio: 0.666884, time: 122 ns
101+
Alloc size: 2097169, min util ratio: 0.638677, avg util ratio: 0.666516, time: 120 ns
102+
Alloc size: 8388625, min util ratio: 0.60547, avg util ratio: 0.665131, time: 121 ns
103+
Alloc size: 33554449, min util ratio: 0.546875, avg util ratio: 0.660917, time: 120 ns
104+
```
105+
106+
### Uniform size, size equals power of 2 multiplied by 0.9 or 1.1
107+
108+
**OffsetAllocator (After Optimization)**
109+
110+
```
111+
Alloc size: 28, min util ratio: 1, avg util ratio: 1, time: 543 ns
112+
Alloc size: 115, min util ratio: 0.958333, avg util ratio: 0.958333, time: 418 ns
113+
Alloc size: 460, min util ratio: 0.958332, avg util ratio: 0.958332, time: 189 ns
114+
Alloc size: 1843, min util ratio: 0.959896, avg util ratio: 0.959896, time: 418 ns
115+
Alloc size: 7372, min util ratio: 0.959895, avg util ratio: 0.959895, time: 197 ns
116+
Alloc size: 29491, min util ratio: 0.959993, avg util ratio: 0.959993, time: 135 ns
117+
Alloc size: 117964, min util ratio: 0.959979, avg util ratio: 0.959979, time: 111 ns
118+
Alloc size: 471859, min util ratio: 0.959985, avg util ratio: 0.959985, time: 100 ns
119+
Alloc size: 1887436, min util ratio: 0.959765, avg util ratio: 0.959765, time: 99 ns
120+
Alloc size: 7549747, min util ratio: 0.959766, avg util ratio: 0.959766, time: 99 ns
121+
Alloc size: 30198988, min util ratio: 0.95625, avg util ratio: 0.95625, time: 99 ns
122+
Alloc size: 35, min util ratio: 0.972222, avg util ratio: 0.972222, time: 531 ns
123+
Alloc size: 140, min util ratio: 0.972222, avg util ratio: 0.972222, time: 397 ns
124+
Alloc size: 563, min util ratio: 0.977427, avg util ratio: 0.977427, time: 180 ns
125+
Alloc size: 2252, min util ratio: 0.97743, avg util ratio: 0.97743, time: 389 ns
126+
Alloc size: 9011, min util ratio: 0.977752, avg util ratio: 0.977752, time: 183 ns
127+
Alloc size: 36044, min util ratio: 0.977752, avg util ratio: 0.977752, time: 133 ns
128+
Alloc size: 144179, min util ratio: 0.977739, avg util ratio: 0.977739, time: 106 ns
129+
Alloc size: 576716, min util ratio: 0.977538, avg util ratio: 0.977538, time: 103 ns
130+
Alloc size: 2306867, min util ratio: 0.977539, avg util ratio: 0.977539, time: 99 ns
131+
Alloc size: 9227468, min util ratio: 0.975391, avg util ratio: 0.975391, time: 99 ns
132+
Alloc size: 36909875, min util ratio: 0.9625, avg util ratio: 0.9625, time: 100 ns
133+
```
134+
135+
**OffsetAllocator (Before Optimization)**
136+
137+
```
138+
Alloc size: 28, min util ratio: 1, avg util ratio: 1, time: 539 ns
139+
Alloc size: 115, min util ratio: 0.669299, avg util ratio: 0.709245, time: 701 ns
140+
Alloc size: 460, min util ratio: 0.665825, avg util ratio: 0.677532, time: 255 ns
141+
Alloc size: 1843, min util ratio: 0.669352, avg util ratio: 0.709202, time: 691 ns
142+
Alloc size: 7372, min util ratio: 0.66619, avg util ratio: 0.677401, time: 260 ns
143+
Alloc size: 29491, min util ratio: 0.664311, avg util ratio: 0.669511, time: 172 ns
144+
Alloc size: 117964, min util ratio: 0.661812, avg util ratio: 0.667356, time: 133 ns
145+
Alloc size: 471859, min util ratio: 0.654345, avg util ratio: 0.667048, time: 123 ns
146+
Alloc size: 1887436, min util ratio: 0.640722, avg util ratio: 0.666447, time: 121 ns
147+
Alloc size: 7549747, min util ratio: 0.611719, avg util ratio: 0.666847, time: 119 ns
148+
Alloc size: 30198988, min util ratio: 0.548437, avg util ratio: 0.669799, time: 125 ns
149+
Alloc size: 35, min util ratio: 0.7098, avg util ratio: 0.774162, time: 1306 ns
150+
Alloc size: 140, min util ratio: 0.667934, avg util ratio: 0.702151, time: 599 ns
151+
Alloc size: 563, min util ratio: 0.665599, avg util ratio: 0.675548, time: 239 ns
152+
Alloc size: 2252, min util ratio: 0.667371, avg util ratio: 0.701623, time: 601 ns
153+
Alloc size: 9011, min util ratio: 0.665485, avg util ratio: 0.675528, time: 244 ns
154+
Alloc size: 36044, min util ratio: 0.663248, avg util ratio: 0.668912, time: 170 ns
155+
Alloc size: 144179, min util ratio: 0.660308, avg util ratio: 0.666934, time: 127 ns
156+
Alloc size: 576716, min util ratio: 0.654467, avg util ratio: 0.66679, time: 122 ns
157+
Alloc size: 2306867, min util ratio: 0.633789, avg util ratio: 0.666159, time: 121 ns
158+
Alloc size: 9227468, min util ratio: 0.597266, avg util ratio: 0.666037, time: 118 ns
159+
Alloc size: 36909875, min util ratio: 0.55, avg util ratio: 0.669564, time: 121 ns
160+
```
161+
162+
### Random Size
163+
164+
**OffsetAllocator (After Optimization)**
165+
166+
```
167+
util ratio (min / p99 / p90 / p50 / max / avg): 0.544250 / 0.713338 / 0.779739 / 0.847867 / 0.952591 / 0.841576
168+
avg alloc time: 145.575738 ns/op
169+
```
170+
171+
**OffsetAllocator (Before Optimization)**
172+
173+
```
174+
util ratio (min / p99 / p90 / p50 / max / avg): 0.569255 / 0.712076 / 0.781224 / 0.855046 / 0.976057 / 0.848873
175+
avg alloc time: 142.508508 ns/op
176+
```

0 commit comments

Comments
 (0)