Skip to content

Commit 927c8a5

Browse files
bjorngjhogberg
andcommitted
Optimize construction of little-endian segments
In Erlang/OTP 26 (in #6031), the JIT learned to optimize binary construction such as: <<A:16/big, B:32/big, C:16/big>> The optimization is done on the native-code level, but the idea behind it can be illustrated in Erlang by rewriting the construction as follows: Acc0 = A, Acc1 = (Acc0 bsl 32) bor B, Acc = (Acc1 bsl 16) bor C, <<Acc:64/big>> When done in native code, the values of the segments are accumulated into a CPU register, which is then written to memory. This is faster than writing each segment to memory one at a time, especially if the sizes are not byte-sized, as in the following example: <<A:6, B:6, C:6, D:6>> This commit introduces a similar optimization for little-endian integer segments. Example: <<A:16/little, B:32/little, C:16/little>> This expression can be rewritten as follows: Acc0 = C, Acc1 = (Acc0 bsl 32) bor B, Acc = (Acc1 bsl 16) bor A, <<Acc:64/little>> Note that this rewriting is only safe if all segments except the last one are byte-sized. Co-authored-by: John Högberg <[email protected]>
1 parent 69943fc commit 927c8a5

File tree

2 files changed

+155
-38
lines changed

2 files changed

+155
-38
lines changed

erts/emulator/beam/jit/beam_jit_bs.cpp

Lines changed: 123 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
#include "beam_jit_common.hpp"
2929
#include "beam_jit_bs.hpp"
3030

31+
#include <iterator>
32+
#include <numeric>
33+
3134
extern "C"
3235
{
3336
#include "beam_file.h"
@@ -86,70 +89,153 @@ std::vector<BscSegment> beam_jit_bsc_init(const Span<ArgVal> &args) {
8689
return segments;
8790
}
8891

92+
template<typename It>
93+
static auto fold_group(std::vector<BscSegment> &segs, It first, It last) {
94+
auto &back = segs.emplace_back(*first);
95+
96+
back.action = BscSegment::action::ACCUMULATE_FIRST;
97+
98+
return std::accumulate(std::next(first),
99+
last,
100+
back.effectiveSize,
101+
[&segs](Sint acc, const BscSegment &seg) {
102+
auto &back = segs.emplace_back(seg);
103+
104+
back.action = BscSegment::action::ACCUMULATE;
105+
106+
return acc + back.effectiveSize;
107+
});
108+
}
109+
110+
static void push_group(std::vector<BscSegment> &segs,
111+
std::vector<BscSegment>::const_iterator start,
112+
std::vector<BscSegment>::const_iterator end) {
113+
if (start < end) {
114+
auto groupSize = ((start->flags & BSF_LITTLE) != 0)
115+
? fold_group(segs,
116+
std::make_reverse_iterator(end),
117+
std::make_reverse_iterator(start))
118+
: fold_group(segs, start, end);
119+
120+
auto &seg = segs.emplace_back();
121+
122+
seg.type = am_integer;
123+
seg.action = BscSegment::action::STORE;
124+
seg.effectiveSize = groupSize;
125+
seg.flags = start->flags;
126+
}
127+
}
128+
129+
/*
130+
* Combine small segments into a group so that the values for the
131+
* segments can be combined into an accumulator register and then
132+
* written to memory. Here is an example in Erlang illustrating the
133+
* idea. Consider this binary construction example:
134+
*
135+
* <<A:16/big, B:32/big, C:16/big>>
136+
*
137+
* This can be rewritten as follows:
138+
*
139+
* Acc0 = A,
140+
* Acc1 = (Acc0 bsl 32) bor B,
141+
* Acc = (Acc1 bsl 16) bor C,
142+
* <<Acc:64/big>>
143+
*
144+
* Translated to native code, this is faster because the accumulating
145+
* is done in a CPU register, and then the result is written to memory.
146+
* For big-endian segments, this rewrite works even if sizes are not
147+
* byte-sized. For example:
148+
*
149+
* <<A:6, B:6, C:6, D:6>>
150+
*
151+
* Little-endian segments can be optimized in a similar way. Consider:
152+
*
153+
* <<A:16/little, B:32/little, C:16/little>>
154+
*
155+
* This can be rewritten like so:
156+
*
157+
* Acc0 = C,
158+
* Acc1 = (Acc0 bsl 32) bor B,
159+
* Acc = (Acc1 bsl 16) bor A,
160+
* <<Acc:64/little>>
161+
*
162+
* However, for little-endian segments, this rewriting will only work
163+
* if all segment sizes but the last one are byte-sized.
164+
*/
165+
89166
std::vector<BscSegment> beam_jit_bsc_combine_segments(
90167
const std::vector<BscSegment> segments) {
91168
std::vector<BscSegment> segs;
92169

93-
for (auto seg : segments) {
170+
auto group = segments.cend();
171+
Sint combinedSize = 0;
172+
173+
for (auto it = segments.cbegin(); it != segments.cend(); it++) {
174+
auto &seg = *it;
175+
94176
switch (seg.type) {
95177
case am_integer: {
96178
if (!(0 < seg.effectiveSize && seg.effectiveSize <= 64)) {
97179
/* Unknown or too large size. Handle using the default
98180
* DIRECT action. */
181+
push_group(segs, group, it);
182+
group = segments.cend();
183+
99184
segs.push_back(seg);
100185
continue;
101186
}
102187

103-
if (seg.flags & BSF_LITTLE || segs.size() == 0 ||
104-
segs.back().action == BscSegment::action::DIRECT) {
105-
/* There are no previous compatible ACCUMULATE / STORE
106-
* actions. Create the first ones. */
107-
seg.action = BscSegment::action::ACCUMULATE_FIRST;
108-
segs.push_back(seg);
109-
seg.action = BscSegment::action::STORE;
110-
segs.push_back(seg);
188+
/* The current segment has a known size not exceeding 64
189+
* bits. Try to add it to the current group. */
190+
if (group == segments.cend()) {
191+
group = it;
192+
193+
combinedSize = seg.effectiveSize;
111194
continue;
112195
}
113196

114-
auto prev = segs.back();
115-
if (prev.flags & BSF_LITTLE) {
116-
/* Little-endian segments cannot be combined with other
117-
* segments. Create new ACCUMULATE_FIRST / STORE actions. */
118-
seg.action = BscSegment::action::ACCUMULATE_FIRST;
119-
segs.push_back(seg);
120-
seg.action = BscSegment::action::STORE;
121-
segs.push_back(seg);
197+
/* There is already at least one segment in the group.
198+
* Append the current segment to the group only if it is
199+
* compatible and will fit. */
200+
201+
bool sameEndian =
202+
(seg.flags & BSF_LITTLE) == (group->flags & BSF_LITTLE);
203+
204+
/* Big-endian segments can always be grouped (if the size
205+
* does not exceed 64 bits). Little-endian segments can
206+
* only be grouped if all but the last segment are
207+
* byte-sized. */
208+
bool suitableSizes =
209+
((seg.flags & BSF_LITTLE) == 0 || combinedSize % 8 == 0);
210+
211+
if (sameEndian && combinedSize + seg.effectiveSize <= 64 &&
212+
suitableSizes) {
213+
combinedSize += seg.effectiveSize;
122214
continue;
123215
}
124216

125-
/* The current segment is compatible with the previous
126-
* segment. Try combining them. */
127-
if (prev.effectiveSize + seg.effectiveSize <= 64) {
128-
/* The combined values of the segments fit in the
129-
* accumulator. Insert an ACCUMULATE action for the
130-
* current segment before the pre-existing STORE
131-
* action. */
132-
segs.pop_back();
133-
prev.effectiveSize += seg.effectiveSize;
134-
seg.action = BscSegment::action::ACCUMULATE;
135-
segs.push_back(seg);
136-
segs.push_back(prev);
137-
} else {
138-
/* The size exceeds 64 bits. Can't combine. */
139-
seg.action = BscSegment::action::ACCUMULATE_FIRST;
140-
segs.push_back(seg);
141-
seg.action = BscSegment::action::STORE;
142-
segs.push_back(seg);
143-
}
217+
/*
218+
* Not possible to fit anything more into the group.
219+
* Flush the group and start a new group.
220+
*/
221+
push_group(segs, group, it);
222+
group = it;
223+
224+
combinedSize = seg.effectiveSize;
144225
break;
145226
}
146227
default:
228+
push_group(segs, group, it);
229+
group = segments.cend();
230+
147231
segs.push_back(seg);
148232
break;
149233
}
150234
}
151235

152-
/* Calculate bit offsets for each ACCUMULATE segment. */
236+
push_group(segs, group, segments.cend());
237+
238+
/* Calculate bit offsets for ACCUMULATE and STORE segments. */
153239

154240
Uint offset = 0;
155241
for (int i = segs.size() - 1; i >= 0; i--) {

erts/emulator/test/bs_construct_SUITE.erl

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,27 @@ l(I_13, I_big1) ->
212212
%% Test non-byte sizes and also that the value does not bleed
213213
%% into the previous segment.
214214
?T(<<17, I_big1:33>>, <<17, 197,49,128,73,1:1>>),
215-
?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>)
215+
?T(<<19, I_big1:39>>, <<19, 11,20,198,1,19:7>>),
216+
217+
%% Test multiple little-endian segments.
218+
?T(<<I_big1:16/little, I_13:24/little>>,
219+
[147,0,13,0,0]),
220+
?T(<<I_big1:13/little, I_13:3/little, I_big1:16/little>>,
221+
[147,5,147,0]),
222+
?T(<<I_big1:16/little, I_13:24/little, I_big1:80/little>>,
223+
[147,0,13,0,0,147,0,99,138,5,229,249,42,184,98]),
224+
?T(<<I_big1:48/little, (I_big1 bsr 17):16/little>>,
225+
[147,0,99,138,5,229,49,197]),
226+
?T(<<I_big1:16/little, (I_big1 bsr 13):16/little,
227+
(I_big1 bsr 15):16/little, (I_big1 bsr 23):16/little>>,
228+
[147,0,24,83,198,20,20,11]),
229+
?T(<<I_big1:24/little, (I_big1 bsr 11):16/little,
230+
(I_big1 bsr 18):16/little, (I_big1 bsr 26):32/little>>,
231+
[147,0,99,96,76,152,98,98,65,121,190]),
232+
?T(<<0:5,I_big1:16/little, I_13:3/little>>,
233+
[4,152,5]),
234+
?T(<<0:5,I_big1:16/little, (I_big1 bsr 15):19/little>>,
235+
[4,152,6,48,163])
216236
].
217237

218238
native_3798() ->
@@ -842,6 +862,7 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
842862
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:(128-Bef-N)/little>>,
843863

844864
if
865+
%% Test unusual units.
845866
Bef rem 8 =:= 0 ->
846867
Bin = <<Lpad:(Bef div 8)/little-unit:8,
847868
Int:N/little,Rpad:(128-Bef-N)/little>>;
@@ -851,6 +872,16 @@ dynamic_little(Bef, N, Int, Lpad, Rpad) ->
851872
(128-Bef-N) rem 17 =:= 0 ->
852873
Aft = (128 - Bef - N) div 17,
853874
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:Aft/little-unit:17>>;
875+
876+
%% Test combinations of little-integer segments of fixed size.
877+
Bef =:= 33, N =:= 45 ->
878+
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:50/little>>;
879+
Bef =:= 16, N =:= 40 ->
880+
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:72/little>>;
881+
Bef =:= 16, N =:= 48 ->
882+
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:64/little>>;
883+
Bef =:= 65, N =:= 32 ->
884+
Bin = <<Lpad:Bef/little,Int:N/little,Rpad:31/little>>;
854885
true ->
855886
ok
856887
end,

0 commit comments

Comments
 (0)