Skip to content

Commit 1cbab34

Browse files
alexcrichton authored and bongjunj committed
x64: Use 8-bit jumps in pseudo-insts (bytecodealliance#11271)
* x64: Use 8-bit jumps in pseudo-insts Cranelift does not currently implement any form of "relaxation" of instructions where, for example, a 32-bit jump is shrunk to an 8-bit jump if the destination actually fits. In lieu of this Cranelift pessimistically emits 32-bit jumps on x64, for example, for all jumps between basic blocks. This is a difficult problem to solve in general but for pseudo-instructions it's a much more targeted problem which should be easier to solve. This commit updates all pseudo-instructions in the x64 backend to use 8-bit jumps instead of full 32-bit jumps within their code bodies. It's statically known that the instruction bodies being generated are all small enough to fit in 8 bits. This helps shrink the generated code for a number of instructions whenever a pseudo-inst is used instead of basic blocks. Optimizing jumps between basic blocks is left as a future optimization as it's likely to be much more difficult to implement than this. * Fix emit tests
1 parent 3fafbe0 commit 1cbab34

File tree

99 files changed

+995
-950
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

99 files changed

+995
-950
lines changed

cranelift/codegen/src/isa/x64/inst/emit.rs

Lines changed: 54 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ fn one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
3939
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 4);
4040
}
4141

42+
/// Like `one_way_jmp` but only used if the destination is <=127 bytes away.
43+
fn short_one_way_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
44+
let cond_start = sink.cur_offset();
45+
let cond_disp_off = cond_start + 1;
46+
sink.use_label_at_offset(cond_disp_off, label, LabelUse::JmpRel8);
47+
emit_short_jcc_no_offset(sink, cc);
48+
debug_assert_eq!(sink.cur_offset(), cond_disp_off + 1);
49+
}
50+
4251
/// Like `one_way_jmp` above emitting a conditional jump, but also using
4352
/// `MachBuffer::add_cond_branch`.
4453
fn cond_jmp(sink: &mut MachBuffer<Inst>, cc: CC, label: MachLabel) {
@@ -86,6 +95,34 @@ fn emit_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
8695
});
8796
}
8897

98+
fn emit_short_jcc_no_offset(sink: &mut MachBuffer<Inst>, cc: CC) {
99+
// See `emit_jcc_no_offset` above for comments about subtle mismatches in
100+
// `CC` and `jcc` naming.
101+
let inst: AsmInst = match cc {
102+
CC::Z => asm::inst::je_d8::new(0).into(),
103+
CC::NZ => asm::inst::jne_d8::new(0).into(),
104+
CC::B => asm::inst::jb_d8::new(0).into(),
105+
CC::NB => asm::inst::jae_d8::new(0).into(),
106+
CC::BE => asm::inst::jbe_d8::new(0).into(),
107+
CC::NBE => asm::inst::ja_d8::new(0).into(),
108+
CC::L => asm::inst::jl_d8::new(0).into(),
109+
CC::LE => asm::inst::jle_d8::new(0).into(),
110+
CC::NL => asm::inst::jge_d8::new(0).into(),
111+
CC::NLE => asm::inst::jg_d8::new(0).into(),
112+
CC::O => asm::inst::jo_d8::new(0).into(),
113+
CC::NO => asm::inst::jno_d8::new(0).into(),
114+
CC::P => asm::inst::jp_d8::new(0).into(),
115+
CC::NP => asm::inst::jnp_d8::new(0).into(),
116+
CC::S => asm::inst::js_d8::new(0).into(),
117+
CC::NS => asm::inst::jns_d8::new(0).into(),
118+
};
119+
inst.encode(&mut external::AsmCodeSink {
120+
sink,
121+
incoming_arg_offset: 0,
122+
slot_offset: 0,
123+
});
124+
}
125+
89126
/// Emits an unconditional branch.
90127
fn uncond_jmp(sink: &mut MachBuffer<Inst>, label: MachLabel) {
91128
let uncond_start = sink.cur_offset();
@@ -253,7 +290,7 @@ pub(crate) fn emit(
253290
// go to the `idiv`.
254291
let inst = Inst::cmp_mi_sxb(size, *divisor, -1);
255292
inst.emit(sink, info, state);
256-
one_way_jmp(sink, CC::NZ, do_op);
293+
short_one_way_jmp(sink, CC::NZ, do_op);
257294

258295
// ... otherwise the divisor is -1 and the result is always 0. This
259296
// is written to the destination register which will be %rax for
@@ -343,7 +380,7 @@ pub(crate) fn emit(
343380
let next = sink.get_label();
344381

345382
// Jump if cc is *not* set.
346-
one_way_jmp(sink, cc.invert(), next);
383+
short_one_way_jmp(sink, cc.invert(), next);
347384
Inst::gen_move(dst.map(|r| r.to_reg()), consequent.to_reg(), *ty)
348385
.emit(sink, info, state);
349386

@@ -421,7 +458,7 @@ pub(crate) fn emit(
421458
// jne .loop_start
422459
// TODO: Encoding the conditional jump as a short jump
423460
// could save us 4 bytes here.
424-
one_way_jmp(sink, CC::NZ, loop_start);
461+
short_one_way_jmp(sink, CC::NZ, loop_start);
425462

426463
// The regular prologue code is going to emit a `sub` after this, so we need to
427464
// reset the stack pointer
@@ -939,8 +976,8 @@ pub(crate) fn emit(
939976

940977
cmp_op.emit(sink, info, state);
941978

942-
one_way_jmp(sink, CC::NZ, do_min_max);
943-
one_way_jmp(sink, CC::P, propagate_nan);
979+
short_one_way_jmp(sink, CC::NZ, do_min_max);
980+
short_one_way_jmp(sink, CC::P, propagate_nan);
944981

945982
// Ordered and equal. The operands are bit-identical unless they are zero
946983
// and negative zero. These instructions merge the sign bits in that
@@ -957,7 +994,7 @@ pub(crate) fn emit(
957994
sink.bind_label(propagate_nan, state.ctrl_plane_mut());
958995
add_op.emit(sink, info, state);
959996

960-
one_way_jmp(sink, CC::P, done);
997+
short_one_way_jmp(sink, CC::P, done);
961998

962999
sink.bind_label(do_min_max, state.ctrl_plane_mut());
9631000
min_max_op.emit(sink, info, state);
@@ -1020,7 +1057,7 @@ pub(crate) fn emit(
10201057
// TODO use tst src, src here.
10211058
asm::inst::cmpq_mi_sxb::new(src, 0).emit(sink, info, state);
10221059

1023-
one_way_jmp(sink, CC::L, handle_negative);
1060+
short_one_way_jmp(sink, CC::L, handle_negative);
10241061

10251062
// Handle a positive int64, which is the "easy" case: a signed conversion will do the
10261063
// right thing.
@@ -1159,14 +1196,14 @@ pub(crate) fn emit(
11591196
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 1);
11601197
inst.emit(sink, info, state);
11611198

1162-
one_way_jmp(sink, CC::NO, done); // no overflow => done
1199+
short_one_way_jmp(sink, CC::NO, done); // no overflow => done
11631200

11641201
// Check for NaN.
11651202
cmp_op.emit(sink, info, state);
11661203

11671204
if *is_saturating {
11681205
let not_nan = sink.get_label();
1169-
one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
1206+
short_one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN
11701207

11711208
// For NaN, emit 0.
11721209
let inst: AsmInst = match *dst_size {
@@ -1194,7 +1231,7 @@ pub(crate) fn emit(
11941231
inst.emit(sink, info, state);
11951232

11961233
// Jump if >= to done.
1197-
one_way_jmp(sink, CC::NB, done);
1234+
short_one_way_jmp(sink, CC::NB, done);
11981235

11991236
// Otherwise, put INT_MAX.
12001237
if *dst_size == OperandSize::Size64 {
@@ -1387,12 +1424,12 @@ pub(crate) fn emit(
13871424
inst.emit(sink, info, state);
13881425

13891426
let handle_large = sink.get_label();
1390-
one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
1427+
short_one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold
13911428

13921429
if *is_saturating {
13931430
// If not NaN jump over this 0-return, otherwise return 0
13941431
let not_nan = sink.get_label();
1395-
one_way_jmp(sink, CC::NP, not_nan);
1432+
short_one_way_jmp(sink, CC::NP, not_nan);
13961433

13971434
xor_op(dst, dst).emit(sink, info, state);
13981435

@@ -1413,7 +1450,7 @@ pub(crate) fn emit(
14131450
let inst = Inst::cmp_mi_sxb(*dst_size, Gpr::unwrap_new(dst.to_reg()), 0);
14141451
inst.emit(sink, info, state);
14151452

1416-
one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
1453+
short_one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done
14171454

14181455
if *is_saturating {
14191456
// The input was "small" (< 2**(width -1)), so the only way to get an integer
@@ -1448,7 +1485,7 @@ pub(crate) fn emit(
14481485

14491486
if *is_saturating {
14501487
let next_is_large = sink.get_label();
1451-
one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
1488+
short_one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large
14521489

14531490
// The input was "large" (>= 2**(width -1)), so the only way to get an integer
14541491
// overflow is because the input was too large: saturate to the max value.
@@ -1633,7 +1670,7 @@ pub(crate) fn emit(
16331670
inst.emit(sink, info, state);
16341671

16351672
// jnz again
1636-
one_way_jmp(sink, CC::NZ, again_label);
1673+
short_one_way_jmp(sink, CC::NZ, again_label);
16371674
}
16381675

16391676
Inst::Atomic128RmwSeq {
@@ -1753,7 +1790,7 @@ pub(crate) fn emit(
17531790
.emit(sink, info, state);
17541791

17551792
// jnz again
1756-
one_way_jmp(sink, CC::NZ, again_label);
1793+
short_one_way_jmp(sink, CC::NZ, again_label);
17571794
}
17581795

17591796
Inst::Atomic128XchgSeq {
@@ -1793,7 +1830,7 @@ pub(crate) fn emit(
17931830
.emit(sink, info, state);
17941831

17951832
// jnz again
1796-
one_way_jmp(sink, CC::NZ, again_label);
1833+
short_one_way_jmp(sink, CC::NZ, again_label);
17971834
}
17981835

17991836
Inst::ElfTlsGetAddr { symbol, dst } => {

cranelift/codegen/src/isa/x64/inst/emit_tests.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ fn test_x64_emit() {
292292
temp: w_r11.map(Gpr::unwrap_new),
293293
dst_old: w_rax.map(Gpr::unwrap_new),
294294
},
295-
"490FB6014989C34D0BDAF0450FB0190F85EFFFFFFF",
295+
"490FB6014989C34D0BDAF0450FB01975F3",
296296
"atomically { 8_bits_at_[%r9] Or= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }",
297297
));
298298
insns.push((
@@ -304,7 +304,7 @@ fn test_x64_emit() {
304304
temp: w_r11.map(Gpr::unwrap_new),
305305
dst_old: w_rax.map(Gpr::unwrap_new)
306306
},
307-
"490FB7014989C34D23DAF066450FB1190F85EEFFFFFF",
307+
"490FB7014989C34D23DAF066450FB11975F2",
308308
"atomically { 16_bits_at_[%r9] And= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
309309
));
310310
insns.push((
@@ -316,7 +316,7 @@ fn test_x64_emit() {
316316
temp: w_r11.map(Gpr::unwrap_new),
317317
dst_old: w_rax.map(Gpr::unwrap_new)
318318
},
319-
"418B014989C34D23DA49F7D3F0450FB1190F85ECFFFFFF",
319+
"418B014989C34D23DA49F7D3F0450FB11975F0",
320320
"atomically { 32_bits_at_[%r9] Nand= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
321321
));
322322
insns.push((
@@ -328,7 +328,7 @@ fn test_x64_emit() {
328328
temp: w_r11.map(Gpr::unwrap_new),
329329
dst_old: w_rax.map(Gpr::unwrap_new)
330330
},
331-
"418B014989C34539DA4D0F46DAF0450FB1190F85EBFFFFFF",
331+
"418B014989C34539DA4D0F46DAF0450FB11975EF",
332332
"atomically { 32_bits_at_[%r9] Umin= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
333333
));
334334
insns.push((
@@ -340,7 +340,7 @@ fn test_x64_emit() {
340340
temp: w_r11.map(Gpr::unwrap_new),
341341
dst_old: w_rax.map(Gpr::unwrap_new)
342342
},
343-
"498B014989C34D39DA4D0F4DDAF04D0FB1190F85EBFFFFFF",
343+
"498B014989C34D39DA4D0F4DDAF04D0FB11975EF",
344344
"atomically { 64_bits_at_[%r9] Smax= %r10; %rax = old_value_at_[%r9]; %r11, %rflags = trash }"
345345
));
346346

@@ -356,7 +356,7 @@ fn test_x64_emit() {
356356
dst_old_low: w_rax.map(Gpr::unwrap_new),
357357
dst_old_high: w_rdx.map(Gpr::unwrap_new),
358358
},
359-
"498B01498B51084889C34889D1490BDA490BCBF0490FC7090F85E9FFFFFF",
359+
"498B01498B51084889C34889D1490BDA490BCBF0490FC70975ED",
360360
"atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Or %r11:%r10; 0(%r9) = %rcx:%rbx }",
361361
));
362362
insns.push((
@@ -370,7 +370,7 @@ fn test_x64_emit() {
370370
dst_old_low: w_rax.map(Gpr::unwrap_new),
371371
dst_old_high: w_rdx.map(Gpr::unwrap_new),
372372
},
373-
"498B01498B51084889C34889D14923DA4923CBF0490FC7090F85E9FFFFFF",
373+
"498B01498B51084889C34889D14923DA4923CBF0490FC70975ED",
374374
"atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax And %r11:%r10; 0(%r9) = %rcx:%rbx }"
375375
));
376376
insns.push((
@@ -384,7 +384,7 @@ fn test_x64_emit() {
384384
dst_old_low: w_rax.map(Gpr::unwrap_new),
385385
dst_old_high: w_rdx.map(Gpr::unwrap_new),
386386
},
387-
"498B01498B51084889C34889D14C39D3491BCB4889D1490F43DA490F43CBF0490FC7090F85DEFFFFFF",
387+
"498B01498B51084889C34889D14C39D3491BCB4889D1490F43DA490F43CBF0490FC70975E2",
388388
"atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Umin %r11:%r10; 0(%r9) = %rcx:%rbx }"
389389
));
390390
insns.push((
@@ -398,7 +398,7 @@ fn test_x64_emit() {
398398
dst_old_low: w_rax.map(Gpr::unwrap_new),
399399
dst_old_high: w_rdx.map(Gpr::unwrap_new),
400400
},
401-
"498B01498B51084889C34889D14903DA4913CBF0490FC7090F85E9FFFFFF",
401+
"498B01498B51084889C34889D14903DA4913CBF0490FC70975ED",
402402
"atomically { %rdx:%rax = 0(%r9); %rcx:%rbx = %rdx:%rax Add %r11:%r10; 0(%r9) = %rcx:%rbx }"
403403
));
404404
insns.push((
@@ -409,7 +409,7 @@ fn test_x64_emit() {
409409
dst_old_low: w_rax.map(Gpr::unwrap_new),
410410
dst_old_high: w_rdx.map(Gpr::unwrap_new),
411411
},
412-
"498B01498B5108F0490FC7090F85F5FFFFFF",
412+
"498B01498B5108F0490FC70975F9",
413413
"atomically { %rdx:%rax = 0(%r9); 0(%r9) = %rcx:%rbx }",
414414
));
415415

cranelift/codegen/src/isa/x64/inst/mod.rs

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,14 @@ pub enum LabelUse {
15621562
/// next instruction (so the size of the payload -- 4 bytes -- is subtracted from the payload).
15631563
JmpRel32,
15641564

1565+
/// An 8-bit offset from location of relocation itself, added to the
1566+
/// existing value at that location.
1567+
///
1568+
/// Used for control flow instructions which consider an offset from the
1569+
/// start of the next instruction (so the size of the payload -- 1 byte --
1570+
/// is subtracted from the payload).
1571+
JmpRel8,
1572+
15651573
/// A 32-bit offset from location of relocation itself, added to the existing value at that
15661574
/// location.
15671575
PCRel32,
@@ -1573,18 +1581,21 @@ impl MachInstLabelUse for LabelUse {
15731581
fn max_pos_range(self) -> CodeOffset {
15741582
match self {
15751583
LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x7fff_ffff,
1584+
LabelUse::JmpRel8 => 0x7f,
15761585
}
15771586
}
15781587

15791588
fn max_neg_range(self) -> CodeOffset {
15801589
match self {
15811590
LabelUse::JmpRel32 | LabelUse::PCRel32 => 0x8000_0000,
1591+
LabelUse::JmpRel8 => 0x80,
15821592
}
15831593
}
15841594

15851595
fn patch_size(self) -> CodeOffset {
15861596
match self {
15871597
LabelUse::JmpRel32 | LabelUse::PCRel32 => 4,
1598+
LabelUse::JmpRel8 => 1,
15881599
}
15891600
}
15901601

@@ -1599,6 +1610,9 @@ impl MachInstLabelUse for LabelUse {
15991610
let value = pc_rel.wrapping_add(addend).wrapping_sub(4);
16001611
buffer.copy_from_slice(&value.to_le_bytes()[..]);
16011612
}
1613+
LabelUse::JmpRel8 => {
1614+
buffer[0] = buffer[0].wrapping_add(pc_rel as u8).wrapping_sub(1);
1615+
}
16021616
LabelUse::PCRel32 => {
16031617
let addend = u32::from_le_bytes([buffer[0], buffer[1], buffer[2], buffer[3]]);
16041618
let value = pc_rel.wrapping_add(addend);
@@ -1610,12 +1624,21 @@ impl MachInstLabelUse for LabelUse {
16101624
fn supports_veneer(self) -> bool {
16111625
match self {
16121626
LabelUse::JmpRel32 | LabelUse::PCRel32 => false,
1627+
1628+
// Technically this is possible to have a veneer because it can jump
1629+
// to a 32-bit jump which keeps going. That being said at this time
1630+
// this variant is only used in `emit.rs` for jumps that are already
1631+
// known to be short so it's a bug if we jump to a jump that's too
1632+
// far away. In the future if general-purpose basic-block
1633+
// terminators are switched to using short jumps to get promoted to
1634+
// a long jump then this may wish to change.
1635+
LabelUse::JmpRel8 => false,
16131636
}
16141637
}
16151638

16161639
fn veneer_size(self) -> CodeOffset {
16171640
match self {
1618-
LabelUse::JmpRel32 | LabelUse::PCRel32 => 0,
1641+
LabelUse::JmpRel32 | LabelUse::PCRel32 | LabelUse::JmpRel8 => 0,
16191642
}
16201643
}
16211644

@@ -1625,7 +1648,7 @@ impl MachInstLabelUse for LabelUse {
16251648

16261649
fn generate_veneer(self, _: &mut [u8], _: CodeOffset) -> (CodeOffset, LabelUse) {
16271650
match self {
1628-
LabelUse::JmpRel32 | LabelUse::PCRel32 => {
1651+
LabelUse::JmpRel32 | LabelUse::PCRel32 | LabelUse::JmpRel8 => {
16291652
panic!("Veneer not supported for JumpRel32 label-use.");
16301653
}
16311654
}

cranelift/filetests/filetests/isa/x64/cmp-mem-bug.clif

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,9 @@ block0(v0: f64, v1: i64):
8181
; movzbq %dil, %rax
8282
; ucomisd %xmm1, %xmm0
8383
; movdqa %xmm0, %xmm2
84-
; jnp 0x2a
84+
; jnp 0x26
8585
; movaps %xmm2, %xmm0
86-
; je 0x33
86+
; je 0x2b
8787
; movaps %xmm2, %xmm0
8888
; movq %rbp, %rsp
8989
; popq %rbp

0 commit comments

Comments
 (0)