Summary
The classic CRC inner-loop idioms based on a conditional polynomial XOR can be
implemented more efficiently on x86 by using the carry flag produced by a
1-bit shift and then materializing the mask with sbb reg, reg.
LLVM currently misses this in two common forms:
- right-shift CRC step: 6 instructions instead of 4
- left-shift CRC step: 5 instructions instead of 4
This is a target-specific x86 codegen / peephole issue.
Reproducer
define i32 @crc32_step_mask(i32 %crc) {
%low = and i32 %crc, 1
%mask = sub i32 0, %low
%poly = and i32 %mask, -307974906
%shr = lshr i32 %crc, 1
%r = xor i32 %shr, %poly
ret i32 %r
}
define i32 @crc32_step_left(i32 %crc) {
%sign = icmp slt i32 %crc, 0
%poly = select i1 %sign, i32 79764919, i32 0
%shl = shl i32 %crc, 1
%r = xor i32 %shl, %poly
ret i32 %r
}

Real-world Rust provenance from crc-rs
The local checkout of crc-rs has the actual leaf function in:
File: ~/crates/crc-rs/src/util.rs
pub const fn crc32(poly: u32, reflect: bool, mut value: u32) -> u32 {
if reflect {
let mut i = 0;
while i < 8 {
value = (value >> 1) ^ ((value & 1) * poly);
i += 1;
}
} else {
value <<= 24;
let mut i = 0;
while i < 8 {
value = (value << 1) ^ (((value >> 31) & 1) * poly);
i += 1;
}
}
value
}

To inspect codegen directly from the crate, I added these public wrappers in
src/lib.rs:
#[inline(never)]
pub fn crc32_step_reflect_ieee(value: u32) -> u32 {
util::crc32(0xedb8_8320, true, value)
}
#[inline(never)]
pub fn crc32_step_nonreflect_ieee(value: u32) -> u32 {
util::crc32(0x04c1_1db7, false, value)
}

and then used:
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_reflect_ieee --llvm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_reflect_ieee --asm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_nonreflect_ieee --llvm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_nonreflect_ieee --asm

Full LLVM IR from the actual crate
Reflected wrapper:
define noundef i32 @crc::crc32_step_reflect_ieee(i32 noundef %value) unnamed_addr #0 {
start:
%_7.i = lshr i32 %value, 1
%0 = trunc i32 %value to i1
%_9.i = select i1 %0, i32 -306674912, i32 0
%1 = xor i32 %_9.i, %_7.i
%_7.1.i = lshr i32 %1, 1
%2 = trunc i32 %_7.i to i1
%_9.1.i = select i1 %2, i32 -306674912, i32 0
%3 = xor i32 %_7.1.i, %_9.1.i
%_7.2.i = lshr i32 %3, 1
%4 = trunc i32 %_7.1.i to i1
%_9.2.i = select i1 %4, i32 -306674912, i32 0
%5 = xor i32 %_9.2.i, %_7.2.i
%_7.3.i = lshr i32 %5, 1
%6 = trunc i32 %_7.2.i to i1
%_9.3.i = select i1 %6, i32 -306674912, i32 0
%7 = xor i32 %_9.3.i, %_7.3.i
%_7.4.i = lshr i32 %7, 1
%8 = trunc i32 %_7.3.i to i1
%_9.4.i = select i1 %8, i32 -306674912, i32 0
%9 = xor i32 %_9.4.i, %_7.4.i
%_7.5.i = lshr i32 %9, 1
%10 = trunc i32 %_7.4.i to i1
%_9.5.i = select i1 %10, i32 -306674912, i32 0
%11 = xor i32 %_9.5.i, %_7.5.i
%_7.6.i = lshr i32 %11, 1
%12 = trunc i32 %_7.5.i to i1
%_9.6.i = select i1 %12, i32 -306674912, i32 0
%13 = xor i32 %_9.6.i, %_7.6.i
%_7.7.i = lshr i32 %13, 1
%14 = trunc i32 %_7.6.i to i1
%_16.7.i = select i1 %14, i32 -306674912, i32 0
%15 = xor i32 %_16.7.i, %_7.7.i
ret i32 %15
}

Non-reflected wrapper:
define noundef i32 @crc::crc32_step_nonreflect_ieee(i32 noundef %value) unnamed_addr #0 {
start:
%_14.i = shl i32 %value, 25
%.mask.i = and i32 %value, 128
%isneg.not.i = icmp eq i32 %.mask.i, 0
%_16.i = select i1 %isneg.not.i, i32 0, i32 79764919
%0 = xor i32 %_16.i, %_14.i
%_14.1.i = shl i32 %0, 1
%isneg.1.i = icmp slt i32 %_14.i, 0
%_16.1.i = select i1 %isneg.1.i, i32 79764919, i32 0
%1 = xor i32 %_16.1.i, %_14.1.i
%_14.2.i = shl i32 %1, 1
%isneg.2.i = icmp slt i32 %_14.1.i, 0
%_16.2.i = select i1 %isneg.2.i, i32 79764919, i32 0
%2 = xor i32 %_16.2.i, %_14.2.i
%_14.3.i = shl i32 %2, 1
%isneg.3.i = icmp slt i32 %_14.2.i, 0
%_16.3.i = select i1 %isneg.3.i, i32 79764919, i32 0
%3 = xor i32 %_16.3.i, %_14.3.i
%_14.4.i = shl i32 %3, 1
%isneg.4.i = icmp slt i32 %_14.3.i, 0
%_16.4.i = select i1 %isneg.4.i, i32 79764919, i32 0
%4 = xor i32 %_16.4.i, %_14.4.i
%_14.5.i = shl i32 %4, 1
%isneg.5.i = icmp slt i32 %_14.4.i, 0
%_16.5.i = select i1 %isneg.5.i, i32 79764919, i32 0
%5 = xor i32 %_16.5.i, %_14.5.i
%_14.6.i = shl i32 %5, 1
%isneg.6.i = icmp slt i32 %_14.5.i, 0
%_16.6.i = select i1 %isneg.6.i, i32 79764919, i32 0
%6 = xor i32 %_16.6.i, %_14.6.i
%_14.7.i = shl i32 %6, 1
%isneg.7.i = icmp slt i32 %_14.6.i, 0
%_16.7.i = select i1 %isneg.7.i, i32 79764919, i32 0
%7 = xor i32 %_16.7.i, %_14.7.i
ret i32 %7
}

The wrappers inline the real crc-rs body and unroll all 8 iterations, so this
is direct evidence from the crate rather than a synthetic standalone function.
Current x86_64 output from the actual crate
Reflected wrapper:
crc::crc32_step_reflect_ieee:
mov edx, edi
shr edx
xor eax, eax
test dil, 1
mov ecx, -306674912
mov esi, 0
cmovne esi, ecx
xor esi, edx
mov edx, esi
shr edx
bt edi, 1
mov edi, 0
cmovb edi, ecx
xor edi, edx
mov edx, edi
shr edx
bt esi, 1
mov esi, 0
cmovb esi, ecx
xor esi, edx
mov edx, esi
shr edx
bt edi, 1
mov edi, 0
cmovb edi, ecx
xor edi, edx
mov edx, edi
shr edx
bt esi, 1
mov esi, 0
cmovb esi, ecx
xor esi, edx
mov edx, esi
shr edx
bt edi, 1
mov edi, 0
cmovb edi, ecx
xor edi, edx
bt esi, 1
mov edx, 0
cmovb edx, ecx
mov esi, edi
shr esi, 2
shr edx
xor edx, esi
bt edi, 1
cmovb eax, ecx
xor eax, edx
ret

Non-reflected wrapper:
crc::crc32_step_nonreflect_ieee:
movsx ecx, dil
mov eax, edi
shl eax, 25
sar ecx, 7
and ecx, 79764919
xor ecx, eax
add ecx, ecx
sar eax, 31
and eax, 79764919
xor eax, ecx
add eax, eax
sar ecx, 31
and ecx, 79764919
xor ecx, eax
add ecx, ecx
sar eax, 31
and eax, 79764919
xor eax, ecx
add eax, eax
sar ecx, 31
and ecx, 79764919
xor ecx, eax
add ecx, ecx
sar eax, 31
and eax, 79764919
xor eax, ecx
add eax, eax
sar ecx, 31
and ecx, 79764919
xor ecx, eax
add ecx, ecx
sar eax, 31
and eax, 79764919
xor eax, ecx
ret

The important point is that the emitted code is still using select/mask-style
materialization for each step rather than the shorter flag-based shr/add/shl + sbb idiom.
Expected x86_64 output for the actual crate
Reflected wrapper:
Each of the 8 steps should be expressible as:
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax

So an ideal full unrolled shape is:
crc::crc32_step_reflect_ieee:
movl %edi, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
shrl %eax
sbbl %edx, %edx
andl $-306674912, %edx
xorl %edx, %eax
retq

Non-reflected wrapper:
The first step corresponds to the initial value <<= 24 followed by one
conditional polynomial XOR, so a good flag-based first step is:
shll $25, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax

and the remaining 7 steps are:
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax

So an ideal full unrolled shape is:
crc::crc32_step_nonreflect_ieee:
movl %edi, %eax
shll $25, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
addl %eax, %eax
sbbl %edx, %edx
andl $79764919, %edx
xorl %edx, %eax
retq

Current output
crc32_step_mask:
movl %edi, %eax
andl $1, %eax
negl %eax
andl $-307974906, %eax
shrl %edi
xorl %edi, %eax
retq
crc32_step_left:
leal (%rdi,%rdi), %eax
movl %edi, %ecx
sarl $31, %ecx
andl $79764919, %ecx
xorl %ecx, %eax
retq

Better output
Right-shift form:
shrl %edi
sbbl %eax, %eax
andl $-307974906, %eax
xorl %edi, %eax
retq

Left-shift form:
addl %edi, %edi
sbbl %eax, %eax
andl $79764919, %eax
xorl %edi, %eax
retq

Why this is valid
For the right-shift form:
- `shrl r, 1` sets CF to the original low bit
- `sbb eax, eax` turns that CF into 0 or -1
- masking with the polynomial produces poly or 0

For the left-shift form:
- `addl r, r` sets CF to the original sign bit
- `sbb eax, eax` again turns it into 0 or -1
- masking with the polynomial produces the conditional XOR mask
The local note already lays out the proof carefully.
Why this seems strong
- the idiom is very common in CRC code
- the x86 flag behavior gives a clear 4-instruction target
- current lowering misses it in both right- and left-shift forms
Likely fix area
- llvm/lib/Target/X86/X86ISelLowering.cpp
- DAG combines or peepholes around flag-producing shifts and sbb
Local references
- Source note: llvm-validation/ch14-crc/bug-crc-shr-sbb.md
- Harness: llvm-validation/ch14-crc/ch14_crc.ll