Summary

On x86, the classic CRC inner-loop idiom — a conditional XOR with the polynomial — can be implemented more efficiently than a select/cmov or sign-extension sequence: a 1-bit shift deposits the conditioning bit into the carry flag, and sbb reg, reg then materializes the 0 / -1 mask from CF.

LLVM currently misses this in two common forms:

  • right-shift CRC step: 6 instructions instead of 4
  • left-shift CRC step: 5 instructions instead of 4

This is a target-specific x86 codegen / peephole issue.

Reproducer

define i32 @crc32_step_mask(i32 %crc) {
  %low   = and i32 %crc, 1
  %mask  = sub i32 0, %low
  %poly  = and i32 %mask, -307974906
  %shr   = lshr i32 %crc, 1
  %r     = xor i32 %shr, %poly
  ret i32 %r
}
 
define i32 @crc32_step_left(i32 %crc) {
  %sign  = icmp slt i32 %crc, 0
  %poly  = select i1 %sign, i32 79764919, i32 0
  %shl   = shl i32 %crc, 1
  %r     = xor i32 %shl, %poly
  ret i32 %r
}

Real-world Rust provenance from crc-rs

The local checkout of crc-rs has the actual leaf function in:

File: ~/crates/crc-rs/src/util.rs

pub const fn crc32(poly: u32, reflect: bool, mut value: u32) -> u32 {
    if reflect {
        let mut i = 0;
        while i < 8 {
            value = (value >> 1) ^ ((value & 1) * poly);
            i += 1;
        }
    } else {
        value <<= 24;
 
        let mut i = 0;
        while i < 8 {
            value = (value << 1) ^ (((value >> 31) & 1) * poly);
            i += 1;
        }
    }
    value
}

To inspect codegen directly from the crate, I added these public wrappers in src/lib.rs:

#[inline(never)]
pub fn crc32_step_reflect_ieee(value: u32) -> u32 {
    util::crc32(0xedb8_8320, true, value)
}
 
#[inline(never)]
pub fn crc32_step_nonreflect_ieee(value: u32) -> u32 {
    util::crc32(0x04c1_1db7, false, value)
}

and then used:

cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_reflect_ieee --llvm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_reflect_ieee --asm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_nonreflect_ieee --llvm
cargo asm --manifest-path ~/crates/crc-rs/Cargo.toml --lib crc::crc32_step_nonreflect_ieee --asm

Full LLVM IR from the actual crate

Reflected wrapper:

define noundef i32 @crc::crc32_step_reflect_ieee(i32 noundef %value) unnamed_addr #0 {
start:
  %_7.i = lshr i32 %value, 1
  %0 = trunc i32 %value to i1
  %_9.i = select i1 %0, i32 -306674912, i32 0
  %1 = xor i32 %_9.i, %_7.i
  %_7.1.i = lshr i32 %1, 1
  %2 = trunc i32 %_7.i to i1
  %_9.1.i = select i1 %2, i32 -306674912, i32 0
  %3 = xor i32 %_7.1.i, %_9.1.i
  %_7.2.i = lshr i32 %3, 1
  %4 = trunc i32 %_7.1.i to i1
  %_9.2.i = select i1 %4, i32 -306674912, i32 0
  %5 = xor i32 %_9.2.i, %_7.2.i
  %_7.3.i = lshr i32 %5, 1
  %6 = trunc i32 %_7.2.i to i1
  %_9.3.i = select i1 %6, i32 -306674912, i32 0
  %7 = xor i32 %_9.3.i, %_7.3.i
  %_7.4.i = lshr i32 %7, 1
  %8 = trunc i32 %_7.3.i to i1
  %_9.4.i = select i1 %8, i32 -306674912, i32 0
  %9 = xor i32 %_9.4.i, %_7.4.i
  %_7.5.i = lshr i32 %9, 1
  %10 = trunc i32 %_7.4.i to i1
  %_9.5.i = select i1 %10, i32 -306674912, i32 0
  %11 = xor i32 %_9.5.i, %_7.5.i
  %_7.6.i = lshr i32 %11, 1
  %12 = trunc i32 %_7.5.i to i1
  %_9.6.i = select i1 %12, i32 -306674912, i32 0
  %13 = xor i32 %_9.6.i, %_7.6.i
  %_7.7.i = lshr i32 %13, 1
  %14 = trunc i32 %_7.6.i to i1
  %_16.7.i = select i1 %14, i32 -306674912, i32 0
  %15 = xor i32 %_16.7.i, %_7.7.i
  ret i32 %15
}

Non-reflected wrapper:

define noundef i32 @crc::crc32_step_nonreflect_ieee(i32 noundef %value) unnamed_addr #0 {
start:
  %_14.i = shl i32 %value, 25
  %.mask.i = and i32 %value, 128
  %isneg.not.i = icmp eq i32 %.mask.i, 0
  %_16.i = select i1 %isneg.not.i, i32 0, i32 79764919
  %0 = xor i32 %_16.i, %_14.i
  %_14.1.i = shl i32 %0, 1
  %isneg.1.i = icmp slt i32 %_14.i, 0
  %_16.1.i = select i1 %isneg.1.i, i32 79764919, i32 0
  %1 = xor i32 %_16.1.i, %_14.1.i
  %_14.2.i = shl i32 %1, 1
  %isneg.2.i = icmp slt i32 %_14.1.i, 0
  %_16.2.i = select i1 %isneg.2.i, i32 79764919, i32 0
  %2 = xor i32 %_16.2.i, %_14.2.i
  %_14.3.i = shl i32 %2, 1
  %isneg.3.i = icmp slt i32 %_14.2.i, 0
  %_16.3.i = select i1 %isneg.3.i, i32 79764919, i32 0
  %3 = xor i32 %_16.3.i, %_14.3.i
  %_14.4.i = shl i32 %3, 1
  %isneg.4.i = icmp slt i32 %_14.3.i, 0
  %_16.4.i = select i1 %isneg.4.i, i32 79764919, i32 0
  %4 = xor i32 %_16.4.i, %_14.4.i
  %_14.5.i = shl i32 %4, 1
  %isneg.5.i = icmp slt i32 %_14.4.i, 0
  %_16.5.i = select i1 %isneg.5.i, i32 79764919, i32 0
  %5 = xor i32 %_16.5.i, %_14.5.i
  %_14.6.i = shl i32 %5, 1
  %isneg.6.i = icmp slt i32 %_14.5.i, 0
  %_16.6.i = select i1 %isneg.6.i, i32 79764919, i32 0
  %6 = xor i32 %_16.6.i, %_14.6.i
  %_14.7.i = shl i32 %6, 1
  %isneg.7.i = icmp slt i32 %_14.6.i, 0
  %_16.7.i = select i1 %isneg.7.i, i32 79764919, i32 0
  %7 = xor i32 %_16.7.i, %_14.7.i
  ret i32 %7
}

The wrappers inline the real crc-rs body and unroll all 8 iterations, so this is direct evidence from the crate rather than a synthetic standalone function.

Current x86_64 output from the actual crate

Reflected wrapper:

crc::crc32_step_reflect_ieee:
	mov edx, edi
	shr edx
	xor eax, eax
	test dil, 1
	mov ecx, -306674912
	mov esi, 0
	cmovne esi, ecx
	xor esi, edx
	mov edx, esi
	shr edx
	bt edi, 1
	mov edi, 0
	cmovb edi, ecx
	xor edi, edx
	mov edx, edi
	shr edx
	bt esi, 1
	mov esi, 0
	cmovb esi, ecx
	xor esi, edx
	mov edx, esi
	shr edx
	bt edi, 1
	mov edi, 0
	cmovb edi, ecx
	xor edi, edx
	mov edx, edi
	shr edx
	bt esi, 1
	mov esi, 0
	cmovb esi, ecx
	xor esi, edx
	mov edx, esi
	shr edx
	bt edi, 1
	mov edi, 0
	cmovb edi, ecx
	xor edi, edx
	bt esi, 1
	mov edx, 0
	cmovb edx, ecx
	mov esi, edi
	shr esi, 2
	shr edx
	xor edx, esi
	bt edi, 1
	cmovb eax, ecx
	xor eax, edx
	ret

Non-reflected wrapper:

crc::crc32_step_nonreflect_ieee:
	movsx ecx, dil
	mov eax, edi
	shl eax, 25
	sar ecx, 7
	and ecx, 79764919
	xor ecx, eax
	add ecx, ecx
	sar eax, 31
	and eax, 79764919
	xor eax, ecx
	add eax, eax
	sar ecx, 31
	and ecx, 79764919
	xor ecx, eax
	add ecx, ecx
	sar eax, 31
	and eax, 79764919
	xor eax, ecx
	add eax, eax
	sar ecx, 31
	and ecx, 79764919
	xor ecx, eax
	add ecx, ecx
	sar eax, 31
	and eax, 79764919
	xor eax, ecx
	add eax, eax
	sar ecx, 31
	and ecx, 79764919
	xor ecx, eax
	add ecx, ecx
	sar eax, 31
	and eax, 79764919
	xor eax, ecx
	ret

The important point is that each step still materializes the conditional polynomial through cmov-based selects (reflected wrapper) or arithmetic sign-extension masking (non-reflected wrapper), rather than using the shorter flag-based idiom in which the shift itself (shrl, addl, or shll) sets CF and a following sbbl materializes the mask.

Expected x86_64 output for the actual crate

Reflected wrapper:

Each of the 8 steps should be expressible as:

shrl  %eax
sbbl  %edx, %edx
andl  $-306674912, %edx
xorl  %edx, %eax

So an ideal full unrolled shape is:

crc::crc32_step_reflect_ieee:
	movl  %edi, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
 
	shrl  %eax
	sbbl  %edx, %edx
	andl  $-306674912, %edx
	xorl  %edx, %eax
	retq

Non-reflected wrapper:

The first step corresponds to the initial value <<= 24 followed by one conditional polynomial XOR, so a good flag-based first step is:

shll  $25, %eax
sbbl  %edx, %edx
andl  $79764919, %edx
xorl  %edx, %eax

and the remaining 7 steps are:

addl  %eax, %eax
sbbl  %edx, %edx
andl  $79764919, %edx
xorl  %edx, %eax

So an ideal full unrolled shape is:

crc::crc32_step_nonreflect_ieee:
	movl  %edi, %eax
 
	shll  $25, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
 
	addl  %eax, %eax
	sbbl  %edx, %edx
	andl  $79764919, %edx
	xorl  %edx, %eax
	retq

Current output for the standalone reproducer

crc32_step_mask:
    movl  %edi, %eax
    andl  $1, %eax
    negl  %eax
    andl  $-307974906, %eax
    shrl  %edi
    xorl  %edi, %eax
    retq
 
crc32_step_left:
    leal  (%rdi,%rdi), %eax
    movl  %edi, %ecx
    sarl  $31, %ecx
    andl  $79764919, %ecx
    xorl  %ecx, %eax
    retq

Better output for the standalone reproducer

Right-shift form:

    shrl  %edi
    sbbl  %eax, %eax
    andl  $-307974906, %eax
    xorl  %edi, %eax
    retq

Left-shift form:

    addl  %edi, %edi
    sbbl  %eax, %eax
    andl  $79764919, %eax
    xorl  %edi, %eax
    retq

Why this is valid

For the right-shift form:

  • shrl r, 1 sets CF to the original low bit
  • sbb eax, eax turns that CF into 0 or -1
  • masking with the polynomial produces poly or 0

For the left-shift form:

  • addl r, r sets CF to the original sign bit
  • sbb eax, eax again turns it into 0 or -1
  • masking with the polynomial produces the conditional XOR mask

The correctness argument is written out in full in the local note bug-crc-shr-sbb.md (see Local references below).

Why this seems strong

  • the idiom is very common in CRC code
  • the x86 flag behavior gives a clear 4-instruction target
  • current lowering misses it in both right- and left-shift forms

Likely fix area

  • llvm/lib/Target/X86/X86ISelLowering.cpp
  • DAG combines or peepholes around flag-producing shifts and sbb

Local references

  • Source note: llvm-validation/ch14-crc/bug-crc-shr-sbb.md
  • Harness: llvm-validation/ch14-crc/ch14_crc.ll