Summary
The classic Hacker’s Delight flp2 / clp2 smear networks are not
canonicalized to bit-scan forms on LLVM current head.
Both clang and rustc emit the expected portable IR shape, and llc generates
correct code, but the code stays as a long ladder of shifts and ORs. An
equivalent ctlz-based formulation lowers to much shorter lzcnt + shrx /
shlx code on x86_64 with BMI1/BMI2/LZCNT.
C repro
#include <stdint.h>
uint32_t flp2_hd(uint32_t x) {
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return x - (x >> 1);
}
uint32_t clp2_hd(uint32_t x) {
x -= 1;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return x + 1;
}Rust repro
#[unsafe(no_mangle)]
pub fn flp2_hd(mut x: u32) -> u32 {
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x - (x >> 1)
}
#[unsafe(no_mangle)]
pub fn clp2_hd(mut x: u32) -> u32 {
x -= 1;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x + 1
}Reduced LLVM IR repro
target triple = "x86_64-unknown-linux-gnu"
define i32 @flp2_hd(i32 %x) {
entry:
%a1 = lshr i32 %x, 1
%a2 = or i32 %x, %a1
%b1 = lshr i32 %a2, 2
%b2 = or i32 %a2, %b1
%c1 = lshr i32 %b2, 4
%c2 = or i32 %b2, %c1
%d1 = lshr i32 %c2, 8
%d2 = or i32 %c2, %d1
%e1 = lshr i32 %d2, 16
%e2 = or i32 %d2, %e1
%f = lshr i32 %e2, 1
%r = sub i32 %e2, %f
ret i32 %r
}
define i32 @clp2_hd(i32 %x) {
entry:
%xm1 = add i32 %x, -1
%a1 = lshr i32 %xm1, 1
%a2 = or i32 %xm1, %a1
%b1 = lshr i32 %a2, 2
%b2 = or i32 %a2, %b1
%c1 = lshr i32 %b2, 4
%c2 = or i32 %b2, %c1
%d1 = lshr i32 %c2, 8
%d2 = or i32 %c2, %d1
%e1 = lshr i32 %d2, 16
%e2 = or i32 %d2, %e1
%r = add i32 %e2, 1
ret i32 %r
}Optimized IR
opt -passes='default<O2>' leaves both networks intact:
define i32 @flp2_hd(i32 %x) {
entry:
%a1 = lshr i32 %x, 1
%a2 = or i32 %a1, %x
%b1 = lshr i32 %a2, 2
%b2 = or i32 %b1, %a2
%c1 = lshr i32 %b2, 4
%c2 = or i32 %c1, %b2
%d1 = lshr i32 %c2, 8
%d2 = or i32 %d1, %c2
%e1 = lshr i32 %d2, 16
%e2 = or i32 %e1, %d2
%f = lshr i32 %e2, 1
%r = sub i32 %e2, %f
ret i32 %r
}
define i32 @clp2_hd(i32 %x) {
entry:
%xm1 = add i32 %x, -1
%a1 = lshr i32 %xm1, 1
%a2 = or i32 %a1, %xm1
%b1 = lshr i32 %a2, 2
%b2 = or i32 %b1, %a2
%c1 = lshr i32 %b2, 4
%c2 = or i32 %c1, %b2
%d1 = lshr i32 %c2, 8
%d2 = or i32 %d1, %c2
%e1 = lshr i32 %d2, 16
%e2 = or i32 %e1, %d2
%r = add i32 %e2, 1
ret i32 %r
}Current x86_64 output
flp2_hd:
movl %edi, %eax
shrl %eax
orl %edi, %eax
movl %eax, %ecx
shrl $2, %ecx
orl %eax, %ecx
movl %ecx, %eax
shrl $4, %eax
orl %ecx, %eax
movl %eax, %ecx
shrl $8, %ecx
orl %eax, %ecx
movl %ecx, %eax
shrl $16, %eax
orl %ecx, %eax
movl %eax, %ecx
shrl %ecx
subl %ecx, %eax
retq
clp2_hd:
leal -1(%rdi), %eax
movl %eax, %ecx
shrl %ecx
orl %eax, %ecx
movl %ecx, %eax
shrl $2, %eax
orl %ecx, %eax
movl %eax, %ecx
shrl $4, %ecx
orl %eax, %ecx
movl %ecx, %edx
shrl $8, %edx
orl %ecx, %edx
movl %edx, %eax
shrl $16, %eax
orl %edx, %eax
incl %eax
retqBetter x86_64 shape from equivalent ctlz formulations
flp2:
lzcntl %edi, %eax
movl $-2147483648, %ecx
shrxl %eax, %ecx, %eax
cmovbl %edi, %eax
retqclp2:
decl %edi
movl $1, %ecx
lzcntl %edi, %eax
negb %al
shlxl %eax, %ecx, %eax
retqWhy this seems reportable
- clang and rustc both reach the same portable IR shape
- LLVM already recognizes nearby power-of-two idioms in this chapter
- there is a clear equivalent formulation that lowers much better on x86_64
The missing piece looks like canonicalization from the classic Hacker’s Delight smear networks to a bit-scan form.