issue

Summary

The classic Hacker’s Delight flp2 / clp2 smear networks are not canonicalized to bit-scan forms on LLVM current head.

Both clang and rustc emit the expected portable IR shape, and llc generates correct code, but the code stays as a long ladder of shifts and ORs. An equivalent ctlz-based formulation lowers to much shorter lzcnt + shrx / shlx code on x86_64 with BMI1/BMI2/LZCNT.

C repro

#include <stdint.h>
 
uint32_t flp2_hd(uint32_t x) {
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x - (x >> 1);
}
 
uint32_t clp2_hd(uint32_t x) {
  x -= 1;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x + 1;
}

Rust repro

#[unsafe(no_mangle)]
pub fn flp2_hd(mut x: u32) -> u32 {
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x - (x >> 1)
}
 
#[unsafe(no_mangle)]
pub fn clp2_hd(mut x: u32) -> u32 {
    x -= 1;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x + 1
}

Reduced LLVM IR repro

target triple = "x86_64-unknown-linux-gnu"
 
define i32 @flp2_hd(i32 %x) {
entry:
  %a1 = lshr i32 %x, 1
  %a2 = or i32 %x, %a1
  %b1 = lshr i32 %a2, 2
  %b2 = or i32 %a2, %b1
  %c1 = lshr i32 %b2, 4
  %c2 = or i32 %b2, %c1
  %d1 = lshr i32 %c2, 8
  %d2 = or i32 %c2, %d1
  %e1 = lshr i32 %d2, 16
  %e2 = or i32 %d2, %e1
  %f = lshr i32 %e2, 1
  %r = sub i32 %e2, %f
  ret i32 %r
}
 
define i32 @clp2_hd(i32 %x) {
entry:
  %xm1 = add i32 %x, -1
  %a1 = lshr i32 %xm1, 1
  %a2 = or i32 %xm1, %a1
  %b1 = lshr i32 %a2, 2
  %b2 = or i32 %a2, %b1
  %c1 = lshr i32 %b2, 4
  %c2 = or i32 %b2, %c1
  %d1 = lshr i32 %c2, 8
  %d2 = or i32 %c2, %d1
  %e1 = lshr i32 %d2, 16
  %e2 = or i32 %d2, %e1
  %r = add i32 %e2, 1
  ret i32 %r
}

Optimized IR

opt -passes='default<O2>' leaves both networks intact:

define i32 @flp2_hd(i32 %x) {
entry:
  %a1 = lshr i32 %x, 1
  %a2 = or i32 %a1, %x
  %b1 = lshr i32 %a2, 2
  %b2 = or i32 %b1, %a2
  %c1 = lshr i32 %b2, 4
  %c2 = or i32 %c1, %b2
  %d1 = lshr i32 %c2, 8
  %d2 = or i32 %d1, %c2
  %e1 = lshr i32 %d2, 16
  %e2 = or i32 %e1, %d2
  %f = lshr i32 %e2, 1
  %r = sub i32 %e2, %f
  ret i32 %r
}
 
define i32 @clp2_hd(i32 %x) {
entry:
  %xm1 = add i32 %x, -1
  %a1 = lshr i32 %xm1, 1
  %a2 = or i32 %a1, %xm1
  %b1 = lshr i32 %a2, 2
  %b2 = or i32 %b1, %a2
  %c1 = lshr i32 %b2, 4
  %c2 = or i32 %c1, %b2
  %d1 = lshr i32 %c2, 8
  %d2 = or i32 %d1, %c2
  %e1 = lshr i32 %d2, 16
  %e2 = or i32 %e1, %d2
  %r = add i32 %e2, 1
  ret i32 %r
}

Current x86_64 output

flp2_hd:
	movl	%edi, %eax
	shrl	%eax
	orl	%edi, %eax
	movl	%eax, %ecx
	shrl	$2, %ecx
	orl	%eax, %ecx
	movl	%ecx, %eax
	shrl	$4, %eax
	orl	%ecx, %eax
	movl	%eax, %ecx
	shrl	$8, %ecx
	orl	%eax, %ecx
	movl	%ecx, %eax
	shrl	$16, %eax
	orl	%ecx, %eax
	movl	%eax, %ecx
	shrl	%ecx
	subl	%ecx, %eax
	retq
 
clp2_hd:
	leal	-1(%rdi), %eax
	movl	%eax, %ecx
	shrl	%ecx
	orl	%eax, %ecx
	movl	%ecx, %eax
	shrl	$2, %eax
	orl	%ecx, %eax
	movl	%eax, %ecx
	shrl	$4, %ecx
	orl	%eax, %ecx
	movl	%ecx, %edx
	shrl	$8, %edx
	orl	%ecx, %edx
	movl	%edx, %eax
	shrl	$16, %eax
	orl	%edx, %eax
	incl	%eax
	retq

Better x86_64 shape from equivalent `ctlz` formulations

flp2:

lzcntl	%edi, %eax
movl	$-2147483648, %ecx
shrxl	%eax, %ecx, %eax
cmovbl	%edi, %eax
retq

clp2:

decl	%edi
movl	$1, %ecx
lzcntl	%edi, %eax
negb	%al
shlxl	%eax, %ecx, %eax
retq

Why this seems reportable

clang and rustc both reach the same portable IR shape
LLVM already recognizes nearby power-of-two idioms in this chapter
there is a clear equivalent formulation that lowers much better on x86_64

The missing piece looks like canonicalization from the classic Hacker’s Delight smear networks to a bit-scan form.

Takashi's Notes

Explorer

issue

Summary

C repro

Rust repro

Reduced LLVM IR repro

Optimized IR

Current x86_64 output

Better x86_64 shape from equivalent `ctlz` formulations

Why this seems reportable

Graph View

Table of Contents

Takashi's Notes

Explorer

issue

Summary

C repro

Rust repro

Reduced LLVM IR repro

Optimized IR

Current x86_64 output

Better x86_64 shape from equivalent ctlz formulations

Why this seems reportable

Graph View

Table of Contents

Better x86_64 shape from equivalent `ctlz` formulations