Summary
The classic Hacker’s Delight parallel popcount network is not canonicalized to
llvm.ctpop on LLVM current head.
Both clang and rustc emit the expected arithmetic tree. opt -O2 leaves it
intact, and llc emits the full shift/add/and sequence rather than popcnt.
This is a strong canonicalization miss because LLVM already has ctpop and x86
has a direct popcnt instruction.
C repro
#include <stdint.h>
uint32_t pop32_hd(uint32_t x) {
x = x - ((x >> 1) & 0x55555555u);
x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
x = (x + (x >> 4)) & 0x0F0F0F0Fu;
x = x + (x >> 8);
x = x + (x >> 16);
return x & 0x3Fu;
}Rust repro
#[unsafe(no_mangle)]
pub fn pop32_hd(mut x: u32) -> u32 {
x = x - ((x >> 1) & 0x5555_5555);
x = (x & 0x3333_3333) + ((x >> 2) & 0x3333_3333);
x = (x + (x >> 4)) & 0x0f0f_0f0f;
x = x + (x >> 8);
x = x + (x >> 16);
x & 0x3f
}Reduced LLVM IR repro
target triple = "x86_64-unknown-linux-gnu"
define i32 @pop32_hd(i32 %x) {
entry:
%a = lshr i32 %x, 1
%b = and i32 %a, 1431655765
%c = sub i32 %x, %b
%d = and i32 %c, 858993459
%e = lshr i32 %c, 2
%f = and i32 %e, 858993459
%g = add i32 %d, %f
%h = lshr i32 %g, 4
%i = add i32 %g, %h
%j = and i32 %i, 252645135
%k = lshr i32 %j, 8
%l = add i32 %j, %k
%m = lshr i32 %l, 16
%n = add i32 %l, %m
%r = and i32 %n, 63
ret i32 %r
}Optimized IR
opt -passes='default<O2>' still leaves the arithmetic tree rather than
introducing llvm.ctpop.i32:
define range(i32 0, 64) i32 @pop32_hd(i32 %x) {
entry:
%a = lshr i32 %x, 1
%b = and i32 %a, 1431655765
%c = sub i32 %x, %b
%d = and i32 %c, 858993459
%e = lshr i32 %c, 2
%f = and i32 %e, 858993459
%g = add nuw nsw i32 %f, %d
%h = lshr i32 %g, 4
%i = add nuw nsw i32 %h, %g
%j = and i32 %i, 252645135
%k = lshr i32 %j, 8
%l = add nuw nsw i32 %k, %j
%m = lshr i32 %l, 16
%n = add nuw nsw i32 %m, %l
%r = and i32 %n, 63
ret i32 %r
}Current x86_64 output
pop32_hd:
movl %edi, %eax
shrl %eax
andl $1431655765, %eax
subl %eax, %edi
movl %edi, %eax
andl $858993459, %eax
shrl $2, %edi
andl $858993459, %edi
addl %eax, %edi
movl %edi, %eax
shrl $4, %eax
addl %edi, %eax
andl $252645135, %eax
movl %eax, %ecx
shrl $8, %ecx
addl %eax, %ecx
movl %ecx, %eax
shrl $16, %eax
addl %ecx, %eax
andl $63, %eax
retqIdeal x86_64 output
popcntl %edi, %eax
retqWhy this seems reportable
- clang and rustc both produce the same recognizable IR pattern
- LLVM already has a canonical
ctpopintrinsic - x86 has direct hardware support
This looks like a straightforward InstCombine canonicalization miss.