Summary

The classic Hacker’s Delight parallel popcount network is not canonicalized to llvm.ctpop on LLVM current head.

Both clang and rustc emit the expected arithmetic tree. opt -O2 leaves it intact, and llc emits the full shift/add/and sequence rather than popcnt. This is a strong canonicalization miss because LLVM already has ctpop and x86 has a direct popcnt instruction.

C repro

#include <stdint.h>
 
uint32_t pop32_hd(uint32_t x) {
  x = x - ((x >> 1) & 0x55555555u);
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
  x = (x + (x >> 4)) & 0x0F0F0F0Fu;
  x = x + (x >> 8);
  x = x + (x >> 16);
  return x & 0x3Fu;
}

Rust repro

#[unsafe(no_mangle)]
pub fn pop32_hd(mut x: u32) -> u32 {
    x = x - ((x >> 1) & 0x5555_5555);
    x = (x & 0x3333_3333) + ((x >> 2) & 0x3333_3333);
    x = (x + (x >> 4)) & 0x0f0f_0f0f;
    x = x + (x >> 8);
    x = x + (x >> 16);
    x & 0x3f
}

Reduced LLVM IR repro

target triple = "x86_64-unknown-linux-gnu"
 
define i32 @pop32_hd(i32 %x) {
entry:
  %a = lshr i32 %x, 1
  %b = and i32 %a, 1431655765
  %c = sub i32 %x, %b
  %d = and i32 %c, 858993459
  %e = lshr i32 %c, 2
  %f = and i32 %e, 858993459
  %g = add i32 %d, %f
  %h = lshr i32 %g, 4
  %i = add i32 %g, %h
  %j = and i32 %i, 252645135
  %k = lshr i32 %j, 8
  %l = add i32 %j, %k
  %m = lshr i32 %l, 16
  %n = add i32 %l, %m
  %r = and i32 %n, 63
  ret i32 %r
}

Optimized IR

opt -passes='default<O2>' still leaves the arithmetic tree rather than introducing llvm.ctpop.i32:

define range(i32 0, 64) i32 @pop32_hd(i32 %x) {
entry:
  %a = lshr i32 %x, 1
  %b = and i32 %a, 1431655765
  %c = sub i32 %x, %b
  %d = and i32 %c, 858993459
  %e = lshr i32 %c, 2
  %f = and i32 %e, 858993459
  %g = add nuw nsw i32 %f, %d
  %h = lshr i32 %g, 4
  %i = add nuw nsw i32 %h, %g
  %j = and i32 %i, 252645135
  %k = lshr i32 %j, 8
  %l = add nuw nsw i32 %k, %j
  %m = lshr i32 %l, 16
  %n = add nuw nsw i32 %m, %l
  %r = and i32 %n, 63
  ret i32 %r
}

Current x86_64 output

pop32_hd:
	movl	%edi, %eax
	shrl	%eax
	andl	$1431655765, %eax
	subl	%eax, %edi
	movl	%edi, %eax
	andl	$858993459, %eax
	shrl	$2, %edi
	andl	$858993459, %edi
	addl	%eax, %edi
	movl	%edi, %eax
	shrl	$4, %eax
	addl	%edi, %eax
	andl	$252645135, %eax
	movl	%eax, %ecx
	shrl	$8, %ecx
	addl	%eax, %ecx
	movl	%ecx, %eax
	shrl	$16, %eax
	addl	%ecx, %eax
	andl	$63, %eax
	retq

Ideal x86_64 output

popcntl	%edi, %eax
retq

Why this seems reportable

  • clang and rustc both produce the same recognizable IR pattern
  • LLVM already has a canonical ctpop intrinsic
  • x86 has direct hardware support

This looks like a straightforward InstCombine canonicalization miss.