issue

Summary

Current LLVM head does not select the natural x86 scalar high-half multiply idiom for portable IR that computes the upper 32 bits of a 32x32-bit product.

Both clang and rustc emit the same reduced IR:

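unsigned: zext i32 -> i64, mul i64, lshr 32, trunc to i32
signed: sext i32 -> i64, mul i64, ashr 32 (rewritten to lshr 32 by opt, since the trunc discards the only bits where the two shifts differ), trunc to i32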

llc keeps the widened 64-bit imulq shape and extracts the high half with a shift, instead of using the implicit %edx:%eax result of the one-operand 32-bit mull / imull.

C repro

#include <stdint.h>
 
/* Returns the upper 32 bits of the full 64-bit unsigned product a * b. */
uint32_t mul_hi_u32(uint32_t a, uint32_t b) {
  /* Widen both operands before multiplying so the product is computed in
     64 bits, then shift the high half down and narrow back to 32 bits. */
  return (uint32_t)(((uint64_t)a * (uint64_t)b) >> 32);
}
 
/* Returns the upper 32 bits of the full 64-bit signed product a * b. */
int32_t mul_hi_s32(int32_t a, int32_t b) {
  /* Sign-extend both operands, multiply in 64 bits, then keep bits 63:32.
     NOTE: right-shifting a negative signed value is implementation-defined
     in ISO C; this repro relies on an arithmetic shift. */
  return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}

Rust repro

/// Returns the upper 32 bits of the full 64-bit unsigned product `a * b`.
#[unsafe(no_mangle)]
pub fn mul_hi_u32(a: u32, b: u32) -> u32 {
    // Widen to u64 before multiplying so the 64-bit product cannot overflow,
    // then shift the high half down and truncate back to u32.
    (((a as u64) * (b as u64)) >> 32) as u32
}
 
/// Returns the upper 32 bits of the full 64-bit signed product `a * b`.
#[unsafe(no_mangle)]
pub fn mul_hi_s32(a: i32, b: i32) -> i32 {
    // `as i64` sign-extends; `>>` on i64 is an arithmetic shift, so the
    // truncated result is bits 63:32 of the signed product.
    (((a as i64) * (b as i64)) >> 32) as i32
}

Reduced LLVM IR repro

target triple = "x86_64-unknown-linux-gnu"
 
; Unsigned high-half multiply: returns bits 63:32 of zext(%a) * zext(%b).
define i32 @mul_hi_u32(i32 %a, i32 %b) {
entry:
  ; Zero-extend both 32-bit operands so the multiply is done in 64 bits.
  %a64 = zext i32 %a to i64
  %b64 = zext i32 %b to i64
  %prod = mul i64 %a64, %b64
  ; Move the high half into the low 32 bits, then narrow.
  %hi = lshr i64 %prod, 32
  %r = trunc i64 %hi to i32
  ret i32 %r
}
 
; Signed high-half multiply: returns bits 63:32 of sext(%a) * sext(%b).
define i32 @mul_hi_s32(i32 %a, i32 %b) {
entry:
  ; Sign-extend both 32-bit operands so the multiply is done in 64 bits.
  %a64 = sext i32 %a to i64
  %b64 = sext i32 %b to i64
  %prod = mul nsw i64 %a64, %b64
  ; Arithmetic shift brings the high half down; the trunc keeps only those
  ; 32 bits, so the sign bits shifted in above them are discarded.
  %hi = ashr i64 %prod, 32
  %r = trunc i64 %hi to i32
  ret i32 %r
}

Optimized IR

opt -passes='default<O2>' keeps the widened multiply form:

; opt output for the unsigned case. Relative to the input IR, opt added a
; return range attribute (half-open: the result is never 0xFFFFFFFF),
; swapped the multiply operands, and added nuw flags. The
; zext / mul i64 / lshr 32 / trunc shape itself is unchanged.
define range(i32 0, -1) i32 @mul_hi_u32(i32 %a, i32 %b) {
entry:
  %a64 = zext i32 %a to i64
  %b64 = zext i32 %b to i64
  %prod = mul nuw i64 %b64, %a64
  %hi = lshr i64 %prod, 32
  %r = trunc nuw i64 %hi to i32
  ret i32 %r
}
 
; opt output for the signed case. Relative to the input IR, opt added a
; return range attribute (half-open, i.e. [-2^30, 2^30]), swapped the
; multiply operands, and replaced ashr with lshr. The
; sext / mul i64 / shift 32 / trunc shape itself is unchanged.
define range(i32 -1073741824, 1073741825) i32 @mul_hi_s32(i32 %a, i32 %b) {
entry:
  %a64 = sext i32 %a to i64
  %b64 = sext i32 %b to i64
  %prod = mul nsw i64 %b64, %a64
  ; lshr instead of ashr: the two differ only in bits 63:32 of %hi, which
  ; the trunc below drops.
  %hi = lshr i64 %prod, 32
  %r = trunc nuw i64 %hi to i32
  ret i32 %r
}

Current x86_64 output

Unsigned:

# Current llc output, unsigned case (AT&T syntax; SysV AMD64: a = %edi, b = %esi, result in %eax).
mul_hi_u32:
	movl	%edi, %ecx	# rcx = zext(a); a 32-bit write clears bits 63:32
	movl	%esi, %eax	# rax = zext(b)
	imulq	%rcx, %rax	# rax = 64-bit product a * b
	shrq	$32, %rax	# eax = high 32 bits of the product
	retq

Signed:

# Current llc output, signed case (AT&T syntax; SysV AMD64: a = %edi, b = %esi, result in %eax).
mul_hi_s32:
	movslq	%edi, %rcx	# rcx = sext(a)
	movslq	%esi, %rax	# rax = sext(b)
	imulq	%rcx, %rax	# rax = 64-bit signed product a * b
	shrq	$32, %rax	# logical shift suffices: only %eax (product bits 63:32) is returned
	retq

Ideal x86_64 output

Unsigned:

# Proposed output, unsigned case (AT&T syntax; SysV AMD64: a = %edi, b = %esi, result in %eax).
mul_hi_u32:
	movl	%edi, %eax	# one-operand mull takes its implicit operand from %eax
	mull	%esi	# edx:eax = eax * esi (unsigned 32x32 -> 64)
	movl	%edx, %eax	# return the high half
	retq

Signed:

# Proposed output, signed case (AT&T syntax; SysV AMD64: a = %edi, b = %esi, result in %eax).
mul_hi_s32:
	movl	%edi, %eax	# one-operand imull takes its implicit operand from %eax
	imull	%esi	# edx:eax = eax * esi (signed 32x32 -> 64)
	movl	%edx, %eax	# return the high half
	retq

Why this seems reportable

- clang and rustc both produce the same portable IR.
- The x86 idiom is very clear and target-specific.
- The current sequence is correct but strictly less direct than the implicit high-half multiply result already provided by the ISA.

Takashi's Notes

Explorer