Summary
Current LLVM head misses the natural x86 scalar high-half multiply idiom for portable IR that computes the upper 32 bits of a 32x32-bit product.
Both clang and rustc emit the same reduced IR:
- unsigned: zext i32 -> i64, mul i64, lshr 32, trunc
- signed: sext i32 -> i64, mul i64, ashr/lshr 32, trunc
llc keeps the widened imulq shape and extracts the high half with a shift,
instead of using the implicit %edx:%eax result of 32-bit mul / imul.
C repro
#include <stdint.h>
uint32_t mul_hi_u32(uint32_t a, uint32_t b) {
return (uint32_t)(((uint64_t)a * (uint64_t)b) >> 32);
}
int32_t mul_hi_s32(int32_t a, int32_t b) {
return (int32_t)(((int64_t)a * (int64_t)b) >> 32);
}

Rust repro
#[unsafe(no_mangle)]
pub fn mul_hi_u32(a: u32, b: u32) -> u32 {
(((a as u64) * (b as u64)) >> 32) as u32
}
#[unsafe(no_mangle)]
pub fn mul_hi_s32(a: i32, b: i32) -> i32 {
(((a as i64) * (b as i64)) >> 32) as i32
}

Reduced LLVM IR repro
target triple = "x86_64-unknown-linux-gnu"
define i32 @mul_hi_u32(i32 %a, i32 %b) {
entry:
%a64 = zext i32 %a to i64
%b64 = zext i32 %b to i64
%prod = mul i64 %a64, %b64
%hi = lshr i64 %prod, 32
%r = trunc i64 %hi to i32
ret i32 %r
}
define i32 @mul_hi_s32(i32 %a, i32 %b) {
entry:
%a64 = sext i32 %a to i64
%b64 = sext i32 %b to i64
%prod = mul nsw i64 %a64, %b64
%hi = ashr i64 %prod, 32
%r = trunc i64 %hi to i32
ret i32 %r
}

Optimized IR
opt -passes='default<O2>' keeps the widened multiply form:
define range(i32 0, -1) i32 @mul_hi_u32(i32 %a, i32 %b) {
entry:
%a64 = zext i32 %a to i64
%b64 = zext i32 %b to i64
%prod = mul nuw i64 %b64, %a64
%hi = lshr i64 %prod, 32
%r = trunc nuw i64 %hi to i32
ret i32 %r
}
define range(i32 -1073741824, 1073741825) i32 @mul_hi_s32(i32 %a, i32 %b) {
entry:
%a64 = sext i32 %a to i64
%b64 = sext i32 %b to i64
%prod = mul nsw i64 %b64, %a64
%hi = lshr i64 %prod, 32
%r = trunc nuw i64 %hi to i32
ret i32 %r
}

Current x86_64 output
Unsigned:
mul_hi_u32:
movl %edi, %ecx
movl %esi, %eax
imulq %rcx, %rax
shrq $32, %rax
retq

Signed:
mul_hi_s32:
movslq %edi, %rcx
movslq %esi, %rax
imulq %rcx, %rax
shrq $32, %rax
retq

Ideal x86_64 output
Unsigned:
mul_hi_u32:
movl %edi, %eax
mull %esi
movl %edx, %eax
retq

Signed:
mul_hi_s32:
movl %edi, %eax
imull %esi
movl %edx, %eax
retq

Why this seems reportable
- clang and rustc both produce the same portable IR
- the x86 idiom is very clear and target-specific
- the current sequence is correct but strictly less direct than the implicit high-half multiply result already provided by the ISA