Summary
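Current LLVM head misses the natural x86 `divl` idiom (EDX:EAX divided by a
32-bit operand) for an unsigned 64-by-32 division whose numerator is formed
from a (hi:lo) pair, even when an explicit llvm.assume(hi < d) proves the
quotient fits in 32 bits (which `divl` requires, since it faults on quotient
overflow).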
Both clang and rustc can emit the relevant reduced IR shape using an
assumption. llc currently rebuilds a 64-bit numerator and uses divq,
instead of directly lowering to divl with %edx:%eax.
C repro
#include <stdint.h>
uint32_t udiv_hi_lo(uint32_t hi, uint32_t lo, uint32_t d) {
__builtin_assume(hi < d);
return (uint32_t)(((((uint64_t)hi) << 32) | (uint64_t)lo) / (uint64_t)d);
}

Rust repro
#![feature(core_intrinsics)]
extern crate core;
#[unsafe(no_mangle)]
pub unsafe fn udiv_hi_lo(hi: u32, lo: u32, d: u32) -> u32 {
unsafe { core::intrinsics::assume(hi < d) };
((((hi as u64) << 32) | (lo as u64)) / (d as u64)) as u32
}

Reduced LLVM IR repro
target triple = "x86_64-unknown-linux-gnu"
declare void @llvm.assume(i1 noundef)
define i32 @udiv_hi_lo(i32 %hi, i32 %lo, i32 %d) {
entry:
%ok = icmp ult i32 %hi, %d
call void @llvm.assume(i1 %ok)
%hi64 = zext i32 %hi to i64
%hi_sh = shl i64 %hi64, 32
%lo64 = zext i32 %lo to i64
%num = or i64 %hi_sh, %lo64
%d64 = zext i32 %d to i64
%q = udiv i64 %num, %d64
%r = trunc i64 %q to i32
ret i32 %r
}

Optimized IR
opt -passes='default<O2>' keeps the same basic shape:
define i32 @udiv_hi_lo(i32 %hi, i32 %lo, i32 %d) {
entry:
%ok = icmp ult i32 %hi, %d
call void @llvm.assume(i1 %ok)
%hi64 = zext i32 %hi to i64
%hi_sh = shl nuw i64 %hi64, 32
%lo64 = zext i32 %lo to i64
%num = or disjoint i64 %hi_sh, %lo64
%d64 = zext i32 %d to i64
%q = udiv i64 %num, %d64
%r = trunc i64 %q to i32
ret i32 %r
}

Current x86_64 output
udiv_hi_lo:
movl %esi, %eax
shlq $32, %rdi
orq %rdi, %rax
movl %edx, %ecx
xorl %edx, %edx
divq %rcx
retq

Ideal x86_64 output
udiv_hi_lo:
movl %edx, %ecx
movl %edi, %edx
movl %esi, %eax
divl %ecx
retq

Why this seems reportable
- clang and rustc can both reach the right IR pattern
- the assumption explicitly proves the standard `hi < d` precondition for
  64-by-32 division using `%edx:%eax`
- the current lowering is correct but misses the obvious x86 scalar idiom