Summary

LLVM at current head misses the natural x86 divl idiom (64-bit dividend in EDX:EAX, 32-bit divisor) for an unsigned 64-by-32 division whose dividend is formed from a (hi:lo) pair of 32-bit values, even when an explicit llvm.assume(hi < d) proves that the quotient fits in 32 bits.

Both clang and rustc can emit the reduced IR shape shown below, including the assumption. llc currently rebuilds the 64-bit numerator in a single register and uses divq, instead of lowering directly to divl with the dividend in %edx:%eax.

C repro

#include <stdint.h>
 
/*
 * Reproducer: unsigned division of the 64-bit value hi:lo by the 32-bit
 * divisor d, returning the quotient truncated to 32 bits.
 *
 *   hi - upper 32 bits of the dividend
 *   lo - lower 32 bits of the dividend
 *   d  - divisor
 *
 * The function assumes hi < d; calling it with hi >= d is undefined
 * behaviour because of the __builtin_assume below.
 */
uint32_t udiv_hi_lo(uint32_t hi, uint32_t lo, uint32_t d) {
  /* hi < d implies d != 0 and (hi * 2^32 + lo) / d < 2^32, so the quotient
   * fits in 32 bits and the truncation in the return is lossless. */
  __builtin_assume(hi < d);
  /* Build hi:lo as a 64-bit value, divide in 64 bits, then truncate. */
  return (uint32_t)(((((uint64_t)hi) << 32) | (uint64_t)lo) / (uint64_t)d);
}

Rust repro

#![feature(core_intrinsics)]
 
extern crate core;
 
/// Reproducer: divides the 64-bit value `hi:lo` (`hi` in the upper 32 bits,
/// `lo` in the lower 32 bits) by `d` and returns the quotient truncated to
/// `u32`.
///
/// # Safety
///
/// The caller must guarantee `hi < d`. The body passes that condition to
/// `core::intrinsics::assume`, so violating it is undefined behaviour.
#[unsafe(no_mangle)]
pub unsafe fn udiv_hi_lo(hi: u32, lo: u32, d: u32) -> u32 {
    // `hi < d` implies `d != 0` and a quotient below 2^32, so the final
    // `as u32` cast does not discard any set bits.
    unsafe { core::intrinsics::assume(hi < d) };
    // Widen, combine into hi:lo, divide in 64 bits, then truncate.
    ((((hi as u64) << 32) | (lo as u64)) / (d as u64)) as u32
}

Reduced LLVM IR repro

target triple = "x86_64-unknown-linux-gnu"
 
declare void @llvm.assume(i1 noundef)
 
; Reduced test case: (zext(hi) << 32 | zext(lo)) udiv zext(d), truncated to i32,
; with an llvm.assume that hi <u d.
define i32 @udiv_hi_lo(i32 %hi, i32 %lo, i32 %d) {
entry:
  ; Precondition: hi <u d. This implies d != 0 and that the 64-bit quotient
  ; below is less than 2^32.
  %ok = icmp ult i32 %hi, %d
  call void @llvm.assume(i1 %ok)
  ; Build the 64-bit numerator hi:lo.
  %hi64 = zext i32 %hi to i64
  %hi_sh = shl i64 %hi64, 32
  %lo64 = zext i32 %lo to i64
  %num = or i64 %hi_sh, %lo64
  ; 64-bit unsigned divide by the zero-extended 32-bit divisor.
  %d64 = zext i32 %d to i64
  %q = udiv i64 %num, %d64
  ; Only the low 32 bits of the quotient are returned.
  %r = trunc i64 %q to i32
  ret i32 %r
}

Optimized IR

opt -passes='default<O2>' keeps the same basic shape; the only changes are the nuw flag added to the shl and the disjoint flag added to the or:

; Same function after O2. Compared with the input IR, only the `nuw` flag on
; the shl and the `disjoint` flag on the or are new; the assume, the 64-bit
; udiv and the trunc are all still present.
define i32 @udiv_hi_lo(i32 %hi, i32 %lo, i32 %d) {
entry:
  %ok = icmp ult i32 %hi, %d
  call void @llvm.assume(i1 %ok)
  %hi64 = zext i32 %hi to i64
  ; nuw added by the optimizer
  %hi_sh = shl nuw i64 %hi64, 32
  %lo64 = zext i32 %lo to i64
  ; disjoint added by the optimizer
  %num = or disjoint i64 %hi_sh, %lo64
  %d64 = zext i32 %d to i64
  %q = udiv i64 %num, %d64
  %r = trunc i64 %q to i32
  ret i32 %r
}

Current x86_64 output

# Current lowering: rebuilds hi:lo in %rax and performs a 64-bit divide.
# Incoming arguments: hi in %edi, lo in %esi, d in %edx.
udiv_hi_lo:
	movl	%esi, %eax	# %rax = zero-extended lo
	shlq	$32, %rdi	# %rdi = hi << 32
	orq	%rdi, %rax	# %rax = (hi << 32) | lo, the 64-bit numerator
	movl	%edx, %ecx	# %rcx = zero-extended d (frees %rdx for the divide)
	xorl	%edx, %edx	# clear the upper half of the %rdx:%rax dividend
	divq	%rcx	# unsigned divide of %rdx:%rax by %rcx; quotient in %rax
	retq	# result is the low 32 bits of %rax

Ideal x86_64 output

# Desired lowering: feed hi:lo directly to a 32-bit divide as %edx:%eax.
# Incoming arguments: hi in %edi, lo in %esi, d in %edx.
udiv_hi_lo:
	movl	%edx, %ecx	# move d out of %edx, which the divide needs for hi
	movl	%edi, %edx	# %edx = hi (upper half of the dividend)
	movl	%esi, %eax	# %eax = lo (lower half of the dividend)
	divl	%ecx	# unsigned divide of %edx:%eax by %ecx; quotient in %eax
	retq

Why this seems reportable

  • clang and rustc can both reach the right IR pattern
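  • the assumption explicitly proves the standard hi < d precondition for 64-by-32 division using %edx:%eax: hi < d implies d != 0 and (hi * 2^32 + lo) / d < 2^32, so divl can neither divide by zero nor overflow its 32-bit quotient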
  • the current lowering is correct but misses the obvious x86 scalar idiom