-
Notifications
You must be signed in to change notification settings - Fork 11
Description
In a preliminary experiment (https://godbolt.org/z/nMeMEzszK), u128 shifts are not optimized well by Rust / LLVM, even when compiling with the arguments -C opt-level=3 -C target-cpu=skylake.
Example
pub fn test(x: u128) -> u128 {
x >> 3
}
is compiled to:
example::test:
mov rdx, rsi
mov rax, rdi
shrd rax, rsi, 3 ; Lat: 3 RTP: 1.0 Port: 1*p1
shr rdx, 3 ; Lat: 1 RTP: 0.5 Port: 1*p06
ret
While I'd expect the compiler to exploit the vpsrldq instruction, like:
use std::arch::x86_64::{
__m128i,
_mm_srli_si128
};
pub unsafe fn test2(x: __m128i) -> __m128i {
_mm_srli_si128(x, 3)
}
to obtain:
example::test2:
mov rax, rdi
vmovdqa xmm0, xmmword ptr [rsi]
vpsrldq xmm0, xmm0, 3 ; Lat: 1 RTP: 1.0 Port: 1*p5
vmovdqa xmmword ptr [rdi], xmm0
ret
The latencies and reciprocal throughputs annotated in the assembly are taken from https://uops.info for the Skylake architecture.
This missed optimization costs 3 cycles instead of 1: shr and shrd are independent and use different ports, so they execute at the same time, but the slower of the two still takes 3 cycles. The result is a 3x slowdown of this operation.
This is not due to the compilation target, since vpsrldq is the VEX-encoded form of the SSE2 instruction psrldq, which Skylake supports:
$ rustc --print cfg -C target-cpu=skylake
debug_assertions
panic="unwind"
target_abi=""
target_arch="x86_64"
target_endian="little"
target_env="gnu"
target_family="unix"
target_feature="adx"
target_feature="aes"
target_feature="avx"
target_feature="avx2"
target_feature="bmi1"
target_feature="bmi2"
target_feature="cmpxchg16b"
target_feature="ermsb"
target_feature="f16c"
target_feature="fma"
target_feature="fxsr"
target_feature="llvm14-builtins-abi"
target_feature="lzcnt"
target_feature="movbe"
target_feature="pclmulqdq"
target_feature="popcnt"
target_feature="rdrand"
target_feature="rdseed"
target_feature="sse"
target_feature="sse2" <--------------------
target_feature="sse3"
target_feature="sse4.1"
target_feature="sse4.2"
target_feature="ssse3"
target_feature="xsave"
target_feature="xsavec"
target_feature="xsaveopt"
target_feature="xsaves"
target_has_atomic="16"
target_has_atomic="32"
target_has_atomic="64"
target_has_atomic="8"
target_has_atomic="ptr"
target_has_atomic_equal_alignment="16"
target_has_atomic_equal_alignment="32"
target_has_atomic_equal_alignment="64"
target_has_atomic_equal_alignment="8"
target_has_atomic_equal_alignment="ptr"
target_has_atomic_load_store="16"
target_has_atomic_load_store="32"
target_has_atomic_load_store="64"
target_has_atomic_load_store="8"
target_has_atomic_load_store="ptr"
target_os="linux"
target_pointer_width="64"
target_thread_local
target_vendor="unknown"
unix
We should discuss how to proceed with the implementation.