# histogram_nonaliased_withruns_core
#
# Counts how often each byte value occurs in an input buffer and writes the
# 256 dword totals (1024 bytes) to the destination buffer.
#
# In:   rdi = destination for the 1024-byte result
#       rsi = first input byte
#       rdx = input length in bytes
# Out:  rax = the destination pointer that was passed in rdi
#
# Stack frame (5128 bytes below the four saved registers):
#       rsp+8    .. rsp+1031   table 0 (256 dwords)
#       rsp+1032 .. rsp+2055   table 1
#       rsp+2056 .. rsp+3079   table 2
#       rsp+3080 .. rsp+4103   table 3
#       rsp+4104 .. rsp+5127   summed result, copied to the destination
#
# Register roles after the prologue:
#       rbx = destination, r14 = read cursor, r15 = length,
#       rax = end of input (until it is reused as the summing-loop offset),
#       rcx = end - 24 (main-loop limit), r12 = address of memset.
#
# In the 16-byte main loop, byte k of each 8-byte word is counted in
# table (k mod 4); the byte-wise tail loop counts into table 0 only.
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
# Save the callee-saved registers that are used below.
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
# Reserve the 5128-byte frame in two steps. The store in between writes to
# the new top of stack before the frame grows further (this looks like a
# stack probe for the >4 KiB frame -- confirm).
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
# Move the arguments into registers that survive the calls below.
mov r15, rdx
mov r14, rsi
mov rbx, rdi
# memset(table 3, 0, 1024)
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
# memset(tables 0..2, 0, 3072)
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
# rax = one past the last input byte.
lea rax, [r14 + r15]
# Inputs of 24 bytes or fewer go straight to the byte-wise loop.
cmp r15, 24
jbe .LBB1_1
# rcx = end - 24; the 16-byte loop runs only while the cursor is below it.
lea rcx, [rax - 24]
cmp rcx, r14
jbe .LBB1_1
jmp .LBB1_6
.p2align 4, 0x90
.LBB1_7:
# The two 8-byte words differ: count all 16 bytes one by one.
# First word (rdx). Byte 0 -> table 0.
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
# Byte 1 -> table 1. (x >> 6) & 1020 equals ((x >> 8) & 0xFF) * 4, i.e. the
# byte value already scaled to a dword offset.
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
# Byte 2 -> table 2, same pre-scaled form.
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
# Byte 3 -> table 3. After a 32-bit shift by 22 only bits 0..9 remain, so
# masking with -4 leaves byte 3 times 4.
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
# Bytes 4..7 -> tables 0..3.
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
# Second word (rsi), same pattern; rdx is now free and used as scratch.
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_11:
# Advance 16 bytes; leave the main loop once the cursor reaches end - 24.
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_6:
# Main loop head: load two consecutive 8-byte words.
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_7
# The two words are equal. (rdx << 8) ^ rdx has byte k = byte k ^ byte k-1
# for k >= 1 and keeps byte 0 unchanged, so a result below 256 means all
# eight bytes of the word are identical.
mov rdi, rdx
shl rdi, 8
xor rdi, rdx
movzx esi, dl
cmp rdi, 256
jae .LBB1_10
# All 16 bytes hold the same value: one add of 16 to table 0.
add dword ptr [rsp + 4*rsi + 8], 16
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_10:
# The words are equal but their bytes are not uniform: count each of the
# eight byte positions once with weight 2.
add dword ptr [rsp + 4*rsi + 8], 2
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_11
.p2align 4, 0x90
.LBB1_8:
# Tail loop body: count one byte into table 0.
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
inc r14
.LBB1_1:
# Tail loop test: repeat until the cursor reaches the end of the input.
cmp r14, rax
jb .LBB1_8
# memcpy(result buffer, table 0, 1024). Every dword copied here is written
# again by the summing loop below.
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
# rax = byte offset into the tables, 0..1008 in steps of 16.
xor eax, eax
.p2align 4, 0x90
.LBB1_3:
# Scalar summing loop: four counters per iteration, each computed as
# table 0 + table 1 + table 2 + table 3. Entry i+1 accumulates in ecx,
# entry i+2 in edx, entry i+3 in edi and entry i in esi.
mov ecx, dword ptr [rsp + rax + 1036]
add ecx, dword ptr [rsp + rax + 12]
mov edx, dword ptr [rsp + rax + 1040]
add edx, dword ptr [rsp + rax + 16]
mov esi, dword ptr [rsp + rax + 1032]
mov edi, dword ptr [rsp + rax + 1044]
add edi, dword ptr [rsp + rax + 20]
add ecx, dword ptr [rsp + rax + 2060]
add edx, dword ptr [rsp + rax + 2064]
add edi, dword ptr [rsp + rax + 2068]
add ecx, dword ptr [rsp + rax + 3084]
add edx, dword ptr [rsp + rax + 3088]
add edi, dword ptr [rsp + rax + 3092]
add esi, dword ptr [rsp + rax + 8]
add esi, dword ptr [rsp + rax + 2056]
add esi, dword ptr [rsp + rax + 3080]
mov dword ptr [rsp + rax + 4104], esi
mov dword ptr [rsp + rax + 4108], ecx
mov dword ptr [rsp + rax + 4112], edx
mov dword ptr [rsp + rax + 4116], edi
add rax, 16
cmp rax, 1024
jne .LBB1_3
# memcpy(destination, result buffer, 1024), then return the destination.
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
# Release the 5128-byte frame and restore the saved registers.
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
```
If you build a `bench` or `bin` target, or disable LTO for the lib:
```assembly
# histogram_nonaliased_withruns_core
#
# Counts how often each byte value occurs in an input buffer and writes the
# 256 dword totals (1024 bytes) to the destination buffer.
#
# In:   rdi = destination for the 1024-byte result
#       rsi = first input byte
#       rdx = input length in bytes
# Out:  rax = the destination pointer that was passed in rdi
#
# Stack frame (5128 bytes below the four saved registers):
#       rsp+8    .. rsp+1031   table 0 (256 dwords)
#       rsp+1032 .. rsp+2055   table 1
#       rsp+2056 .. rsp+3079   table 2
#       rsp+3080 .. rsp+4103   table 3
#       rsp+4104 .. rsp+5127   summed result, copied to the destination
#
# Register roles after the prologue:
#       rbx = destination, r14 = read cursor, r15 = length,
#       rax = end of input (until it is reused as the summing-loop offset),
#       rcx = end - 24 inside the main loop, r12 = address of memset.
#
# In this listing the byte-wise tail is split into a 1-byte peel loop plus a
# 4x-unrolled loop, and the final summation uses 16-byte vector adds.
# The 16-byte main loop is placed after the return, starting at .LBB1_9.
.section .text.lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,"ax",@progbits
.globl lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core
.p2align 4, 0x90
.type lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core,@function
lossless_transform_utils::histogram::histogram32::histogram_nonaliased_withruns_core:
.cfi_startproc
# Save the callee-saved registers that are used below.
push r15
.cfi_def_cfa_offset 16
push r14
.cfi_def_cfa_offset 24
push r12
.cfi_def_cfa_offset 32
push rbx
.cfi_def_cfa_offset 40
# Reserve the 5128-byte frame in two steps. The store in between writes to
# the new top of stack before the frame grows further (this looks like a
# stack probe for the >4 KiB frame -- confirm).
sub rsp, 4096
.cfi_adjust_cfa_offset 4096
mov qword ptr [rsp], 0
sub rsp, 1032
.cfi_def_cfa_offset 5168
.cfi_offset rbx, -40
.cfi_offset r12, -32
.cfi_offset r14, -24
.cfi_offset r15, -16
# Move the arguments into registers that survive the calls below.
mov r15, rdx
mov r14, rsi
mov rbx, rdi
# memset(table 3, 0, 1024)
lea rdi, [rsp + 3080]
mov r12, qword ptr [rip + memset@GOTPCREL]
mov edx, 1024
xor esi, esi
call r12
# memset(tables 0..2, 0, 3072)
lea rdi, [rsp + 8]
mov edx, 3072
xor esi, esi
call r12
# rax = one past the last input byte.
lea rax, [r14 + r15]
# Inputs of 24 bytes or fewer skip the 16-byte main loop.
cmp r15, 24
jbe .LBB1_1
# rcx = end - 24; enter the main loop only if the cursor is below it.
lea rcx, [rax - 24]
cmp rcx, r14
ja .LBB1_9
.LBB1_1:
# Tail handling. rcx = cursor - end, i.e. minus the number of bytes left;
# no borrow from the subtraction means nothing is left.
mov rcx, r14
sub rcx, rax
jae .LBB1_5
# edx = (bytes left) mod 4: how many bytes to peel one at a time.
mov edx, eax
sub edx, r14d
and edx, 3
je .LBB1_4
.p2align 4, 0x90
.LBB1_3:
# Peel loop: count one byte into table 0, edx times.
movzx esi, byte ptr [r14]
inc dword ptr [rsp + 4*rsi + 8]
inc r14
dec rdx
jne .LBB1_3
.LBB1_4:
# rcx still holds minus the original tail length. Above -4 (unsigned) means
# fewer than four bytes were left in total, so the peel loop covered them.
cmp rcx, -4
ja .LBB1_5
.p2align 4, 0x90
.LBB1_11:
# Unrolled tail loop: four bytes per iteration, all counted into table 0.
movzx ecx, byte ptr [r14]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 1]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 2]
inc dword ptr [rsp + 4*rcx + 8]
movzx ecx, byte ptr [r14 + 3]
inc dword ptr [rsp + 4*rcx + 8]
add r14, 4
cmp r14, rax
jb .LBB1_11
.LBB1_5:
# memcpy(result buffer, table 0, 1024). Every dword copied here is written
# again by the summing loop below.
lea rdi, [rsp + 4104]
lea rsi, [rsp + 8]
mov edx, 1024
call qword ptr [rip + memcpy@GOTPCREL]
# rax = byte offset into the tables, 0..992 in steps of 32.
xor eax, eax
.p2align 4, 0x90
.LBB1_6:
# Vector summing loop: eight counters (two 16-byte vectors) per iteration.
# First vector: (table 2 + table 3) + (table 0 + table 1).
movdqu xmm0, xmmword ptr [rsp + rax + 3080]
movdqu xmm1, xmmword ptr [rsp + rax + 2056]
paddd xmm1, xmm0
movdqu xmm0, xmmword ptr [rsp + rax + 1032]
movdqu xmm2, xmmword ptr [rsp + rax + 8]
paddd xmm2, xmm0
paddd xmm2, xmm1
# The second vector of table 0 is loaded before the first result is stored.
movdqu xmm0, xmmword ptr [rsp + rax + 24]
movdqu xmmword ptr [rsp + rax + 4104], xmm2
# Second vector, same sum at offset +16.
movdqu xmm1, xmmword ptr [rsp + rax + 3096]
movdqu xmm2, xmmword ptr [rsp + rax + 2072]
paddd xmm2, xmm1
movdqu xmm1, xmmword ptr [rsp + rax + 1048]
paddd xmm1, xmm0
paddd xmm1, xmm2
movdqu xmmword ptr [rsp + rax + 4120], xmm1
add rax, 32
cmp rax, 1024
jne .LBB1_6
# memcpy(destination, result buffer, 1024), then return the destination.
lea rsi, [rsp + 4104]
mov edx, 1024
mov rdi, rbx
call qword ptr [rip + memcpy@GOTPCREL]
mov rax, rbx
# Release the 5128-byte frame and restore the saved registers.
add rsp, 5128
.cfi_def_cfa_offset 40
pop rbx
.cfi_def_cfa_offset 32
pop r12
.cfi_def_cfa_offset 24
pop r14
.cfi_def_cfa_offset 16
pop r15
.cfi_def_cfa_offset 8
ret
.p2align 4, 0x90
.LBB1_10:
# The blocks from here on execute with the full frame still allocated, so
# the CFA offset is set back to 5168 after the epilogue above.
.cfi_def_cfa_offset 5168
# The two 8-byte words differ: count all 16 bytes one by one.
# First word (rdx). Byte 0 -> table 0.
movzx edi, dl
inc dword ptr [rsp + 4*rdi + 8]
# Byte 1 -> table 1. (x >> 6) & 1020 equals ((x >> 8) & 0xFF) * 4, i.e. the
# byte value already scaled to a dword offset.
mov edi, edx
shr edi, 6
and edi, 1020
inc dword ptr [rsp + rdi + 1032]
# Byte 2 -> table 2, same pre-scaled form.
mov edi, edx
shr edi, 14
and edi, 1020
inc dword ptr [rsp + rdi + 2056]
# Byte 3 -> table 3. After a 32-bit shift by 22 only bits 0..9 remain, so
# masking with -4 leaves byte 3 times 4.
mov edi, edx
shr edi, 22
and edi, -4
inc dword ptr [rsp + rdi + 3080]
# Bytes 4..7 -> tables 0..3.
mov rdi, rdx
shr rdi, 32
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 8]
mov rdi, rdx
shr rdi, 40
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 1032]
mov rdi, rdx
shr rdi, 48
movzx edi, dil
inc dword ptr [rsp + 4*rdi + 2056]
shr rdx, 56
inc dword ptr [rsp + 4*rdx + 3080]
# Second word (rsi), same pattern; rdx is now free and used as scratch.
movzx edx, sil
inc dword ptr [rsp + 4*rdx + 8]
mov edx, esi
shr edx, 6
and edx, 1020
inc dword ptr [rsp + rdx + 1032]
mov edx, esi
shr edx, 14
and edx, 1020
inc dword ptr [rsp + rdx + 2056]
mov edx, esi
shr edx, 22
and edx, -4
inc dword ptr [rsp + rdx + 3080]
mov rdx, rsi
shr rdx, 32
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 8]
mov rdx, rsi
shr rdx, 40
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 1032]
mov rdx, rsi
shr rdx, 48
movzx edx, dl
inc dword ptr [rsp + 4*rdx + 2056]
shr rsi, 56
inc dword ptr [rsp + 4*rsi + 3080]
.LBB1_14:
# Advance 16 bytes; go to the tail handling once the cursor reaches
# end - 24.
add r14, 16
cmp r14, rcx
jae .LBB1_1
.LBB1_9:
# Main loop head: load two consecutive 8-byte words.
mov rdx, qword ptr [r14]
mov rsi, qword ptr [r14 + 8]
cmp rdx, rsi
jne .LBB1_10
# The two words are equal. (rdx << 8) ^ rdx has byte k = byte k ^ byte k-1
# for k >= 1 and keeps byte 0 unchanged, so a result below 256 means all
# eight bytes of the word are identical.
mov r8, rdx
shl r8, 8
xor r8, rdx
movzx esi, dl
# The table 0 counter for the low byte is loaded before the branch; both
# paths below add to it and store it back.
mov edi, esi
mov edi, dword ptr [rsp + 4*rdi + 8]
cmp r8, 256
jae .LBB1_13
# All 16 bytes hold the same value: add 16 to that one counter.
add edi, 16
mov dword ptr [rsp + 4*rsi + 8], edi
jmp .LBB1_14
.p2align 4, 0x90
.LBB1_13:
# The words are equal but their bytes are not uniform: count each of the
# eight byte positions once with weight 2.
add edi, 2
mov dword ptr [rsp + 4*rsi + 8], edi
mov esi, edx
shr esi, 6
and esi, 1020
add dword ptr [rsp + rsi + 1032], 2
mov esi, edx
shr esi, 14
and esi, 1020
add dword ptr [rsp + rsi + 2056], 2
mov esi, edx
shr esi, 22
and esi, -4
add dword ptr [rsp + rsi + 3080], 2
mov rsi, rdx
shr rsi, 32
movzx esi, sil
add dword ptr [rsp + 4*rsi + 8], 2
mov rsi, rdx
shr rsi, 40
movzx esi, sil
add dword ptr [rsp + 4*rsi + 1032], 2
mov rsi, rdx
shr rsi, 48
movzx esi, sil
add dword ptr [rsp + 4*rsi + 2056], 2
shr rdx, 56
add dword ptr [rsp + 4*rdx + 3080], 2
jmp .LBB1_14
Apologies for the long assembly, it's the best example I have on hand that I can think of off the top of my head.
This isn't a help request or anything of the sort; I was just wondering if this behaviour is worth documenting somewhere.
There's something I've noticed using `cargo-show-asm` in the past 2-3 months.

There are occasional cases where building a `lib` crate with LTO doesn't produce the same results as building a bin/bench crate for a given function. (Note: Turning off LTO makes the code match on `lib` crates.)

Below is a simple example based on an in-progress repo of mine:
Or if you'd prefer repo and commit, here.
Will need to add `no_mangle` as usual.

Building with `LTO` enabled for release in `Cargo.toml` gives:

Apologies for the long assembly, it's the best example I have on hand that I can think of off the top of my head.
The code respects `opt-level`, but certain optimisations are missed; typically auto-vectorization, from the one or two times I've run into this issue. The host is `Linux x86-64`, but neither the OS nor `target-cpu` seems to have any impact here.

This isn't a help request or anything of the sort; I was just wondering if this behaviour is worth documenting somewhere.