Takže tam bola diskusia o písaní čistejšie a zrozumiteľnejšie (pre iného človeka) kód, než sa snažiť, aby kód viac montáž-friendly.
Ja som v prospech písania pochopiteľné kód s výraznou akcie, samostatné responsibilies, zodpovedať C++ Usmernenia, atď. Áno, všeobecne, jeden by mal v prvom rade starať o algoritmická zložitosť a napísať kód pre abstraktný stroj. Zostavovatelia sú sofistikované dosť, aby elegantné optimalizácie kompilátora.
Tak som si na jednej strane kus kódu, ktorý nemám nájsť čistej alebo ľahko pochopiť zo prvý pohľad (z Stroustroup knihy, afaic):
void Window::move_to_top(Shape& s)
{
for (int i = 0; i < shapes.size(); ++i)
if (&s == shapes[i]) {
for (++i; i < shapes.size(); ++i)
shapes[i - 1] = shapes[i];
shapes[shapes.size() - 1] = &s;
return;
}
}
A ďalšie verzie, ktoré považujem sa viac intuitívne a ľahšie pochopiť, pomocou stanard knižnica algoritmy.
void Window::move_to_top(Shape& s)
{
auto iterShape = std::find(shapes.begin(), shapes.end(), &s);
std::rotate(iterShape, std::next(iterShape), shapes.end());
}
Obe verzie majú lineárna zložitosť a v podstate to isté (oboch predpokladať, že Shape &s
a je uložený v poli). Avšak, s gcc -O3
:
"Manual" verzia:
Window::move_to_top(Shape&):
mov r8, QWORD PTR [rdi+8]
mov rcx, QWORD PTR [rdi]
mov rdi, r8
sub rdi, rcx
sar rdi, 3
je .L1
mov eax, 1
jmp .L7
.L3:
lea rdx, [rax+1]
cmp rdi, rax
je .L1
mov rax, rdx
.L7:
mov edx, eax
cmp QWORD PTR [rcx-8+rax*8], rsi
jne .L3
add edx, 1
movsx rdx, edx
cmp rdi, rax
jbe .L6
.L5:
mov rax, QWORD PTR [rcx+rax*8]
mov QWORD PTR [rcx-16+rdx*8], rax
mov rax, rdx
add rdx, 1
cmp rdi, rax
ja .L5
.L6:
mov QWORD PTR [r8-8], rsi
ret
.L1:
ret
STL verzia:
Window::move_to_top(Shape&):
push r12
push rbp
push rbx
mov rax, QWORD PTR [rdi+8]
mov rbp, QWORD PTR [rdi]
mov rdx, rax
sub rdx, rbp
mov rcx, rdx
sar rdx, 5
sar rcx, 3
test rdx, rdx
jle .L2
sal rdx, 5
add rdx, rbp
jmp .L7
.L76:
cmp rsi, QWORD PTR [rbp+8]
je .L72
cmp rsi, QWORD PTR [rbp+16]
je .L73
cmp rsi, QWORD PTR [rbp+24]
je .L74
add rbp, 32
cmp rbp, rdx
je .L75
.L7:
cmp rsi, QWORD PTR [rbp+0]
jne .L76
.L3:
lea rdx, [rbp+8]
cmp rax, rdx
je .L1
sub rax, rbp
cmp rax, 16
je .L14
.L35:
sar rax, 3
mov esi, 1
.L15:
mov rdi, rax
sub rdi, rsi
cmp rsi, rdi
jge .L16
.L78:
cmp rsi, 1
je .L77
lea rcx, [0+rsi*8]
lea rdx, [rbp+0+rcx]
test rdi, rdi
jle .L19
lea r8, [rbp+16]
cmp rdx, r8
setnb r8b
add rcx, 16
test rcx, rcx
setle cl
or r8b, cl
je .L36
lea rcx, [rdi-1]
cmp rcx, 1
jbe .L36
mov r9, rdi
xor ecx, ecx
xor r8d, r8d
shr r9
.L21:
movdqu xmm0, XMMWORD PTR [rbp+0+rcx]
movdqu xmm1, XMMWORD PTR [rdx+rcx]
add r8, 1
movups XMMWORD PTR [rbp+0+rcx], xmm1
movups XMMWORD PTR [rdx+rcx], xmm0
add rcx, 16
cmp r9, r8
jne .L21
mov r9, rdi
and r9, -2
lea rcx, [0+r9*8]
lea r8, [rbp+0+rcx]
add rdx, rcx
cmp rdi, r9
je .L24
mov rcx, QWORD PTR [r8]
mov r9, QWORD PTR [rdx]
mov QWORD PTR [r8], r9
mov QWORD PTR [rdx], rcx
.L24:
lea rbp, [rbp+0+rdi*8]
.L19:
cqo
idiv rsi
test rdx, rdx
je .L1
mov rax, rsi
sub rsi, rdx
mov rdi, rax
sub rdi, rsi
cmp rsi, rdi
jl .L78
.L16:
lea rdx, [0+rax*8]
lea r11, [rbp+0+rdx]
cmp rdi, 1
je .L79
lea rcx, [0+rdi*8]
mov r10, r11
sub r10, rcx
test rsi, rsi
jle .L37
mov rbx, rdx
lea r8, [rdx-16]
sub rbx, rcx
cmp rbx, r8
lea r9, [rbx-16]
setle cl
cmp rdx, r9
setle dl
or cl, dl
je .L38
lea rdx, [rsi-1]
cmp rdx, 1
jbe .L38
mov rbx, rsi
add r9, rbp
add r8, rbp
xor ecx, ecx
shr rbx
xor edx, edx
.L31:
movdqu xmm0, XMMWORD PTR [r9+rcx]
movdqu xmm2, XMMWORD PTR [r8+rcx]
add rdx, 1
movups XMMWORD PTR [r9+rcx], xmm2
movups XMMWORD PTR [r8+rcx], xmm0
sub rcx, 16
cmp rdx, rbx
jne .L31
mov rdx, rsi
and rdx, -2
mov rcx, rdx
neg rcx
sal rcx, 3
cmp rsi, rdx
je .L34
sub rcx, 8
lea rdx, [r10+rcx]
add rcx, r11
mov r8, QWORD PTR [rdx]
mov r9, QWORD PTR [rcx]
mov QWORD PTR [rdx], r9
mov QWORD PTR [rcx], r8
.L34:
neg rsi
lea rbp, [r10+rsi*8]
.L29:
cqo
idiv rdi
mov rsi, rdx
test rdx, rdx
je .L1
mov rax, rdi
jmp .L15
.L14:
movdqu xmm0, XMMWORD PTR [rbp+0]
shufpd xmm0, xmm0, 1
movups XMMWORD PTR [rbp+0], xmm0
.L1:
pop rbx
pop rbp
pop r12
ret
.L38:
mov rdx, -8
xor ecx, ecx
.L30:
mov r8, QWORD PTR [r10+rdx]
mov r9, QWORD PTR [r11+rdx]
add rcx, 1
mov QWORD PTR [r10+rdx], r9
mov QWORD PTR [r11+rdx], r8
sub rdx, 8
cmp rsi, rcx
jne .L30
jmp .L34
.L36:
xor ecx, ecx
.L20:
mov r8, QWORD PTR [rbp+0+rcx*8]
mov r9, QWORD PTR [rdx+rcx*8]
mov QWORD PTR [rbp+0+rcx*8], r9
mov QWORD PTR [rdx+rcx*8], r8
add rcx, 1
cmp rdi, rcx
jne .L20
jmp .L24
.L37:
mov rbp, r10
jmp .L29
.L75:
mov rcx, rax
sub rcx, rbp
sar rcx, 3
.L2:
cmp rcx, 2
je .L8
cmp rcx, 3
je .L9
cmp rcx, 1
je .L10
.L11:
mov rbp, rax
xor eax, eax
jmp .L35
.L9:
cmp rsi, QWORD PTR [rbp+0]
je .L3
add rbp, 8
.L8:
cmp rsi, QWORD PTR [rbp+0]
je .L3
add rbp, 8
.L10:
cmp rsi, QWORD PTR [rbp+0]
jne .L11
jmp .L3
.L72:
add rbp, 8
jmp .L3
.L73:
add rbp, 16
jmp .L3
.L74:
add rbp, 24
jmp .L3
.L77:
sal rax, 3
lea rsi, [rbp+8]
mov r12, QWORD PTR [rbp+0]
lea rbx, [rbp+0+rax]
cmp rbx, rsi
je .L18
lea rdx, [rax-8]
mov rdi, rbp
call memmove
.L18:
mov QWORD PTR [rbx-8], r12
jmp .L1
.L79:
lea rax, [r11-8]
mov rbx, QWORD PTR [r11-8]
cmp rbp, rax
je .L28
sub rax, rbp
mov rdi, r11
mov rsi, rbp
mov rdx, rax
sub rdi, rax
call memmove
.L28:
mov QWORD PTR [rbp+0], rbx
jmp .L1
https://godbolt.org/z/P83YT37f8
Tak, najdôležitejšia otázka je: znamená to, že STL verzia je pomalšie? Ak áno, o akú veľkosť?
A ďalšia otázka by bola: Prečo je zhromaždenie tak, nafúknutý s -O3?
Tak som urobil niekoľko benchmarking:
#include <cstddef>
struct Shape
{
static size_t count;
size_t id = count++;
};
size_t Shape::count = 0;
#include <vector>
struct Window
{
std::vector<Shape*> shapes;
void move_to_top_manual(Shape& s);
void move_to_top(Shape& s);
};
void Window::move_to_top_manual(Shape& s)
{
for (int i = 0; i < shapes.size(); ++i)
if (&s == shapes[i])
{
for (++i; i < shapes.size(); ++i)
shapes[i - 1] = shapes[i];
shapes[shapes.size() - 1] = &s;
return;
}
}
#include <algorithm>
void Window::move_to_top(Shape& s)
{
auto iterShape = std::find(shapes.begin(), shapes.end(), &s);
std::rotate(iterShape, std::next(iterShape), shapes.end());
}
#include <chrono>
#include <string>
#include <iostream>
int main(int argc, char **argv)
{
size_t indx = argc < 2 ? 0 : std::stoll(argv[1]);
std::vector<Shape> shapes{size_t(5000000)};
Window window{};
window.shapes.reserve(shapes.size());
for(auto &s : shapes)
window.shapes.push_back(std::addressof(s));
Window windowsManual = window;
std::chrono::nanoseconds stlDuration{},
manualDuration{};
{
Shape &moveToTop = *window.shapes[indx];
auto timePointBegin = std::chrono::steady_clock::now();
window.move_to_top(moveToTop);
auto timePointEnd = std::chrono::steady_clock::now();
stlDuration = timePointEnd - timePointBegin;
std::cout << "STL nanosec: " << stlDuration.count() << std::endl;
if (&moveToTop != window.shapes.back())
return -1;
}
{
Shape &moveToTop = *windowsManual.shapes[indx];
auto timePointBegin = std::chrono::steady_clock::now();
windowsManual.move_to_top_manual(moveToTop);
auto timePointEnd = std::chrono::steady_clock::now();
manualDuration = timePointEnd - timePointBegin;
std::cout << "Manual nanosec: " << manualDuration.count() << std::endl;
if (&moveToTop != windowsManual.shapes.back())
return -1;
}
std::cout << "STL/Manual = " << double(stlDuration.count()) / manualDuration.count();
return 0;
}
V priemere s MSVC STL verzia 1.5-2.0-krát pomalšie ako Manuálna.
STL nanosec: 19559200
Manual nanosec: 10636300
STL/Manual = 1.83891
STL nanosec: 20507700
Manual nanosec: 11418600
STL/Manual = 1.79599
Manual nanosec: 11685300
STL/Manual = 1.70025
Tak, to je pomalší s MSVC.
S MinGW64 To je zhruba rovnaká:
STL nanosec: 9069300
Manual nanosec: 7860500
STL/Manual = 1.15378
STL nanosec: 8865300
Manual nanosec: 11290500
STL/Manual = 0.7852
STL nanosec: 10713700
Manual nanosec: 8235200
STL/Manual = 1.30096
Beží to na godbolt vytvára nečakané výsledky:
STL nanosec: 3551159
Manual nanosec: 30056271
STL/Manual = 0.11815