void arrayReverse()
{
__declspec(align(16)) float data[] = { 3.f, 5.f, 4.f, 9.f };
__asm
{
movaps xmm0, data;
shufps xmm0, xmm0, 1Bh;
movaps data, xmm0;
}
for (int i = 0; i < 4; ++i)
{
std::cout << data[i] << "\t";
}
std::cout << std::endl;
}
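The code inside the __asm { ... } block is inline assembly.
1. movaps xmm0, data ----> Loads the four packed floats from the 16-byte-aligned array data into the xmm0 register.
2. shufps xmm0, xmm0, 1Bh; -----> Reverses the order of the four elements using the shuffle mask 1Bh (00011011b). Each 2-bit field of the mask selects one source element; read from the low bits up, the fields are 3, 2, 1, 0, so the elements come out in reverse order. A single instruction thus operates on the whole pack of data (SIMD).
3. movaps data, xmm0; -----> Stores the reversed values from xmm0 back into data.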
C++ implementation of reversing an array (not leveraging SSE):
=============================================
// Reverses a fixed four-float array in place with a scalar two-index swap
// loop (no SSE), then prints the elements tab-separated on one line.
void arrayReverseByC_Plus_Plus()
{
float data[] = { 3.f, 5.f, 4.f, 9.f };
// start and end index the first and last element and move toward each other.
int start = 0;
int end = 3;
// Swap the outermost pair, then step inward; stops once the indices meet or cross.
while (start < end)
{
float temp = data[start];
data[start] = data[end];
data[end] = temp;
start++;
end--;
}
// Print the reversed array: every value is followed by a tab, then the line is ended.
for (int i = 0; i < 4; ++i)
{
std::cout << data[i] << "\t";
}
std::cout << std::endl;
}
Assembly code generated by Visual Studio 2019 without optimization (/Od):
; Unoptimized (/Od) MSVC x86 listing, MASM syntax, for
; void arrayReverseByC_Plus_Plus().
; The "; NN :" comments are the C++ source lines the following instructions
; were generated from.
; All locals (data, start, end, temp) live in the stack frame addressed off
; ebp, and start/end are re-read from memory every time they are used.
?arrayReverseByC_Plus_Plus@@YAXXZ PROC                  ; arrayReverseByC_Plus_Plus, COMDAT
; 93 : {
        push    ebp
        mov     ebp, esp
        sub     esp, 32                                 ; 00000020H - stack space for the locals
; 94 : float data[] = { 3.f, 5.f, 4.f, 9.f };
        movss   xmm0, DWORD PTR __real@40400000         ; 3.0f
        movss   DWORD PTR _data$[ebp], xmm0
        movss   xmm0, DWORD PTR __real@40a00000         ; 5.0f
        movss   DWORD PTR _data$[ebp+4], xmm0
        movss   xmm0, DWORD PTR __real@40800000         ; 4.0f
        movss   DWORD PTR _data$[ebp+8], xmm0
        movss   xmm0, DWORD PTR __real@41100000         ; 9.0f
        movss   DWORD PTR _data$[ebp+12], xmm0
; NOTE: the SIMD register is used in scalar fashion here - each element is
; loaded and stored on its own with movss; no packed load is emitted.
; 95 : int start = 0;
        mov     DWORD PTR _start$[ebp], 0
; 96 : int end = 3;
        mov     DWORD PTR _end$[ebp], 3
$LN2@arrayRever:
; 97 :
; 98 : while (start < end)
        mov     eax, DWORD PTR _start$[ebp]
        cmp     eax, DWORD PTR _end$[ebp]
        jge     SHORT $LN3@arrayRever                   ; leave the loop once start >= end
; 99 : {
; 100 : float temp = data[start];
        mov     ecx, DWORD PTR _start$[ebp]
        movss   xmm0, DWORD PTR _data$[ebp+ecx*4]
        movss   DWORD PTR _temp$1[ebp], xmm0
; 101 : data[start] = data[end];
        mov     edx, DWORD PTR _start$[ebp]
        mov     eax, DWORD PTR _end$[ebp]
        mov     ecx, DWORD PTR _data$[ebp+eax*4]        ; the float is copied as a raw 32-bit value
        mov     DWORD PTR _data$[ebp+edx*4], ecx
; 102 : data[end] = temp;
        mov     edx, DWORD PTR _end$[ebp]
        movss   xmm0, DWORD PTR _temp$1[ebp]
        movss   DWORD PTR _data$[ebp+edx*4], xmm0
; 103 : start++;
        mov     eax, DWORD PTR _start$[ebp]
        add     eax, 1
        mov     DWORD PTR _start$[ebp], eax
; 104 : end--;
        mov     ecx, DWORD PTR _end$[ebp]
        sub     ecx, 1
        mov     DWORD PTR _end$[ebp], ecx
; 105 : }
; (Excerpt ends here: the jump back to $LN2@arrayRever, the $LN3@arrayRever
; exit label and the rest of the procedure are not shown.)
The same function compiled with optimization enabled (/O2):
; Optimized (/O2) MSVC x86 listing, MASM syntax, for
; void arrayReverseByC_Plus_Plus().
; The "; NN :" comments are the C++ source lines the following instructions
; were generated from; instructions from neighbouring source lines are
; interleaved, so some sit under a different line's comment.
; Register roles inside the loop:
;   ecx  = start
;   edx  = end
;   xmm0 = data[start] (the value the source calls temp)
;   eax  = data[end], copied as a raw 32-bit value
?arrayReverseByC_Plus_Plus@@YAXXZ PROC                  ; arrayReverseByC_Plus_Plus, COMDAT
; 93 : {
        push    ebp
        mov     ebp, esp
        sub     esp, 16                                 ; 00000010H - only data[] is addressed on the stack here
; 94 : float data[] = { 3.f, 5.f, 4.f, 9.f };
; Packed load: all four floats are brought into the SIMD register by one
; instruction, in contrast to the four scalar movss loads of the
; unoptimized (/Od) listing. The constant holds 3.0f (40400000) in its
; lowest 32 bits and 9.0f (41100000) in its highest.
        movaps  xmm0, XMMWORD PTR __xmm@411000004080000040a0000040400000
; 95 : int start = 0;
        xor     ecx, ecx                                ; start = 0
        movups  XMMWORD PTR _data$[ebp], xmm0           ; initialise data[0..3] with one unaligned packed store
; 96 : int end = 3;
        mov     edx, 3                                  ; end = 3
        npad    8                                       ; padding before the loop label (presumably to align the loop entry)
$LL2@arrayRever:
; 97 :
; 98 : while (start < end)
; 99 : {
; 100 : float temp = data[start];
        movss   xmm0, DWORD PTR _data$[ebp+ecx*4]
; 101 : data[start] = data[end];
        mov     eax, DWORD PTR _data$[ebp+edx*4]
        mov     DWORD PTR _data$[ebp+ecx*4], eax
; 102 : data[end] = temp;
; 103 : start++;
        inc     ecx
        movss   DWORD PTR _data$[ebp+edx*4], xmm0       ; data[end] = temp
; 104 : end--;
        dec     edx
        cmp     ecx, edx                                ; loop condition is tested at the bottom of the loop
        jl      SHORT $LL2@arrayRever                   ; repeat while start < end
; 105 : }
; (Excerpt ends here: the rest of the procedure is not shown.)
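It is also noteworthy that the loop itself has been optimized: start and end are kept in registers (ecx and edx) instead of being reloaded from the stack, temp never touches memory, and the loop condition is tested once at the bottom of the loop.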
No comments:
Post a Comment