Parallelism with C++: Parallelism at instruction level (SIMD contd)

This post leverages the SSE capabilities of the CPU to do some simple processing, such as reversing an array. The C++ code below was written with Visual Studio 2019 and uses inline assembly targeting the x86 architecture.

// Reverses a four-element float array with a single SSE shuffle and prints
// the result as tab-separated values followed by a newline.
// Relies on the MSVC inline assembler for 32-bit x86.
void arrayReverse()
{
    // MOVAPS requires its memory operand to be 16-byte aligned.
    __declspec(align(16)) float data[] = { 3.f, 5.f, 4.f, 9.f };

    __asm
    {
        // Load all four floats into xmm0 at once.
        movaps xmm0, data;
        // Mask 1Bh = 00 01 10 11b picks elements 3, 2, 1, 0 for
        // result positions 0..3, i.e. reverses the lanes.
        shufps xmm0, xmm0, 1Bh;
        // Store the shuffled lanes back over the array.
        movaps data, xmm0;
    }

    for (float value : data)
    {
        std::cout << value << "\t";
    }
    std::cout << std::endl;
}

The code between __asm {....} is inline assembly code.
1. movaps xmm0, data ----> Moves the aligned, packed data (four floats) from memory into the xmm0 register.
2. shufps xmm0, xmm0, 1Bh; -----> Reverses the order of the four elements using the mask 1Bh (00011011b): each 2-bit field of the mask selects a source element, here 3, 2, 1, 0 for result positions 0 to 3. A single instruction operates on the whole pack of data (SIMD).
3. movaps data, xmm0; -----> Moves the data back from the register to memory.

C++ implementation of reversing an array (not leveraging SSE):
=============================================
// Reverses a fixed four-element float array in place with a plain scalar
// swap loop (no SSE), then prints the elements tab-separated followed by
// a newline.
void arrayReverseByC_Plus_Plus()
{
float data[] = { 3.f, 5.f, 4.f, 9.f };
// Indices of the first and last element; they move toward each other.
int start = 0;
int end = 3;

// Swap the outermost pair, then step both indices inward until they meet.
while (start < end)
{
float temp = data[start];
data[start] = data[end];
data[end] = temp;
start++;
end--;
}

// Print the reversed array.
for (int i = 0; i < 4; ++i)
{
std::cout << data[i] << "\t";
}
std::cout << std::endl;
}

Assembly code generated by Visual Studio 2019 without optimization (/Od):

?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93 : {

push ebp

mov ebp, esp

sub esp, 32 ; 00000020H

; 94 : float data[] = { 3.f, 5.f, 4.f, 9.f };

movss xmm0, DWORD PTR __real@40400000
movss DWORD PTR _data$[ebp], xmm0
movss xmm0, DWORD PTR __real@40a00000
movss DWORD PTR _data$[ebp+4], xmm0
movss xmm0, DWORD PTR __real@40800000
movss DWORD PTR _data$[ebp+8], xmm0
movss xmm0, DWORD PTR __real@41100000
movss DWORD PTR _data$[ebp+12], xmm0

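(Note: although the SIMD register xmm0 is used, each movss moves only a single scalar float, so the array is initialized one element at a time.)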

; 95 : int start = 0;

mov DWORD PTR _start$[ebp], 0

; 96 : int end = 3;

mov DWORD PTR _end$[ebp], 3

$LN2@arrayRever:

; 97 :

; 98 : while (start < end)

mov eax, DWORD PTR _start$[ebp]

cmp eax, DWORD PTR _end$[ebp]

jge SHORT $LN3@arrayRever

; 99 : {

; 100 : float temp = data[start];

mov ecx, DWORD PTR _start$[ebp]

movss xmm0, DWORD PTR _data$[ebp+ecx*4]

movss DWORD PTR _temp$1[ebp], xmm0

; 101 : data[start] = data[end];

mov edx, DWORD PTR _start$[ebp]

mov eax, DWORD PTR _end$[ebp]

mov ecx, DWORD PTR _data$[ebp+eax*4]

mov DWORD PTR _data$[ebp+edx*4], ecx

; 102 : data[end] = temp;

mov edx, DWORD PTR _end$[ebp]

movss xmm0, DWORD PTR _temp$1[ebp]

movss DWORD PTR _data$[ebp+edx*4], xmm0

; 103 : start++;

mov eax, DWORD PTR _start$[ebp]

add eax, 1

mov DWORD PTR _start$[ebp], eax

; 104 : end--;

mov ecx, DWORD PTR _end$[ebp]

sub ecx, 1

mov DWORD PTR _end$[ebp], ecx

; 105 : }

Although a SIMD register (xmm0) is used, it is clear that the operations on the data elements are scalar, not parallel: the array is reversed one element at a time.

The code after applying optimization (/O2):

?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93 : {

push ebp
mov ebp, esp
sub esp, 16 ; 00000010H

; 94 : float data[] = { 3.f, 5.f, 4.f, 9.f };

movaps xmm0, XMMWORD PTR __xmm@411000004080000040a0000040400000 (packed load into the SIMD register: all four floats are loaded with one instruction, in contrast to the four scalar movss loads in the non-optimized listing above).

; 95 : int start = 0;

xor ecx, ecx
movups XMMWORD PTR _data$[ebp], xmm0

; 96 : int end = 3;

mov edx, 3
npad 8
$LL2@arrayRever:

; 97 :
; 98 : while (start < end)
; 99 : {
; 100 : float temp = data[start];

movss xmm0, DWORD PTR _data$[ebp+ecx*4]

; 101 : data[start] = data[end];

mov eax, DWORD PTR _data$[ebp+edx*4]
mov DWORD PTR _data$[ebp+ecx*4], eax

; 102 : data[end] = temp;
; 103 : start++;

inc ecx
movss DWORD PTR _data$[ebp+edx*4], xmm0

; 104 : end--;

dec edx
cmp ecx, edx
jl SHORT $LL2@arrayRever

; 105 : }

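It is also worth noting that the loop itself has been optimized: start and end are kept in registers (ecx and edx) instead of being reloaded from the stack on every iteration, and the loop condition is tested once at the bottom of the loop.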

Parallelism with C++

Sunday, April 26, 2020

Parallelism at instruction level (SIMD contd)

No comments:

Post a Comment