Sunday, April 26, 2020

Parallelism at instruction level (SIMD contd)

Let us leverage the SSE capabilities of the CPU to do some processing, such as reversing an array. The C++ code below was written with Visual Studio 2019, with inline assembly targeting the x86 architecture.

void arrayReverse()
{
    __declspec(align(16)) float data[] = { 3.f, 5.f, 4.f, 9.f };
    __asm
    {
        movaps xmm0, data;
        shufps xmm0, xmm0, 1Bh;
        movaps data, xmm0;
    }

    for (int i = 0; i < 4; ++i)
    {
        std::cout << data[i] << "\t";
    }
    std::cout << std::endl;
}

The code between __asm { ... } is inline assembly:
1. movaps xmm0, data ----> moves the aligned packed data from memory into the xmm0 register.
2. shufps xmm0, xmm0, 1Bh ----> reverses the order of the four data elements using the 1Bh (00011011b) shuffle mask; a single instruction operating on a pack of data (SIMD in effect).
3. movaps data, xmm0 ----> moves the data from the register back to memory.
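For reference, the same reversal can be written with SSE compiler intrinsics instead of inline assembly (MSVC does not support __asm on x64). This is a minimal sketch of my own; _MM_SHUFFLE(0, 1, 2, 3) expands to the same 1Bh mask used above:

#include <xmmintrin.h>   // SSE intrinsics
#include <iostream>

void arrayReverseIntrinsics()
{
    __declspec(align(16)) float data[] = { 3.f, 5.f, 4.f, 9.f };

    __m128 v = _mm_load_ps(data);                       // movaps: aligned packed load
    v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));  // shufps with the 1Bh mask
    _mm_store_ps(data, v);                              // movaps: aligned packed store

    for (int i = 0; i < 4; ++i)
    {
        std::cout << data[i] << "\t";
    }
    std::cout << std::endl;
}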

C++ implementation of reversing an array (not leveraging SSE):
=============================================
void arrayReverseByC_Plus_Plus()
{
    float data[] = { 3.f, 5.f, 4.f, 9.f };
    int start = 0;
    int end = 3;

    while (start < end)
    {
        float temp = data[start];
        data[start] = data[end];
        data[end] = temp;
        start++;
        end--;
    }

    for (int i = 0; i < 4; ++i)
    {
        std::cout << data[i] << "\t";
    }
    std::cout << std::endl;
}

Assembly code generated without optimization (/Od) by Visual Studio 2019:
?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93   : {

push ebp
mov ebp, esp
sub esp, 32 ; 00000020H

; 94   :     float data[] = { 3.f, 5.f, 4.f, 9.f };

movss xmm0, DWORD PTR __real@40400000
movss DWORD PTR _data$[ebp], xmm0
movss xmm0, DWORD PTR __real@40a00000
movss DWORD PTR _data$[ebp+4], xmm0
movss xmm0, DWORD PTR __real@40800000
movss DWORD PTR _data$[ebp+8], xmm0
movss xmm0, DWORD PTR __real@41100000
movss DWORD PTR _data$[ebp+12], xmm0

(Note that the xmm register is loaded one scalar element at a time with movss.)

; 95   :     int start = 0;

mov DWORD PTR _start$[ebp], 0

; 96   :     int end = 3;

mov DWORD PTR _end$[ebp], 3
$LN2@arrayRever:

; 97   : 
; 98   :     while (start < end)

mov eax, DWORD PTR _start$[ebp]
cmp eax, DWORD PTR _end$[ebp]
jge SHORT $LN3@arrayRever

; 99   :     {
; 100  :         float temp = data[start];

mov ecx, DWORD PTR _start$[ebp]
movss xmm0, DWORD PTR _data$[ebp+ecx*4]
movss DWORD PTR _temp$1[ebp], xmm0

; 101  :         data[start] = data[end];

mov edx, DWORD PTR _start$[ebp]
mov eax, DWORD PTR _end$[ebp]
mov ecx, DWORD PTR _data$[ebp+eax*4]
mov DWORD PTR _data$[ebp+edx*4], ecx

; 102  :         data[end] = temp;

mov edx, DWORD PTR _end$[ebp]
movss xmm0, DWORD PTR _temp$1[ebp]
movss DWORD PTR _data$[ebp+edx*4], xmm0

; 103  :         start++;

mov eax, DWORD PTR _start$[ebp]
add eax, 1
mov DWORD PTR _start$[ebp], eax

; 104  :         end--;

mov ecx, DWORD PTR _end$[ebp]
sub ecx, 1
mov DWORD PTR _end$[ebp], ecx

; 105  :     }

Though a SIMD register is used, it is clear that the operations on the data elements are scalar, not parallel: the array is reversed one element at a time.

The code after applying optimization (/O2):

?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93   : {

push ebp
mov ebp, esp
sub esp, 16 ; 00000010H

; 94   :     float data[] = { 3.f, 5.f, 4.f, 9.f };

movaps xmm0, XMMWORD PTR __xmm@411000004080000040a0000040400000

(Here the SIMD register gets a packed load of all four values at once, in contrast to the scalar loads in the non-optimized version above.)

; 95   :     int start = 0;

xor ecx, ecx
movups XMMWORD PTR _data$[ebp], xmm0

; 96   :     int end = 3;

mov edx, 3
npad 8
$LL2@arrayRever:

; 97   :
; 98   :     while (start < end)
; 99   :     {
; 100  :         float temp = data[start];

movss xmm0, DWORD PTR _data$[ebp+ecx*4]

; 101  :         data[start] = data[end];

mov eax, DWORD PTR _data$[ebp+edx*4]
mov DWORD PTR _data$[ebp+ecx*4], eax

; 102  :         data[end] = temp;
; 103  :         start++;

inc ecx
movss DWORD PTR _data$[ebp+edx*4], xmm0

; 104  :         end--;

dec edx
cmp ecx, edx
jl SHORT $LL2@arrayRever

; 105  :     }

It is also worth noting that the loop itself has been optimized: start and end now live in the ecx and edx registers instead of being reloaded from memory on every iteration, and the loop condition check has moved to the bottom of the loop.

Saturday, April 25, 2020

Parallelism at instruction level

SIMD stands for Single Instruction, Multiple Data. It is data-level parallelism, not concurrency: SIMD performs simultaneous (parallel) computation on multiple data elements, but with a single instruction at any given moment.

Example: say we have two integer arrays of the same length and would like to add their members pairwise only. That means adding array_1[0] and array_2[0] and storing the result in another array, say result[0]; similarly adding array_1[1] to array_2[1] and storing it in result[1], and so on.

Here, each result is independent of the others, and hence we can compute them in parallel. On the Intel x86 architecture we have SSE (Streaming SIMD Extensions), which provides a set of 128-bit registers [xmm0 - xmm7] along with additional instructions that operate on scalar as well as packed data, executing a single instruction on the pack at any point in time.
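As a rough sketch of that example (the helper name and the alignment/length assumptions are mine), the pairwise addition can be written with SSE2 intrinsics; one _mm_add_epi32 adds four integer pairs with a single instruction:

#include <emmintrin.h>   // SSE2 intrinsics (packed 32-bit integer add)

// Assumes n is a multiple of 4 and all three pointers are 16-byte aligned.
void addPairs(const int* array_1, const int* array_2, int* result, int n)
{
    for (int i = 0; i < n; i += 4)
    {
        __m128i a = _mm_load_si128((const __m128i*)(array_1 + i));    // load 4 ints from array_1
        __m128i b = _mm_load_si128((const __m128i*)(array_2 + i));    // load 4 ints from array_2
        _mm_store_si128((__m128i*)(result + i), _mm_add_epi32(a, b)); // 4 additions, 1 instruction
    }
}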

The details of SSE can be found on its Wikipedia page.

The following C++ routines (x86) show how inline assembly and the cpuid instruction can be used to detect whether the processor supports SSE. The code is Intel x86 based and was built with Visual Studio 2019.

std::string getCPUManfactureID()
{
    uint32_t data[4] = { 0 };     // 4th element stays 0 to null-terminate the string

    __asm
    {
        mov eax, 0;               // leaf 0: CPU vendor identification string
        cpuid;
        mov data[0], ebx;         // the 12-byte vendor string is returned in
        mov data[4], edx;         // EBX, EDX, ECX (in that order);
        mov data[8], ecx;         // the offsets here are in bytes
    }

    return std::string((const char*)data);
}

void getCPUFeaturesforSSE()
{
    int regDX, regcx;
    __asm
    {
        mov eax, 1;               // leaf 1: processor info and feature bits
        cpuid;
        mov regDX, edx;           // feature flags come back in EDX and ECX
        mov regcx, ecx;
    }

    if ((regDX & (1 << 25)) != 0)
    {
        std::cout << "SSE is supported..." << std::endl;
    }
    if ((regDX & (1 << 26)) != 0)
    {
        std::cout << "SSE2 is supported..." << std::endl;
    }
    if ((regcx & 1) != 0)
    {
        std::cout << "SSE3 is supported..." << std::endl;
    }
    if ((regcx & (1 << 19)) != 0)
    {
        std::cout << "SSE4.1 is supported..." << std::endl;
    }
    if ((regcx & (1 << 20)) != 0)
    {
        std::cout << "SSE4.2 is supported..." << std::endl;
    }
}
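
As a usage sketch, a small main() could tie the two routines together; on an Intel processor the vendor string would print as "GenuineIntel":

#include <iostream>
#include <string>

int main()
{
    std::cout << "CPU vendor: " << getCPUManfactureID() << std::endl;
    getCPUFeaturesforSSE();
    return 0;
}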