Sunday, April 26, 2020

Parallelism at instruction level (SIMD contd)

This post leverages the SSE capabilities of the CPU to do some simple processing, such as reversing an array. The C++ code below was written using Visual Studio 2019, with the inline assembly targeting the x86 architecture.

void arrayReverse()
{
    __declspec(align(16)) float data[] = { 3.f, 5.f, 4.f, 9.f };
    __asm
    {
        movaps xmm0, data;
        shufps xmm0, xmm0, 1Bh;
        movaps data, xmm0;
    }

    for (int i = 0; i < 4; ++i)
    {
        std::cout << data[i] << "\t";
    }
    std::cout << std::endl;
}

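The code between __asm { ... } is inline assembly code.
1. movaps xmm0, data ----> Loads the four aligned, packed floats from memory into the xmm0 register.
2. shufps xmm0, xmm0, 1Bh; -----> Reverses the order of the four elements using the mask 1Bh (00011011b). Each 2-bit field of the mask, read from the low bits upward, selects the source element for the next destination slot: 11b, 10b, 01b, 00b pick elements 3, 2, 1, 0 for slots 0, 1, 2, 3. A single instruction thus operates on the whole pack of data (SIMD).
3. movaps data, xmm0; -----> Stores the data from the register back to memory.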

C++ implementation of reversing an array (not leveraging SSE):
=============================================
// Reverses a fixed array of four floats in place with a plain two-index
// swap loop (no hand-written SSE), then prints the four values on a single
// line, each followed by a tab.
void arrayReverseByC_Plus_Plus()
{
    float data[] = { 3.f, 5.f, 4.f, 9.f };
    // start walks forward from the first element, end walks backward from
    // the last one (index 3 of the four-element array).
    int start = 0;
    int end = 3;

    // Swap the outermost pair not yet exchanged and move both indices
    // inward; the loop ends once the indices meet or cross.
    while (start < end)
    {
        float temp = data[start];
        data[start] = data[end];
        data[end] = temp;
        start++;
        end--;
    }

    // Print the reversed contents: each value followed by a tab, then a newline.
    for (int i = 0; i < 4; ++i)
    {
        std::cout << data[i] << "\t";
    }
    std::cout << std::endl;
}

Assembly code generated by Visual Studio 2019 without optimization (/Od):
?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93   : {

push ebp
mov ebp, esp
sub esp, 32 ; 00000020H

; 94   :     float data[] = { 3.f, 5.f, 4.f, 9.f };

movss xmm0, DWORD PTR __real@40400000
movss DWORD PTR _data$[ebp], xmm0
movss xmm0, DWORD PTR __real@40a00000
movss DWORD PTR _data$[ebp+4], xmm0
movss xmm0, DWORD PTR __real@40800000
movss DWORD PTR _data$[ebp+8], xmm0
movss xmm0, DWORD PTR __real@41100000
movss DWORD PTR _data$[ebp+12], xmm0

(Although an SIMD register, xmm0, is used, it is loaded and stored one scalar float at a time with movss.)

; 95   :     int start = 0;

mov DWORD PTR _start$[ebp], 0

; 96   :     int end = 3;

mov DWORD PTR _end$[ebp], 3
$LN2@arrayRever:

; 97   : 
; 98   :     while (start < end)

mov eax, DWORD PTR _start$[ebp]
cmp eax, DWORD PTR _end$[ebp]
jge SHORT $LN3@arrayRever

; 99   :     {
; 100  :         float temp = data[start];

mov ecx, DWORD PTR _start$[ebp]
movss xmm0, DWORD PTR _data$[ebp+ecx*4]
movss DWORD PTR _temp$1[ebp], xmm0

; 101  :         data[start] = data[end];

mov edx, DWORD PTR _start$[ebp]
mov eax, DWORD PTR _end$[ebp]
mov ecx, DWORD PTR _data$[ebp+eax*4]
mov DWORD PTR _data$[ebp+edx*4], ecx

; 102  :         data[end] = temp;

mov edx, DWORD PTR _end$[ebp]
movss xmm0, DWORD PTR _temp$1[ebp]
movss DWORD PTR _data$[ebp+edx*4], xmm0

; 103  :         start++;

mov eax, DWORD PTR _start$[ebp]
add eax, 1
mov DWORD PTR _start$[ebp], eax

; 104  :         end--;

mov ecx, DWORD PTR _end$[ebp]
sub ecx, 1
mov DWORD PTR _end$[ebp], ecx

; 105  :     }

Although an SIMD register is used, it is clear that the operations on the data elements are scalar rather than parallel while the array is being reversed.

The code after applying optimization (/O2):

?arrayReverseByC_Plus_Plus@@YAXXZ PROC ; arrayReverseByC_Plus_Plus, COMDAT

; 93   : {

push ebp
mov ebp, esp
sub esp, 16 ; 00000010H

; 94   :     float data[] = { 3.f, 5.f, 4.f, 9.f };

movaps xmm0, XMMWORD PTR __xmm@411000004080000040a0000040400000    (a packed load of all four floats into the SIMD register, in contrast to the four scalar movss loads in the non-optimized listing above)

; 95   :     int start = 0;

xor ecx, ecx
movups XMMWORD PTR _data$[ebp], xmm0

; 96   :     int end = 3;

mov edx, 3
npad 8
$LL2@arrayRever:

; 97   :
; 98   :     while (start < end)
; 99   :     {
; 100  :         float temp = data[start];

movss xmm0, DWORD PTR _data$[ebp+ecx*4]

; 101  :         data[start] = data[end];

mov eax, DWORD PTR _data$[ebp+edx*4]
mov DWORD PTR _data$[ebp+ecx*4], eax

; 102  :         data[end] = temp;
; 103  :         start++;

inc ecx
movss DWORD PTR _data$[ebp+edx*4], xmm0

; 104  :         end--;

dec edx
cmp ecx, edx
jl SHORT $LL2@arrayRever

; 105  :     }

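It is also noteworthy that the loop itself has been optimized: start and end now live in the ecx and edx registers instead of being reloaded from the stack on every use, and the loop condition is tested once at the bottom of the loop. The element swap itself, however, is still scalar.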

No comments:

Post a Comment