汇编学习(11),SIMD之SSE

作者: android小奉先 | 来源:发表于2022-12-23 15:58 被阅读0次

汇编学习(11),SIMD之SSE
SSE/AVX并行优化基础
在C/C++代码中使用SSE等指令集的指令 2020-05-07
汇编学习(12), SIMD之AVX（已完结）
SIMD的编写
SSE 原子指令加速矩阵运算
Android neon加速优化
Cortex-M4和Cortex-M7中的SIMD指令
数值计算优化方法C/C++(三)——SIMD
SceneKit之属性的前缀‘simd’的含义

本篇介绍

SIMD（Single Instruction, Multiple Data，单指令多数据）可以实现高性能运算，本篇介绍下这块内容。

SSE

SIMD就是一条指令可以操作多个数据，有多种实现方法，比如SSE，AVX。
SSE(Streaming SIMD Extensions)拥有16个128位的寄存器，也就是之前接触过的xmm0到xmm15，还有一个控制/状态寄存器mxcsr。这些寄存器既可以存放浮点数，也可以存放整数，数据形式可以是scalar，也可以是packed。scalar是单个数据，而packed数据就是多个，一个xmm寄存器可以存放如下的值：

2个64位的双精度浮点
4个32位的单精度浮点
2个64位的整数
4个32位的整数
8 个16位的整数
16个8位的字符

可以总结成如下表所示：

image.png

对于AVX，用的是256位的ymm寄存器，另外AVX-512还有512位的zmm寄存器。

接下来看下SSE寄存器：

image.png

再看一个代码，可以看到mxcsr寄存器的值含义：

; mxcsr.asm
; Divides doubles under several MXCSR settings and prints the quotient
; together with the MXCSR contents before and after each division.
; Syntax: NASM (Intel operand order).
; NOTE(review): the code loads label addresses with "mov reg, label", so it
; is presumably meant to be linked as a non-PIE executable - confirm.
extern printf
extern print_mxcsr                      ; defined outside this file
extern print_hex                        ; defined outside this file

section .data
; operands of the divisions
    eleven          dq  11.0
    two             dq  2.0
    three           dq  3.0
    ten             dq  10.0
    zero            dq  0.0

; output strings (10 = line feed, 9 = tab)
    hex             db  "0x",0
    fmt1            db  10,"Divide, default mxcsr:",10,0
    fmt2            db  10,"Divide by zero, default mxcsr:",10,0
    fmt4            db  10,"Divide, round up:",10,0
    fmt5            db  10,"Divide, round down:",10,0
    fmt6            db  10,"Divide, truncate:",10,0
    f_div           db  "%.1f divided by %.1f is %.16f, in hex: ",0
    f_before        db  10,"mxcsr before:",9,0
    f_after         db  "mxcsr after:",9,0

; MXCSR values, written in groups of four bits like the program's output.
; Bits 7-12 are the exception masks (all set here); bits 13-14 select the
; rounding mode: 00 nearest, 01 down, 10 up, 11 truncate.
    default_mxcsr   dd  0001_1111_1000_0000b
    round_nearest   dd  0001_1111_1000_0000b    ; same value as default_mxcsr
    round_down      dd  0011_1111_1000_0000b
    round_up        dd  0101_1111_1000_0000b
    truncate        dd  0111_1111_1000_0000b

section .bss
    mxcsr_before    resd    1           ; MXCSR value loaded before the division
    mxcsr_after     resd    1           ; MXCSR value stored after the division
    xmm             resq    1           ; quotient, read byte by byte by print_xmm

section .text
    global main
;-----------------------------------------------------------------------
; int main(void)
; Runs one scalar double division per experiment through apply_mxcsr.
; Arguments handed to apply_mxcsr (file-local convention):
;   rdi = title string
;   rsi = address of the dividend (double)
;   rdx = address of the divisor (double)
;   ecx = MXCSR value to load before dividing
; Out:   eax = 0 (process exit status)
;-----------------------------------------------------------------------
main:
        push    rbp                     ; save the caller's rbp before it is overwritten
        mov     rbp, rsp

; 10 / 2, default mxcsr
        mov     rdi, fmt1
        mov     rsi, ten
        mov     rdx, two
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
;----------------------------------------------
; 10 / 3: division with precision error, default mxcsr
        mov     rdi, fmt1
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
; 10 / 0: divide by zero, default mxcsr
        mov     rdi, fmt2
        mov     rsi, ten
        mov     rdx, zero
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
; 10 / 3: division with precision error, round up
        mov     rdi, fmt4
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [round_up]
        call    apply_mxcsr
; 10 / 3: division with precision error, round down
        mov     rdi, fmt5
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [round_down]
        call    apply_mxcsr
; 10 / 3: division with precision error, truncate
        mov     rdi, fmt6
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [truncate]
        call    apply_mxcsr
;----------------------------------------------
; 11 / 3: division with precision error, default mxcsr
        mov     rdi, fmt1
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
; 11 / 3: division with precision error, round up
        mov     rdi, fmt4
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [round_up]
        call    apply_mxcsr
; 11 / 3: division with precision error, round down
        mov     rdi, fmt5
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [round_down]
        call    apply_mxcsr
; 11 / 3: division with precision error, truncate
        mov     rdi, fmt6
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [truncate]
        call    apply_mxcsr

        xor     eax, eax                ; return 0 instead of whatever the last call left in rax
        leave
        ret

;-----------------------------------------------------------------------
; apply_mxcsr
; Prints a title, loads the requested MXCSR value, divides two doubles,
; then prints the operands, the quotient (decimal and hex) and the MXCSR
; value before and after the division.
; In:    rdi = title string passed to printf
;        rsi = address of the dividend (double)
;        rdx = address of the divisor (double)
;        ecx = MXCSR value to load before dividing
; Out:   nothing; mxcsr_before, mxcsr_after and xmm are overwritten
; Stack: [rbp-8]  dividend address      [rbp-16] divisor address
;        [rbp-20] requested MXCSR       [rbp-24] caller's MXCSR
; The caller's MXCSR is put back right after the division, so printf and
; the caller run with the settings they had on entry.
;-----------------------------------------------------------------------
apply_mxcsr:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; locals; rsp stays 16-byte aligned for the calls

        mov     [rbp-8], rsi            ; printf may clobber rsi, rdx and rcx
        mov     [rbp-16], rdx
        mov     [rbp-20], ecx
        stmxcsr [rbp-24]                ; remember the caller's MXCSR

        xor     eax, eax                ; al = 0: no vector registers passed to variadic printf
        call    printf                  ; title, still in rdi

        mov     rsi, [rbp-8]
        mov     rdx, [rbp-16]
        mov     ecx, [rbp-20]

        mov     [mxcsr_before], ecx
        ldmxcsr [mxcsr_before]          ; switch to the requested settings
        movsd   xmm2, [rsi]             ; xmm2 = dividend
        divsd   xmm2, [rdx]             ; xmm2 = dividend / divisor
        stmxcsr [mxcsr_after]           ; capture the flags raised by the division
        ldmxcsr [rbp-24]                ; back to the caller's MXCSR

        movsd   [xmm], xmm2             ; for use in print_xmm
        mov     rdi, f_div
        movsd   xmm0, [rsi]
        movsd   xmm1, [rdx]
        mov     eax, 3                  ; al = 3: xmm0, xmm1 and xmm2 carry the %f arguments
        call    printf
        call    print_xmm

; print mxcsr
        mov     rdi, f_before
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_before]     ; 32-bit load: the variable is a single dword
        call    print_mxcsr
        mov     rdi, f_after
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_after]
        call    print_mxcsr

        leave
        ret
;-----------------------------------------------------------------------
; print_xmm
; Prints "0x" followed by the eight bytes stored at xmm, from offset 7
; down to offset 0 (most significant byte first, since x86 is little
; endian), with one print_hex call per byte.
; In:    nothing (reads the quadword at xmm)
; Out:   nothing
; Stack: [rbp-8] = number of bytes still to print
;-----------------------------------------------------------------------
print_xmm:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 16                 ; local counter; rsp stays 16-byte aligned for the calls

        mov     rdi, hex                ; print 0x
        xor     eax, eax                ; al = 0: no vector registers passed to variadic printf
        call    printf

        mov     qword [rbp-8], 8
.next_byte:
        mov     rcx, [rbp-8]
        movzx   edi, byte [xmm + rcx - 1]   ; zero-extended byte, highest offset first
        call    print_hex
        dec     qword [rbp-8]           ; counter lives in memory, so the call cannot clobber it
        jnz     .next_byte

        leave
        ret
结果如下：
Divide, default mxcsr:
10.0 divided by 2.0 is 5.0000000000000000, in hex: 0x4014000000000000
mxcsr before:    0001 1111 1000 0000
mxcsr after:     0001 1111 1000 0000

Divide, default mxcsr:
10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
mxcsr before:    0001 1111 1000 0000
mxcsr after:     0001 1111 1010 0000

Divide by zero, default mxcsr:
10.0 divided by 0.0 is inf, in hex: 0x7ff0000000000000
mxcsr before:    0001 1111 1000 0000
mxcsr after:     0001 1111 1000 0100

Divide, round up:
10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
mxcsr before:    0101 1111 1000 0000
mxcsr after:     0101 1111 1010 0000

Divide, round down:
10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
mxcsr before:    0011 1111 1000 0000
mxcsr after:     0011 1111 1010 0000

Divide, truncate:
10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
mxcsr before:    0111 1111 1000 0000
mxcsr after:     0111 1111 1010 0000

Divide, default mxcsr:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before:    0001 1111 1000 0000
mxcsr after:     0001 1111 1010 0000

Divide, round up:
11.0 divided by 3.0 is 3.6666666666666670, in hex: 0x400d555555555556
mxcsr before:    0101 1111 1000 0000
mxcsr after:     0101 1111 1010 0000

Divide, round down:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before:    0011 1111 1000 0000
mxcsr after:     0011 1111 1010 0000

Divide, truncate:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before:    0111 1111 1000 0000
mxcsr after:     0111 1111 1010 0000

接下来再看一个SSE的例子：
这是一个内存没对齐的向量求和：

; sse_unaligned.asm
; Adds two packed single-precision and two packed double-precision vectors
; using the unaligned move instructions and prints operands and sums.
; Syntax: NASM (Intel operand order).
extern printf

section .data
; single precision: four floats per vector
    spvector1       dd  1.1, 2.2, 3.3, 4.4
    spvector2       dd  1.1, 2.2, 3.3, 4.4
; double precision: two doubles per vector
    dpvector1       dq  1.1, 2.2
    dpvector2       dq  3.3, 4.4

; printf formats (10 = line feed)
    fmt1            db  "Single Precision Vector 1: %f, %f, %f, %f",10,0
    fmt2            db  "Single Precision Vector 2: %f, %f, %f, %f",10,0
    fmt3            db  "Sum of Single Precision Vector 1 and Vector 2: %f, %f, %f, %f",10,0
    fmt4            db  "Double Precision Vector 1: %f, %f",10,0
    fmt5            db  "Double Precision Vector 2: %f, %f",10,0
    fmt6            db  "Sum of Double Precision Vector 1 and Vector 2: %f, %f",10,0

section .bss
    spvector_res    resd    4           ; single-precision sum
    dpvector_res    resq    4           ; double-precision sum (main stores 16 bytes here)

section .text
    global main
;-----------------------------------------------------------------------
; int main(void)
; Prints both single-precision vectors, adds them with addps and prints
; the sum; then does the same for the double-precision vectors with addpd.
; Loads and stores use movups / movupd, which accept any address.
; Out:   eax = 0 (process exit status)
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

; add 2 single precision floating point vectors
        mov     rsi, spvector1
        mov     rdi, fmt1
        call    printspfp

        mov     rsi, spvector2
        mov     rdi, fmt2
        call    printspfp

        movups  xmm0, [spvector1]       ; loaded after the calls: printf may clobber xmm registers
        movups  xmm1, [spvector2]
        addps   xmm0, xmm1              ; four float additions at once

        movups  [spvector_res], xmm0
        mov     rsi, spvector_res
        mov     rdi, fmt3
        call    printspfp

; add 2 double precision floating point vectors
        mov     rsi, dpvector1
        mov     rdi, fmt4
        call    printdpfp

        mov     rsi, dpvector2
        mov     rdi, fmt5
        call    printdpfp

        movupd  xmm0, [dpvector1]
        movupd  xmm1, [dpvector2]
        addpd   xmm0, xmm1              ; two double additions at once
        movupd  [dpvector_res], xmm0
        mov     rsi, dpvector_res
        mov     rdi, fmt6
        call    printdpfp

        xor     eax, eax                ; return 0 instead of printf's character count
        leave
        ret

;-----------------------------------------------------------------------
; printspfp
; Prints four consecutive single-precision floats through printf.
; In:    rdi = printf format string (the callers pass one with four %f)
;        rsi = address of the first float
; Clobb: rax, xmm0-xmm3 and whatever printf clobbers
;-----------------------------------------------------------------------
printspfp:
        push    rbp
        mov     rbp, rsp

        movss   xmm0, [rsi]
        movss   xmm1, [rsi + 4]
        movss   xmm2, [rsi + 8]
        movss   xmm3, [rsi + 12]

        cvtss2sd xmm0, xmm0             ; widen each float to double for %f
        cvtss2sd xmm1, xmm1
        cvtss2sd xmm2, xmm2
        cvtss2sd xmm3, xmm3

        mov     eax, 4                  ; four vector registers carry arguments
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret

;-----------------------------------------------------------------------
; printdpfp
; Prints two consecutive doubles through printf.
; In:    rdi = printf format string (the callers pass one with two %f)
;        rsi = address of the first double
; Clobb: rax, xmm0, xmm1 and whatever printf clobbers
;-----------------------------------------------------------------------
printdpfp:
        push    rbp
        mov     rbp, rsp

        movsd   xmm1, [rsi + 8]
        movsd   xmm0, [rsi]
        mov     eax, 2                  ; two vector registers carry arguments
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下：
Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
Double Precision Vector 1: 1.100000, 2.200000
Double Precision Vector 2: 3.300000, 4.400000
Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000

代码中的关键点如下：
movups(move unaligned packed single precision)，将数据拷贝到xmm0或xmm1寄存器。
addps(add packed single precision): 将两个xmm寄存器的值按照单精度划分并相加。
movss(move scalar single precision)将内存数据拷贝到xmm寄存器。
cvtss2sd(convert scalar single to scalar double)：将单精度转成双精度。
接下来看一个内存对齐的例子：

; sse_aligned.asm
; Same vector additions as the unaligned example, but the vectors sit on
; 16-byte boundaries so the aligned move instructions can be used.
; Syntax: NASM (Intel operand order).
extern printf

section .data
    dummy           db  13              ; one byte placed before the align directive
align 16
; each vector below is 16 bytes long, so every label stays 16-byte aligned
    spvector1       dd  1.1, 2.2, 3.3, 4.4
    spvector2       dd  1.1, 2.2, 3.3, 4.4

    dpvector1       dq  1.1, 2.2
    dpvector2       dq  3.3, 4.4

; printf formats (10 = line feed)
    fmt1            db  "Single Precision Vector 1: %f, %f, %f, %f",10,0
    fmt2            db  "Single Precision Vector 2: %f, %f, %f, %f",10,0
    fmt3            db  "Sum of Single Precision Vector 1 and Vector 2: "
                    db  "%f, %f, %f, %f",10,0
    fmt4            db  "Double Precision Vector 1: %f, %f",10,0
    fmt5            db  "Double Precision Vector 2: %f, %f",10,0
    fmt6            db  "Sum of Double Precision Vector 1 and Vector 2: "
                    db  "%f, %f",10,0

section .bss
alignb 16
    spvector_res    resd    4           ; 16 bytes, so dpvector_res is aligned too
    dpvector_res    resq    4

section .text
    global main
;-----------------------------------------------------------------------
; int main(void)
; Prints both single-precision vectors, adds them with addps and prints
; the sum; then does the same for the double-precision vectors with addpd.
; Loads and stores use movaps / movapd and addps / addpd take a memory
; operand; both need the 16-byte alignment set up in the data sections.
; Out:   eax = 0 (process exit status)
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

; add 2 single precision floating point vectors
        mov     rsi, spvector1
        mov     rdi, fmt1
        call    printspfp

        mov     rsi, spvector2
        mov     rdi, fmt2
        call    printspfp

        movaps  xmm0, [spvector1]
        addps   xmm0, [spvector2]       ; second operand read straight from aligned memory

        movaps  [spvector_res], xmm0
        mov     rsi, spvector_res
        mov     rdi, fmt3
        call    printspfp

; add 2 double precision floating point vectors
        mov     rsi, dpvector1
        mov     rdi, fmt4
        call    printdpfp

        mov     rsi, dpvector2
        mov     rdi, fmt5
        call    printdpfp

        movapd  xmm0, [dpvector1]
        addpd   xmm0, [dpvector2]

        movapd  [dpvector_res], xmm0
        mov     rsi, dpvector_res
        mov     rdi, fmt6
        call    printdpfp

; exit
        xor     eax, eax                ; return 0 instead of printf's character count
        mov     rsp, rbp
        pop     rbp                     ; undo the push at the beginning
        ret

;-----------------------------------------------------------------------
; printspfp
; Prints four consecutive single-precision floats through printf.
; In:    rdi = printf format string (the callers pass one with four %f)
;        rsi = address of the first float
; Clobb: rax, xmm0-xmm3 and whatever printf clobbers
;-----------------------------------------------------------------------
printspfp:
        push    rbp
        mov     rbp, rsp

        ; printf expects double precision arguments, so widen every lane
        movss   xmm3, [rsi + 12]
        cvtss2sd xmm3, xmm3
        movss   xmm2, [rsi + 8]
        cvtss2sd xmm2, xmm2
        movss   xmm1, [rsi + 4]
        cvtss2sd xmm1, xmm1
        movss   xmm0, [rsi]
        cvtss2sd xmm0, xmm0

        mov     eax, 4                  ; four vector registers carry arguments
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret

;-----------------------------------------------------------------------
; printdpfp
; Prints two consecutive doubles through printf.
; In:    rdi = printf format string (the callers pass one with two %f)
;        rsi = address of the first double
; Clobb: rax, xmm0, xmm1 and whatever printf clobbers
;-----------------------------------------------------------------------
printdpfp:
        push    rbp
        mov     rbp, rsp

        movsd   xmm1, [rsi + 8]
        movsd   xmm0, [rsi]
        mov     eax, 2                  ; two vector registers carry arguments
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下：
Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
Double Precision Vector 1: 1.100000, 2.200000
Double Precision Vector 2: 3.300000, 4.400000
Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000

通过使用align 16保证内存按照16字节对齐。
这时候指令稍有差异：
movaps(move aligned packed single precision)，双精度对应的是movapd(move aligned packed double precision)。这两条指令要求内存地址按16字节对齐，否则会触发异常。

继续看一个SSE操作整数的例子：

; sse_integer.asm
; Adds two vectors of four 32-bit integers, then builds the sum in
; reverse lane order, printing every vector along the way.
; Syntax: NASM (Intel operand order).
extern printf

section .data
    dummy           db  13              ; one byte placed before the align directive
align 16
; each vector is 16 bytes long, so both labels are 16-byte aligned
    pdivector1      dd  1, 2, 3, 4
    pdivector2      dd  5, 6, 7, 8

; printf formats (10 = line feed)
    fmt1            db  "Packed Integer Vector 1: %d, %d, %d, %d",10,0
    fmt2            db  "Packed Integer Vector 2: %d, %d, %d, %d",10,0
    fmt3            db  "Sum Vector: %d, %d, %d, %d",10,0
    fmt4            db  "Reverse of Sum Vector: %d, %d, %d, %d",10,0

section .bss
alignb 16
    pdivector_res   resd    4           ; sum of the two vectors
    pdivector_other resd    4           ; sum with its lanes reversed

section .text
    global main
;-----------------------------------------------------------------------
; int main(void)
; Prints both integer vectors, adds them lane by lane with paddd, prints
; the sum, then rebuilds the sum in reverse lane order with pextrd /
; pinsrd and prints that too.
; Only volatile registers (eax, esi, ecx, edx) hold the extracted lanes,
; so no callee-saved register has to be saved.
; Out:   eax = 0 (process exit status)
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

; print vector 1
        mov     rsi, pdivector1
        mov     rdi, fmt1
        call    printpdi
; print vector 2
        mov     rsi, pdivector2
        mov     rdi, fmt2
        call    printpdi

; add 2 aligned double int vectors
        movdqa  xmm0, [pdivector1]
        paddd   xmm0, [pdivector2]

; store the result in memory
        movdqa  [pdivector_res], xmm0
; print the vector in memory
        mov     rsi, pdivector_res
        mov     rdi, fmt3
        call    printpdi

; copy the memory vector to xmm3
        movdqa  xmm3, [pdivector_res]

; extract the packed values from xmm3
        pextrd  eax, xmm3, 0
        pextrd  esi, xmm3, 1            ; esi instead of ebx: rbx is callee-saved
        pextrd  ecx, xmm3, 2
        pextrd  edx, xmm3, 3
; insert in xmm0 in reverse order (all four lanes are overwritten)
        pinsrd  xmm0, eax, 3
        pinsrd  xmm0, esi, 2
        pinsrd  xmm0, ecx, 1
        pinsrd  xmm0, edx, 0

; print the reversed vector
        movdqa  [pdivector_other], xmm0
        mov     rsi, pdivector_other
        mov     rdi, fmt4
        call    printpdi

; exit
        xor     eax, eax                ; return 0 instead of printf's character count
        mov     rsp, rbp
        pop     rbp
        ret

;-----------------------------------------------------------------------
; printpdi
; Prints four packed 32-bit integers through printf.
; In:    rdi = printf format string (the callers pass one with four %d)
;        rsi = address of the vector; movdqa requires it to be 16-byte
;              aligned
; Clobb: rax, rcx, rdx, rsi, r8, xmm0 and whatever printf clobbers
;-----------------------------------------------------------------------
printpdi:
        push    rbp
        mov     rbp, rsp

        movdqa  xmm0, [rsi]
; extract the packed values from xmm0 into the integer argument registers
        pextrd  r8d, xmm0, 3
        pextrd  ecx, xmm0, 2
        pextrd  edx, xmm0, 1
        pextrd  esi, xmm0, 0
        xor     eax, eax                ; no floats
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下：
Packed Integer Vector 1: 1, 2, 3, 4
Packed Integer Vector 2: 5, 6, 7, 8
Sum Vector: 6, 8, 10, 12
Reverse of Sum Vector: 12, 10, 8, 6

SSE中也有操作字符串比较的指令：

image.png

指令的第三个参数imm8是一个立即数，含义如下：

image.png

汇编学习(11),SIMD之SSE
本篇介绍 SIMD（Single Instruction Stream，Multiple Data）可以实现高性能...
SSE/AVX并行优化基础
SIMD扩展指令集和本文集上一篇介绍的NEON类似，SSE/AVX也是SIMD（单指令多数据）扩展指令集，只不过...
在C/C++代码中使用SSE等指令集的指令 2020-05-07
SSE(Stream SIMD Extensions)是Intel在其计算机芯片Pentium3中引入的指令集，是...
汇编学习(12), SIMD之AVX（已完结）
本篇介绍 AVX是SSE的扩展版，用了256字节的ymm寄存器，本篇看下AVX相关的指令。 AVX AVX(Adv...
SIMD的编写
前言学习SIMD的笔记参考教程：SIMD Tutorial.pdf 一、使用SIMD的场景考虑如下代...
SSE 原子指令加速矩阵运算
利用原子指令加速矩阵运算 C++里面有个原子指令库，不需要通过内嵌汇编就可以调用cpu内部SIMD的指令，头文件<...
Android neon加速优化
neon是一种SIMD（单指令多数据）指令集，其效率相当于汇编，用于arm cpu平台的优化，在音视频、图形图像处...
Cortex-M4和Cortex-M7中的SIMD指令
SIMD指令简介单指令多数据流，即SIMD（Single Instruction， Multiple Data）...
数值计算优化方法C/C++(三)——SIMD
SIMD 1、概述 SIMD全称Single Instruction Multiple Data，单指令多数据流，...
SceneKit之属性的前缀‘simd’的含义
在 SCNNode (SIMD)中，声明了很多带有 ‘simd’ 的属性，如 simdPosition, sim...