美文网首页
汇编学习(11),SIMD之SSE

汇编学习(11),SIMD之SSE

作者: android小奉先 | 来源:发表于2022-12-23 15:58 被阅读0次

    本篇介绍

    SIMD(Single Instruction, Multiple Data,单指令多数据)可以实现高性能运算,本篇介绍下这块内容。

    SSE

    SIMD就是一条指令可以操作多个数据,有多种实现方法,比如SSE,AVX。
    SSE(Streaming SIMD Extensions)在64位模式下拥有16个128位的寄存器,也就是之前接触过的xmm0到xmm15,还有一个控制/状态寄存器mxcsr。这些寄存器既可以存放浮点数,也可以存放整数;存放的数据可以是scalar,也可以是packed。scalar是单个数据,而packed数据就是把多个数据打包放在同一个寄存器里,对于一个xmm寄存器可以存放如下的值:

    • 2个64位的双精度浮点
    • 4个32位的单精度浮点
    • 2个64位的整数
    • 4个32位的整数
    • 8 个16位的整数
    • 16个8位的字符

    可以总结成如下表所示:


    image.png

    对于AVX,用的是256位的ymm寄存器,另外AVX-512还有512位的zmm寄存器。

    接下来看下SSE寄存器:


    image.png

    再看一个代码,可以看到mxcsr寄存器的值含义:

    ; mxcsr.asm -- divide doubles under different MXCSR settings and print
    ;              the result together with MXCSR before/after the division
    extern  printf
    extern  print_mxcsr                 ; defined outside this file
    extern  print_hex                   ; defined outside this file

    section .data
    ; --- operands (double precision) ---
    eleven:         dq  11.0
    two:            dq  2.0
    three:          dq  3.0
    ten:            dq  10.0
    zero:           dq  0.0

    ; --- output strings (10 = newline, 9 = tab, 0 = terminator) ---
    hex:            db  "0x",0
    fmt1:           db  10,"Divide, default mxcsr:",10,0
    fmt2:           db  10,"Divide by zero, default mxcsr:",10,0
    fmt4:           db  10,"Divide, round up:",10,0
    fmt5:           db  10,"Divide, round down:",10,0
    fmt6:           db  10,"Divide, truncate:",10,0
    f_div:          db  "%.1f divided by %.1f is %.16f, in hex: ",0
    f_before:       db  10,"mxcsr before:",9,0
    f_after:        db  "mxcsr after:",9,0

    ; --- MXCSR images ---
    ; Bits 7-12 (all set here) mask the six SIMD floating-point exceptions.
    ; Bits 13-14 select rounding: 00 nearest, 01 down, 10 up, 11 truncate.
    default_mxcsr:  dd  0001_1111_1000_0000b
    round_nearest:  dd  0001_1111_1000_0000b
    round_down:     dd  0011_1111_1000_0000b
    round_up:       dd  0101_1111_1000_0000b
    truncate:       dd  0111_1111_1000_0000b

    section .bss
    mxcsr_before:   resd    1           ; MXCSR value loaded before the division
    mxcsr_after:    resd    1           ; MXCSR value stored after the division
    xmm:            resq    1           ; quotient, read back byte-wise by print_xmm
    section .text                           
        global main                 
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Runs ten divisions through apply_mxcsr, each with a title string, a
    ; dividend, a divisor and the MXCSR value to use, then returns 0.
    ; Per case: rdi = title, rsi = &dividend, rdx = &divisor, ecx = MXCSR.
    ;-----------------------------------------------------------------------
    main:
        push    rbp                     ; save the caller's rbp BEFORE touching it
        mov     rbp, rsp                ; rsp is now 16-byte aligned for the calls below

    ; 10 / 2 -- exact result, default MXCSR
        mov     rdi, fmt1
        mov     rsi, ten
        mov     rdx, two
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
    ;----------------------------------------------
    ; 10 / 3 -- precision error, default MXCSR (round to nearest)
        mov     rdi, fmt1
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
    ; 10 / 0 -- divide by zero, default MXCSR
        mov     rdi, fmt2
        mov     rsi, ten
        mov     rdx, zero
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
    ; 10 / 3 -- precision error, round up
        mov     rdi, fmt4
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [round_up]
        call    apply_mxcsr
    ; 10 / 3 -- precision error, round down
        mov     rdi, fmt5
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [round_down]
        call    apply_mxcsr
    ; 10 / 3 -- precision error, truncate
        mov     rdi, fmt6
        mov     rsi, ten
        mov     rdx, three
        mov     ecx, [truncate]
        call    apply_mxcsr
    ;----------------------------------------------
    ; 11 / 3 -- precision error, default MXCSR (round to nearest)
        mov     rdi, fmt1
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [default_mxcsr]
        call    apply_mxcsr
    ; 11 / 3 -- precision error, round up
        mov     rdi, fmt4
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [round_up]
        call    apply_mxcsr
    ; 11 / 3 -- precision error, round down
        mov     rdi, fmt5
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [round_down]
        call    apply_mxcsr
    ; 11 / 3 -- precision error, truncate
        mov     rdi, fmt6
        mov     rsi, eleven
        mov     rdx, three
        mov     ecx, [truncate]
        call    apply_mxcsr

        xor     eax, eax                ; exit status 0 instead of leftover rax
        leave
        ret
    
    ;-----------------------------------------------------------------------
    ; apply_mxcsr -- divide two doubles under a given MXCSR and print a report
    ; ABI:   System V AMD64
    ; In:    rdi = string printed first via printf (main passes plain titles)
    ;        rsi = address of the dividend (double)
    ;        rdx = address of the divisor  (double)
    ;        ecx = value loaded into MXCSR for the division
    ; Out:   nothing meaningful in rax
    ; Side:  writes mxcsr_before, mxcsr_after and xmm; prints via printf,
    ;        print_xmm and the external print_mxcsr
    ; Stack: [rbp-8]  dividend pointer     [rbp-16] divisor pointer
    ;        [rbp-20] requested MXCSR      [rbp-24] MXCSR found on entry
    ;-----------------------------------------------------------------------
    apply_mxcsr:
        push    rbp
        mov     rbp, rsp
        sub     rsp, 32                 ; locals; rsp stays 16-byte aligned
        mov     [rbp-8], rsi            ; arguments must survive the printf call
        mov     [rbp-16], rdx
        mov     [rbp-20], ecx
        stmxcsr [rbp-24]                ; MXCSR control bits are callee-saved

        xor     eax, eax                ; variadic call: al = 0 vector registers
        call    printf                  ; rdi still holds the title string

        mov     rsi, [rbp-8]
        mov     rdx, [rbp-16]
        mov     ecx, [rbp-20]
        mov     [mxcsr_before], ecx
        ldmxcsr [mxcsr_before]          ; switch to the requested rounding mode
        movsd   xmm2, [rsi]             ; xmm2 = dividend
        divsd   xmm2, [rdx]             ; xmm2 = dividend / divisor
        stmxcsr [mxcsr_after]           ; capture the flags raised by the division
        ldmxcsr [rbp-24]                ; give the callees the MXCSR we were entered with
        movsd   [xmm], xmm2             ; raw quotient for print_xmm

        mov     rdi, f_div
        movsd   xmm0, [rsi]             ; %.1f  dividend
        movsd   xmm1, [rdx]             ; %.1f  divisor
                                        ; %.16f quotient is already in xmm2
        mov     eax, 3                  ; variadic call: three vector registers used
        call    printf
        call    print_xmm

    ; print MXCSR before and after the division
        mov     rdi, f_before
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_before]     ; 32-bit load: the slot is a single dword
        call    print_mxcsr             ; NOTE(review): external; assumed to take the value in rdi/edi
        mov     rdi, f_after
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_after]
        call    print_mxcsr
        leave
        ret
    ;-----------------------------------------------------------------------
    ; print_xmm -- print "0x", then the 8 bytes stored at xmm, highest
    ;              address first, one print_hex call per byte
    ; ABI:   System V AMD64
    ; In:    none (reads the global xmm)
    ; Out:   nothing meaningful in rax
    ; Uses:  rbx as byte counter (saved and restored here)
    ;-----------------------------------------------------------------------
    print_xmm:
        push    rbp
        mov     rbp, rsp
        push    rbx                     ; callee-saved, so it survives the calls below
        sub     rsp, 8                  ; keep rsp 16-byte aligned at each call

        mov     rdi, hex                ; print 0x
        xor     eax, eax                ; variadic call: al = 0 vector registers
        call    printf

        mov     ebx, 8                  ; rbx = bytes left to print
    .loop:
        movzx   edi, byte [xmm+rbx-1]   ; rdi = next byte, zero-extended
        call    print_hex               ; NOTE(review): external; assumed to take the byte in rdi
        dec     rbx
        jnz     .loop

        add     rsp, 8
        pop     rbx
        leave
        ret
    结果如下:
    Divide, default mxcsr:
    10.0 divided by 2.0 is 5.0000000000000000, in hex: 0x4014000000000000
    mxcsr before:    0001 1111 1000 0000
    mxcsr after:     0001 1111 1000 0000
    
    Divide, default mxcsr:
    10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
    mxcsr before:    0001 1111 1000 0000
    mxcsr after:     0001 1111 1010 0000
    
    Divide by zero, default mxcsr:
    10.0 divided by 0.0 is inf, in hex: 0x7ff0000000000000
    mxcsr before:    0001 1111 1000 0000
    mxcsr after:     0001 1111 1000 0100
    
    Divide, round up:
    10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
    mxcsr before:    0101 1111 1000 0000
    mxcsr after:     0101 1111 1010 0000
    
    Divide, round down:
    10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
    mxcsr before:    0011 1111 1000 0000
    mxcsr after:     0011 1111 1010 0000
    
    Divide, truncate:
    10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
    mxcsr before:    0111 1111 1000 0000
    mxcsr after:     0111 1111 1010 0000
    
    Divide, default mxcsr:
    11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
    mxcsr before:    0001 1111 1000 0000
    mxcsr after:     0001 1111 1010 0000
    
    Divide, round up:
    11.0 divided by 3.0 is 3.6666666666666670, in hex: 0x400d555555555556
    mxcsr before:    0101 1111 1000 0000
    mxcsr after:     0101 1111 1010 0000
    
    Divide, round down:
    11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
    mxcsr before:    0011 1111 1000 0000
    mxcsr after:     0011 1111 1010 0000
    
    Divide, truncate:
    11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
    mxcsr before:    0111 1111 1000 0000
    mxcsr after:     0111 1111 1010 0000
    

    接下来再看一个SSE的例子:
    这是一个内存没对齐的向量求和:

    ; sse_unaligned.asm -- packed float add using the unaligned move forms
    extern  printf

    section .data
    ; single precision, four lanes each
    spvector1:      dd  1.1, 2.2, 3.3, 4.4
    spvector2:      dd  1.1, 2.2, 3.3, 4.4
    ; double precision, two lanes each
    dpvector1:      dq  1.1, 2.2
    dpvector2:      dq  3.3, 4.4

    fmt1:   db  "Single Precision Vector 1: %f, %f, %f, %f",10,0
    fmt2:   db  "Single Precision Vector 2: %f, %f, %f, %f",10,0
    fmt3:   db  "Sum of Single Precision Vector 1 and Vector 2: %f, %f, %f, %f",10,0
    fmt4:   db  "Double Precision Vector 1: %f, %f",10,0
    fmt5:   db  "Double Precision Vector 2: %f, %f",10,0
    fmt6:   db  "Sum of Double Precision Vector 1 and Vector 2: %f, %f",10,0

    section .bss
    spvector_res:   resd    4           ; single-precision sum
    dpvector_res:   resq    4           ; double-precision sum (first two qwords used)
    section .text                           
        global main                 
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Prints two single-precision and two double-precision vectors and their
    ; packed sums, using the unaligned move forms (movups / movupd).
    ; Returns 0.
    ;-----------------------------------------------------------------------
    main:
        push    rbp
        mov     rbp, rsp

    ; print the two single-precision inputs
        mov     rsi, spvector1
        mov     rdi, fmt1
        call    printspfp

        mov     rsi, spvector2
        mov     rdi, fmt2
        call    printspfp

    ; add 2 single-precision vectors, four lanes at once
        movups  xmm0, [spvector1]
        movups  xmm1, [spvector2]
        addps   xmm0, xmm1
        movups  [spvector_res], xmm0
        mov     rsi, spvector_res
        mov     rdi, fmt3
        call    printspfp

    ; print the two double-precision inputs
        mov     rsi, dpvector1
        mov     rdi, fmt4
        call    printdpfp

        mov     rsi, dpvector2
        mov     rdi, fmt5
        call    printdpfp

    ; add 2 double-precision vectors, two lanes at once
        movupd  xmm0, [dpvector1]
        movupd  xmm1, [dpvector2]
        addpd   xmm0, xmm1
        movupd  [dpvector_res], xmm0
        mov     rsi, dpvector_res
        mov     rdi, fmt6
        call    printdpfp

        xor     eax, eax                ; exit status 0, not printf's character count
        leave
        ret
    
    ;-----------------------------------------------------------------------
    ; printspfp -- print four consecutive single-precision floats
    ; In:    rdi = printf format string
    ;        rsi = address of the first float
    ;-----------------------------------------------------------------------
    printspfp:
        push    rbp
        mov     rbp, rsp
    ; load the four lanes
        movss   xmm0, [rsi]
        movss   xmm1, [rsi+4]
        movss   xmm2, [rsi+8]
        movss   xmm3, [rsi+12]
    ; widen each one to double for the variadic call
        cvtss2sd xmm0, xmm0
        cvtss2sd xmm1, xmm1
        cvtss2sd xmm2, xmm2
        cvtss2sd xmm3, xmm3
        mov     eax, 4                  ; four vector registers carry arguments
        call    printf
        pop     rbp                     ; no locals, so rsp already equals rbp
        ret
    
    ;-----------------------------------------------------------------------
    ; printdpfp -- print two consecutive double-precision floats
    ; In:    rdi = printf format string
    ;        rsi = address of the first double
    ;-----------------------------------------------------------------------
    printdpfp:
        push    rbp
        mov     rbp, rsp
        movsd   xmm1, [rsi+8]           ; second double
        movsd   xmm0, [rsi]             ; first double
        mov     eax, 2                  ; two vector registers carry arguments
        call    printf
        pop     rbp                     ; no locals, so rsp already equals rbp
        ret
    结果如下:
    Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
    Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
    Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
    Double Precision Vector 1: 1.100000, 2.200000
    Double Precision Vector 2: 3.300000, 4.400000
    Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000
    

    代码中的关键点如下:
    movups(move unaligned packed single precision),将数据拷贝到xmm0或xmm1寄存器。
    addps(add packed single precision): 将两个xmm寄存器的值按照单精度划分并相加。
    movss(move scalar single precision)将内存数据拷贝到xmm寄存器。
    cvtss2sd(convert scalar single to scalar double):将单精度转成双精度。
    接下来看一个内存对齐的例子:

    ; sse_aligned.asm -- packed float add using the aligned move forms
    extern  printf

    section .data
    dummy:          db  13              ; one byte placed ahead of the align directive
    align 16
    ; single precision, four lanes each
    spvector1:      dd  1.1, 2.2, 3.3, 4.4
    spvector2:      dd  1.1, 2.2, 3.3, 4.4
    ; double precision, two lanes each
    dpvector1:      dq  1.1, 2.2
    dpvector2:      dq  3.3, 4.4

    fmt1:   db  "Single Precision Vector 1: %f, %f, %f, %f",10,0
    fmt2:   db  "Single Precision Vector 2: %f, %f, %f, %f",10,0
    fmt3:   db  "Sum of Single Precision Vector 1 and Vector 2: "
            db  "%f, %f, %f, %f",10,0
    fmt4:   db  "Double Precision Vector 1: %f, %f",10,0
    fmt5:   db  "Double Precision Vector 2: %f, %f",10,0
    fmt6:   db  "Sum of Double Precision Vector 1 and Vector 2: "
            db  "%f, %f",10,0

    section .bss
    alignb 16
    spvector_res:   resd    4           ; single-precision sum
    dpvector_res:   resq    4           ; double-precision sum (first two qwords used)
    section .text                           
        global main                 
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Same demonstration as the unaligned version, but with movaps / movapd
    ; and memory source operands, relying on the align 16 / alignb 16
    ; directives in the data sections. Returns 0.
    ;-----------------------------------------------------------------------
    main:
        push    rbp
        mov     rbp, rsp

    ; print the two single-precision inputs
        mov     rsi, spvector1
        mov     rdi, fmt1
        call    printspfp

        mov     rsi, spvector2
        mov     rdi, fmt2
        call    printspfp

    ; add 2 single-precision vectors; addps reads the aligned operand from memory
        movaps  xmm0, [spvector1]
        addps   xmm0, [spvector2]
        movaps  [spvector_res], xmm0
        mov     rsi, spvector_res
        mov     rdi, fmt3
        call    printspfp

    ; print the two double-precision inputs
        mov     rsi, dpvector1
        mov     rdi, fmt4
        call    printdpfp

        mov     rsi, dpvector2
        mov     rdi, fmt5
        call    printdpfp

    ; add 2 double-precision vectors
        movapd  xmm0, [dpvector1]
        addpd   xmm0, [dpvector2]
        movapd  [dpvector_res], xmm0
        mov     rsi, dpvector_res
        mov     rdi, fmt6
        call    printdpfp

    ; exit
        xor     eax, eax                ; exit status 0, not printf's character count
        mov     rsp, rbp
        pop     rbp                     ; undo the push at the beginning
        ret
    
    ;-----------------------------------------------------------------------
    ; printspfp -- print four consecutive single-precision floats
    ; In:    rdi = printf format string
    ;        rsi = address of the first float
    ;-----------------------------------------------------------------------
    printspfp:
        push    rbp
        mov     rbp, rsp
    ; load the four lanes
        movss   xmm0, [rsi]
        movss   xmm1, [rsi+4]
        movss   xmm2, [rsi+8]
        movss   xmm3, [rsi+12]
    ; printf expects double-precision arguments, so widen each lane
        cvtss2sd xmm0, xmm0
        cvtss2sd xmm1, xmm1
        cvtss2sd xmm2, xmm2
        cvtss2sd xmm3, xmm3
        mov     eax, 4                  ; four vector registers carry arguments
        call    printf
        pop     rbp                     ; no locals, so rsp already equals rbp
        ret
    
    ;-----------------------------------------------------------------------
    ; printdpfp -- print two consecutive double-precision floats
    ; In:    rdi = printf format string
    ;        rsi = address of the first double
    ;-----------------------------------------------------------------------
    printdpfp:
        push    rbp
        mov     rbp, rsp
        movsd   xmm1, [rsi+8]           ; second double
        movsd   xmm0, [rsi]             ; first double
        mov     eax, 2                  ; two vector registers carry arguments
        call    printf
        pop     rbp                     ; no locals, so rsp already equals rbp
        ret
    结果如下:
    Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
    Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
    Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
    Double Precision Vector 1: 1.100000, 2.200000
    Double Precision Vector 2: 3.300000, 4.400000
    Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000
    

    通过使用align 16保证内存按照16字节对齐。
    这时候指令稍有差异:
    movaps(move aligned packed single precision)

    继续看一个SSE操作整数的例子:

    ; sse_integer.asm -- packed 32-bit integer add, then lane reversal
    extern  printf

    section .data
    dummy:          db  13              ; one byte placed ahead of the align directive
    align 16
    ; four packed doublewords each
    pdivector1:     dd  1, 2, 3, 4
    pdivector2:     dd  5, 6, 7, 8

    fmt1:   db  "Packed Integer Vector 1: %d, %d, %d, %d",10,0
    fmt2:   db  "Packed Integer Vector 2: %d, %d, %d, %d",10,0
    fmt3:   db  "Sum Vector: %d, %d, %d, %d",10,0
    fmt4:   db  "Reverse of Sum Vector: %d, %d, %d, %d",10,0

    section .bss
    alignb 16
    pdivector_res:      resd    4       ; sum of the two vectors
    pdivector_other:    resd    4       ; sum with its lanes reversed
    
    section .text                           
        global main                 
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Prints two packed doubleword vectors, their paddd sum, and the sum
    ; with its four lanes reversed via pextrd / pinsrd (SSE4.1).
    ; Only volatile registers (eax, r8d, ecx, edx) are used as scratch, so
    ; no callee-saved register is modified. Returns 0.
    ;-----------------------------------------------------------------------
    main:
        push    rbp
        mov     rbp, rsp

    ; print vector 1
        mov     rsi, pdivector1
        mov     rdi, fmt1
        call    printpdi
    ; print vector 2
        mov     rsi, pdivector2
        mov     rdi, fmt2
        call    printpdi

    ; add 2 aligned doubleword vectors
        movdqa  xmm0, [pdivector1]
        paddd   xmm0, [pdivector2]

    ; store the result in memory
        movdqa  [pdivector_res], xmm0
    ; print the vector in memory
        mov     rsi, pdivector_res
        mov     rdi, fmt3
        call    printpdi

    ; copy the memory vector to xmm3
        movdqa  xmm3, [pdivector_res]

    ; extract the packed values from xmm3
        pextrd  eax, xmm3, 0
        pextrd  r8d, xmm3, 1            ; r8d, not ebx: rbx is callee-saved
        pextrd  ecx, xmm3, 2
        pextrd  edx, xmm3, 3
    ; insert in xmm0 in reverse order (all four lanes get overwritten)
        pinsrd  xmm0, eax, 3
        pinsrd  xmm0, r8d, 2
        pinsrd  xmm0, ecx, 1
        pinsrd  xmm0, edx, 0

    ; print the reversed vector
        movdqa  [pdivector_other], xmm0
        mov     rsi, pdivector_other
        mov     rdi, fmt4
        call    printpdi

    ; exit
        xor     eax, eax                ; exit status 0, not printf's character count
        mov     rsp, rbp
        pop     rbp
        ret
    
    ;-----------------------------------------------------------------------
    ; printpdi -- print four packed 32-bit integers
    ; In:    rdi = printf format string
    ;        rsi = address of the vector (loaded with movdqa)
    ;-----------------------------------------------------------------------
    printpdi:
        push    rbp
        mov     rbp, rsp
        movdqa  xmm0, [rsi]
    ; spread the four lanes of xmm0 over printf's integer argument registers
        movd    esi, xmm0               ; lane 0
        pextrd  edx, xmm0, 1            ; lane 1
        pextrd  ecx, xmm0, 2            ; lane 2
        pextrd  r8d, xmm0, 3            ; lane 3
        xor     eax, eax                ; no vector registers carry arguments
        call    printf
        pop     rbp                     ; no locals, so rsp already equals rbp
        ret
    结果如下:
    Packed Integer Vector 1: 1, 2, 3, 4
    Packed Integer Vector 2: 5, 6, 7, 8
    Sum Vector: 6, 8, 10, 12
    Reverse of Sum Vector: 12, 10, 8, 6
    

    SSE中也有操作字符串比较的指令:


    image.png image.png

    指令的第三个参数imm8是一个立即数,含义如下:


    image.png

    相关文章

      网友评论

          本文标题:汇编学习(11),SIMD之SSE

          本文链接:https://www.haomeiwen.com/subject/cywbqdtx.html