美文网首页
汇编学习(12), SIMD之AVX(已完结)

汇编学习(12), SIMD之AVX(已完结)

作者: android小奉先 | 来源:发表于2022-12-24 12:36 被阅读0次

    本篇介绍

    AVX是SSE的扩展版,用了256位(32字节)的ymm寄存器,本篇看下AVX相关的指令。

    AVX

    AVX(Advanced Vector Extensions)用的是ymm寄存器,每个256位(32字节)。
    先看一个数据运算的例子:

    ; avx_unaligned.asm
    ; Static data for the AVX packed-add demo: two 8-float vectors, two
    ; 4-double vectors, the printf header strings and the result buffers.
    ; NASM syntax, x86-64.
    extern printf

    LF  equ 10                              ; ASCII line feed used in the header strings

    section .data
            ; No align directive is used in this section, so none of the
            ; vectors below is guaranteed to sit on a 32-byte boundary.

            ; packed single precision: 8 x 4 bytes = 32 bytes each
            spvector1   dd  1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1
            spvector2   dd  1.2, 1.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2

            ; packed double precision: 4 x 8 bytes = 32 bytes each
            dpvector1   dq  1.1, 2.2, 3.3, 4.4
            dpvector2   dq  5.5, 6.6, 7.7, 8.8

            ; zero-terminated header lines
            fmt1    db  "Single Precision Vector 1:", LF, 0
            fmt2    db  LF, "Single Precision Vector 2:", LF, 0
            fmt3    db  LF, "Sum of Single Precision Vector 1 and Vector 2:", LF, 0
            fmt4    db  LF, "Double Precision Vector 1:", LF, 0
            fmt5    db  LF, "Double Precision Vector 2:", LF, 0
            fmt6    db  LF, "Sum of Double Precision Vector 1 and Vector 2:", LF, 0

    section .bss
            spvector_res    resd    8       ; room for 8 floats
            dpvector_res    resq    4       ; room for 4 doubles
    section .text
            global main
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Adds spvector1 + spvector2 (8 floats, vaddps) and dpvector1 + dpvector2
    ; (4 doubles, vaddpd) using unaligned AVX loads/stores, stores the sums in
    ; spvector_res / dpvector_res and prints operands and results.
    ; Out:   eax = 0
    ; Clobb: all caller-saved registers (calls printf and the print helpers)
    ;-----------------------------------------------------------------------
    main:
            push    rbp
            mov     rbp, rsp                    ; rsp stays 16-byte aligned for every call below

    ; ---- SINGLE PRECISION FLOATING POINT VECTORS ----
            vmovups ymm0, [rel spvector1]       ; unaligned 256-bit load
            ; Each extract overwrites xmm2 and the value is never read back;
            ; presumably the halves are meant to be inspected in a debugger.
            vextractf128    xmm2, ymm0, 0       ; low 128 bits of ymm0
            vextractf128    xmm2, ymm0, 1       ; high 128 bits of ymm0
            vmovups ymm1, [rel spvector2]
            vextractf128    xmm2, ymm1, 0
            vextractf128    xmm2, ymm1, 1

            vaddps  ymm2, ymm0, ymm1            ; 8 float additions at once
            vmovups [rel spvector_res], ymm2
            vzeroupper                          ; clear upper ymm halves before running SSE code in libc

            lea     rdi, [rel fmt1]
            xor     eax, eax                    ; variadic call: al = 0 vector registers used
            call    printf
            lea     rsi, [rel spvector1]
            call    printspfpv
            lea     rdi, [rel fmt2]
            xor     eax, eax
            call    printf
            lea     rsi, [rel spvector2]
            call    printspfpv
            lea     rdi, [rel fmt3]
            xor     eax, eax
            call    printf
            lea     rsi, [rel spvector_res]
            call    printspfpv

    ; ---- DOUBLE PRECISION FLOATING POINT VECTORS ----
            vmovupd ymm0, [rel dpvector1]
            vextractf128    xmm2, ymm0, 0       ; low 128 bits of ymm0
            vextractf128    xmm2, ymm0, 1       ; high 128 bits of ymm0
            vmovupd ymm1, [rel dpvector2]
            vextractf128    xmm2, ymm1, 0
            vextractf128    xmm2, ymm1, 1

            vaddpd  ymm2, ymm0, ymm1            ; 4 double additions at once
            vmovupd [rel dpvector_res], ymm2
            vzeroupper

            lea     rdi, [rel fmt4]
            xor     eax, eax
            call    printf
            lea     rsi, [rel dpvector1]
            call    printdpfpv
            lea     rdi, [rel fmt5]
            xor     eax, eax
            call    printf
            lea     rsi, [rel dpvector2]
            call    printdpfpv
            lea     rdi, [rel fmt6]
            xor     eax, eax
            call    printf
            lea     rsi, [rel dpvector_res]
            call    printdpfpv

            xor     eax, eax                    ; exit status 0
            leave
            ret
    
    ;-----------------------------------------------------------------------
    ; printspfpv -- print 8 single-precision floats followed by a newline
    ; ABI:   System V AMD64 for the calls it makes; its own argument arrives
    ;        in rsi (rdi is ignored on entry)
    ; In:    rsi = address of 8 consecutive 4-byte floats
    ; Out:   each element printed with "%.1f,  ", then a line feed
    ; Clobb: caller-saved registers (calls printf); rbx and r12 are preserved
    ;-----------------------------------------------------------------------
    printspfpv:
    section .data
            .NL     db  10,0
            .fmt1   db  "%.1f,  ",0
    section .text
            push    rbp
            mov     rbp, rsp
            push    rbx                     ; rbx = address of the next element
            push    r12                     ; r12 = elements left; two pushes keep rsp 16-byte aligned
            mov     rbx, rsi
            mov     r12d, 8
    .loop:
            movss   xmm0, [rbx]
            cvtss2sd xmm0, xmm0             ; printf's %f takes a double
            lea     rdi, [rel .fmt1]
            mov     eax, 1                  ; variadic call: one vector register (xmm0) carries an argument
            call    printf
            add     rbx, 4                  ; next float
            dec     r12d
            jnz     .loop

            lea     rdi, [rel .NL]
            xor     eax, eax                ; no vector arguments
            call    printf
            pop     r12
            pop     rbx
            leave
            ret
    
    ;-----------------------------------------------------------------------
    ; printdpfpv -- print 4 double-precision floats followed by a newline
    ; ABI:   System V AMD64 for the calls it makes; its own argument arrives
    ;        in rsi (rdi is ignored on entry)
    ; In:    rsi = address of 4 consecutive 8-byte doubles
    ; Clobb: caller-saved registers (calls printf)
    ;-----------------------------------------------------------------------
    printdpfpv:
    section .data
            .NL     db  10,0
            .fmt    db  "%.1f,  %.1f,  %.1f,  %.1f",0
    section .text
            push    rbp
            mov     rbp, rsp                ; rsp is 16-byte aligned for both calls
            movsd   xmm0, [rsi]
            movsd   xmm1, [rsi+8]
            movsd   xmm2, [rsi+16]
            movsd   xmm3, [rsi+24]

            lea     rdi, [rel .fmt]
            mov     eax, 4                  ; variadic call: four vector registers carry arguments
            call    printf

            lea     rdi, [rel .NL]
            xor     eax, eax                ; rax still held printf's return value; this call has no vector args
            call    printf
            leave
            ret
    结果如下:
    Single Precision Vector 1:
    1.1,  2.1,  3.1,  4.1,  5.1,  6.1,  7.1,  8.1,  
    
    Single Precision Vector 2:
    1.2,  1.2,  3.2,  4.2,  5.2,  6.2,  7.2,  8.2,  
    
    Sum of Single Precision Vector 1 and Vector 2:
    2.3,  3.3,  6.3,  8.3,  10.3,  12.3,  14.3,  16.3,  
    
    Double Precision Vector 1:
    1.1,  2.2,  3.3,  4.4
    
    Double Precision Vector 2:
    5.5,  6.6,  7.7,  8.8
    
    Sum of Double Precision Vector 1 and Vector 2:
    6.6,  8.8,  11.0,  13.2
    

    vmovups可以将未对齐的数据拷贝到ymm寄存器中。
    vextractf128 可以将ymm中的数据提取出来,每次128位(16字节)。
    接下来再看一个矩阵转置的例子,对比普通指令和AVX的差异,就可以看到AVX指令的性能优势了:

    ; transpose.asm
    ; Static data for the 4x4 transpose benchmark: message strings, the source
    ; matrix, the iteration count, the destination buffer and timestamp slots.
    ; NASM syntax, x86-64.
    extern printf

    section .data
            fmt0    db  "4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE",10,0
            fmt1    db  10,"This is the matrix:",10,0
            fmt2    db  10,"This is the transpose (sequential version): ",10,0
            fmt3    db  10,"This is the transpose (AVX version): ",10,0
            ; The loop count and the cycle counts are passed to printf as
            ; 64-bit values, so they need a 64-bit conversion (%lu, not %d).
            fmt4    db  10,"Number of loops: %lu",10,0
            fmt5    db  "Sequential version elapsed cycles: %lu",10,0
            fmt6    db  "AVX Shuffle version elapsed cycles: %lu",10,0

            align   32                      ; AVX_transpose reads the matrix with vmovapd
            matrix  dq   1.,     2.,     3.,     4.
                    dq   5.,     6.,     7.,     8.
                    dq   9.,    10.,    11.,    12.
                    dq  13.,    14.,    15.,    16.

            loops   dq  10000               ; iterations of each transpose routine

    section .bss
            alignb  32                      ; AVX_transpose writes the result with vmovapd
            transpose   resq    16          ; 4x4 doubles

            ; Timestamp halves. main stores only the 32-bit edx / eax values
            ; from rdtsc / rdtscp into the low half of each 8-byte slot.
            bahi_cy     resq    1           ; AVX version: begin, high half
            balo_cy     resq    1           ; AVX version: begin, low half
            eahi_cy     resq    1           ; AVX version: end, high half
            ealo_cy     resq    1           ; AVX version: end, low half

            bshi_cy     resq    1           ; sequential version: begin, high half
            bslo_cy     resq    1           ; sequential version: begin, low half
            eshi_cy     resq    1           ; sequential version: end, high half
            eslo_cy     resq    1           ; sequential version: end, low half
    
    section .text
            global main
    ;-----------------------------------------------------------------------
    ; int main(void)
    ; ABI:   System V AMD64
    ; Prints the source matrix, runs seq_transpose and AVX_transpose [loops]
    ; times each while bracketing them with time-stamp counter reads, prints
    ; both results and the elapsed cycle counts.
    ; Out:   eax = 0
    ; Clobb: all caller-saved registers; rbx is saved and restored
    ;-----------------------------------------------------------------------
    main:
            push    rbp
            mov     rbp, rsp
            push    rbx                     ; cpuid overwrites rbx, which is callee-saved
            sub     rsp, 8                  ; restore 16-byte alignment after the single push

    ; print title
            lea     rdi, [rel fmt0]
            xor     eax, eax                ; variadic call: al = 0 vector registers used
            call    printf
    ; print matrix
            lea     rdi, [rel fmt1]
            xor     eax, eax
            call    printf
            lea     rsi, [rel matrix]
            call    printm4x4

    ; ---- SEQUENTIAL VERSION ----
    ; start measuring: cpuid acts as a serializing barrier before rdtsc
            xor     eax, eax                ; cpuid leaf 0 (avoid running cpuid with a random leaf)
            cpuid
            rdtsc
            mov     [rel bshi_cy], edx
            mov     [rel bslo_cy], eax
    ; cpuid and rdtsc overwrite rdx, so the arguments are loaded only now
            lea     rdi, [rel matrix]
            lea     rsi, [rel transpose]
            mov     rdx, [rel loops]
            call    seq_transpose
    ; stop measuring
            rdtscp
            mov     [rel eshi_cy], edx
            mov     [rel eslo_cy], eax
            xor     eax, eax
            cpuid

    ; print the result
            lea     rdi, [rel fmt2]
            xor     eax, eax
            call    printf
            lea     rsi, [rel transpose]
            call    printm4x4

    ; ---- AVX VERSION ----
    ; start measuring
            xor     eax, eax
            cpuid
            rdtsc
            mov     [rel bahi_cy], edx
            mov     [rel balo_cy], eax
            lea     rdi, [rel matrix]
            lea     rsi, [rel transpose]
            mov     rdx, [rel loops]
            call    AVX_transpose
    ; stop measuring
            rdtscp
            mov     [rel eahi_cy], edx
            mov     [rel ealo_cy], eax
            xor     eax, eax
            cpuid

    ; print the result
            lea     rdi, [rel fmt3]
            xor     eax, eax
            call    printf
            lea     rsi, [rel transpose]
            call    printm4x4

    ; print the loops
            lea     rdi, [rel fmt4]
            mov     rsi, [rel loops]
            xor     eax, eax
            call    printf

    ; ---- elapsed cycles, sequential version ----
            mov     esi, [rel eshi_cy]      ; 32-bit loads zero-extend into the full register
            shl     rsi, 32
            mov     edx, [rel eslo_cy]
            or      rsi, rdx                ; rsi = end timestamp
            mov     r9d, [rel bshi_cy]
            shl     r9, 32
            mov     r8d, [rel bslo_cy]
            or      r9, r8                  ; r9 = start timestamp
            sub     rsi, r9                 ; rsi = elapsed cycles
            lea     rdi, [rel fmt5]
            xor     eax, eax
            call    printf

    ; ---- elapsed cycles, AVX shuffle version ----
            mov     esi, [rel eahi_cy]
            shl     rsi, 32
            mov     edx, [rel ealo_cy]
            or      rsi, rdx                ; rsi = end timestamp
            mov     r9d, [rel bahi_cy]
            shl     r9, 32
            mov     r8d, [rel balo_cy]
            or      r9, r8                  ; r9 = start timestamp
            sub     rsi, r9                 ; rsi = elapsed cycles
            lea     rdi, [rel fmt6]
            xor     eax, eax
            call    printf

            xor     eax, eax                ; exit status 0
            add     rsp, 8
            pop     rbx
            pop     rbp
            ret
    ;-----------------------------------------------------------------------
    ; void seq_transpose(const double *src, double *dst, uint64_t loops)
    ; ABI:   System V AMD64
    ; Transposes a 4x4 row-major matrix of doubles one element at a time and
    ; repeats the whole transpose `loops` times (benchmark repetition).
    ; In:    rdi = src (16 doubles), rsi = dst (16 doubles), rdx = loops
    ; Out:   dst[c][r] = src[r][c]; dst is left untouched when loops == 0
    ; Clobb: rax, rcx, rdx, r8, r10, xmm0, flags
    ;        (only caller-saved registers; the stack is not used for data)
    ;-----------------------------------------------------------------------
            global seq_transpose            ; exported so a C harness can drive it
    seq_transpose:
            push    rbp
            mov     rbp, rsp
            test    rdx, rdx
            jz      .done                   ; 0 repetitions: dec/jnz below would wrap around
    .loopx:                                 ; one full transpose per iteration
            xor     r10d, r10d              ; r10 = byte offset into src, walks 0..120 linearly
            xor     eax, eax                ; rax = byte offset into dst
            mov     ecx, 4                  ; rcx = source rows left
    .loopo:
            mov     r8d, 4                  ; r8 = elements left in this source row
    .loopi:
            movsd   xmm0, [rdi+r10]
            movsd   [rsi+rax], xmm0
            add     r10, 8                  ; next element of the source row
            add     rax, 32                 ; same column, next destination row
            dec     r8d
            jnz     .loopi
            sub     rax, 4*32 - 8           ; back to destination row 0, one column to the right
            dec     ecx
            jnz     .loopo
            dec     rdx
            jnz     .loopx
    .done:
            leave
            ret
    ;-----------------------------------------------------------------------
    ; void AVX_transpose(const double *src, double *dst, uint64_t loops)
    ; ABI:   System V AMD64
    ; Transposes a 4x4 row-major matrix of doubles with AVX shuffles and
    ; 128-bit lane permutes, repeated `loops` times (benchmark repetition).
    ; In:    rdi = src (16 doubles), rsi = dst (16 doubles), rdx = loops
    ;        src and dst must be 32-byte aligned (accessed with vmovapd)
    ; Out:   dst[c][r] = src[r][c]; dst is left untouched when loops == 0
    ; Clobb: rdx, ymm0-ymm3, ymm12-ymm15, flags
    ;-----------------------------------------------------------------------
            global AVX_transpose            ; exported so a C harness can drive it
    AVX_transpose:
            push    rbp
            mov     rbp, rsp
            test    rdx, rdx
            jz      .done                   ; 0 repetitions: dec/jnz below would wrap around
    .loopx:                                 ; one full transpose per iteration
    ; load the four rows
            vmovapd ymm0, [rdi]             ;  1   2   3   4
            vmovapd ymm1, [rdi+32]          ;  5   6   7   8
            vmovapd ymm2, [rdi+64]          ;  9  10  11  12
            vmovapd ymm3, [rdi+96]          ; 13  14  15  16
    ; interleave pairs of rows inside each 128-bit lane
            vshufpd ymm12, ymm0, ymm1, 0000b    ;  1   5   3   7
            vshufpd ymm13, ymm0, ymm1, 1111b    ;  2   6   4   8
            vshufpd ymm14, ymm2, ymm3, 0000b    ;  9  13  11  15
            vshufpd ymm15, ymm2, ymm3, 1111b    ; 10  14  12  16
    ; combine lanes: 0x20 takes both low lanes, 0x31 takes both high lanes
            vperm2f128 ymm0, ymm12, ymm14, 00100000b    ; 1   5   9  13
            vperm2f128 ymm1, ymm13, ymm15, 00100000b    ; 2   6  10  14
            vperm2f128 ymm2, ymm12, ymm14, 00110001b    ; 3   7  11  15
            vperm2f128 ymm3, ymm13, ymm15, 00110001b    ; 4   8  12  16
    ; store the transposed rows
            vmovapd [rsi],    ymm0
            vmovapd [rsi+32], ymm1
            vmovapd [rsi+64], ymm2
            vmovapd [rsi+96], ymm3
            dec     rdx
            jnz     .loopx
            vzeroupper                      ; clear upper ymm halves before the caller runs SSE code
    .done:
            leave
            ret
    ;-----------------------------------------------------------------------
    ; printm4x4 -- print a 4x4 matrix of doubles, one row per output line
    ; In:    rsi = address of 16 consecutive 8-byte doubles, read row by row
    ;        (rdi is ignored on entry)
    ; Out:   four lines, each "%f<TAB>%f<TAB>%f<TAB>%f<LF>"
    ; Saves: rbx, r15 (callee-saved registers used as scratch)
    ;-----------------------------------------------------------------------
    printm4x4:
    section .data
        .fmt    db  "%f",9,"%f",9, "%f",9,"%f",10,0
    section .text
            push    rbp
            mov     rbp, rsp
            push    rbx                     ; rbx = byte offset of the current row
            push    r15                     ; r15 = stack padding applied around printf
            xor     rbx, rbx
            mov     rcx, 4                  ; rcx = rows still to print
            mov     rdi, .fmt
    .row:
            movsd   xmm3, [rsi+rbx+24]
            movsd   xmm2, [rsi+rbx+16]
            movsd   xmm1, [rsi+rbx+8]
            movsd   xmm0, [rsi+rbx]
            ; printf may overwrite rcx, rsi and rdi, so keep copies on the stack
            push    rcx
            push    rsi
            push    rdi
            ; Pad the stack to a 16-byte boundary for the call. rsp only ever
            ; moves in 8-byte steps here, so rsp mod 16 is either 0 or 8.
            mov     r15, rsp
            and     r15, 0fh
            sub     rsp, r15
            mov     rax, 4                  ; four vector registers carry arguments
            call    printf
            add     rsp, r15                ; drop the padding again
            pop     rdi
            pop     rsi
            pop     rcx
            add     rbx, 32                 ; 4 doubles further: next row
            dec     rcx
            jnz     .row
            pop     r15
            pop     rbx
            leave
            ret
    结果如下:
    4x4 DOUBLE PRECISION FLOATING POINT MATRIX TRANSPOSE
    
    This is the matrix:
    1.000000    2.000000    3.000000    4.000000
    5.000000    6.000000    7.000000    8.000000
    9.000000    10.000000   11.000000   12.000000
    13.000000   14.000000   15.000000   16.000000
    
    This is the transpose (sequential version): 
    1.000000    5.000000    9.000000    13.000000
    2.000000    6.000000    10.000000   14.000000
    3.000000    7.000000    11.000000   15.000000
    4.000000    8.000000    12.000000   16.000000
    
    This is the transpose (AVX version): 
    1.000000    5.000000    9.000000    13.000000
    2.000000    6.000000    10.000000   14.000000
    3.000000    7.000000    11.000000   15.000000
    4.000000    8.000000    12.000000   16.000000
    
    Number of loops: 10000
    Sequential version elapsed cycles: 8654387
    AVX Shuffle version elapsed cycles: 814357
    

    总结

    本次汇编学习到此就先结束了,有了这些基础,相信接下来看汇编代码会容易很多。

    相关文章

      网友评论

          本文标题:汇编学习(12), SIMD之AVX(已完结)

          本文链接:https://www.haomeiwen.com/subject/rsniqdtx.html