本篇介绍
SIMD(Single Instruction, Multiple Data,单指令多数据)可以实现高性能运算,本篇介绍下这块内容。
SSE
SIMD就是一条指令可以操作多个数据,有多种实现方法,比如SSE,AVX。
SSE(Streaming SIMD Extensions)在64位模式下拥有16个128位的寄存器,也就是之前接触过的xmm0到xmm15,还有一个控制寄存器mxcsr。这些寄存器既可以存放浮点数,也可以存放整数,数据形式分为scalar和packed两种:scalar是单个数据,而packed数据则是多个数据打包在一起。一个xmm寄存器可以存放如下的值:
- 2个64位的双精度浮点
- 4个32位的单精度浮点
- 2个64位的整数
- 4个32位的整数
- 8 个16位的整数
- 16个8位的字符
可以总结成如下表所示:
image.png
对于AVX,用的是256位的ymm寄存器,另外AVX-512还提供了512位的zmm寄存器。
接下来看下SSE寄存器:
image.png
再看一个代码,可以看到mxcsr寄存器的值含义:
; mxcsr.asm
; Divides pairs of doubles with divsd under different MXCSR settings and
; prints each quotient together with the MXCSR value before and after.
extern printf
extern print_mxcsr
extern print_hex
section .data
; double-precision operands; passed to apply_mxcsr by address
eleven dq 11.0
two dq 2.0
three dq 3.0
ten dq 10.0
zero dq 0.0
; prefix printed by print_xmm ahead of the hex dump of the quotient
hex db "0x",0
; headings, one per experiment (10 = newline, 0 = string terminator)
fmt1 db 10,"Divide, default mxcsr:",10,0
fmt2 db 10,"Divide by zero, default mxcsr:",10,0
fmt4 db 10,"Divide, round up:",10,0
fmt5 db 10,"Divide, round down:",10,0
fmt6 db 10,"Divide, truncate:",10,0
; format of the result line: dividend, divisor and quotient as doubles
f_div db "%.1f divided by %.1f is %.16f, in hex: ",0
; labels printed in front of each MXCSR dump (9 = tab)
f_before db 10,"mxcsr before:",9,0
f_after db "mxcsr after:",9,0
; MXCSR values loaded with ldmxcsr. Per the Intel SDM layout, bits 7-12 are
; the exception masks (all set in every value below) and bits 13-14 select
; the rounding mode: 00 = nearest, 01 = down, 10 = up, 11 = truncate.
default_mxcsr dd 0001111110000000b
round_nearest dd 0001111110000000b ; same bits as default_mxcsr; not referenced by the code in this listing
round_down dd 0011111110000000b
round_up dd 0101111110000000b
truncate dd 0111111110000000b
section .bss
mxcsr_before resd 1 ; MXCSR value loaded just before the division
mxcsr_after resd 1 ; MXCSR value stored right after the division
xmm resq 1 ; copy of the quotient, read byte by byte by print_xmm
section .text
global main

;-----------------------------------------------------------------------
; DIVIDE_DEMO heading, dividend, divisor, mxcsr_value
; Loads the registers apply_mxcsr expects and calls it.
;   %1 = label of the heading string
;   %2 = label of the double-precision dividend
;   %3 = label of the double-precision divisor
;   %4 = label of the dword holding the MXCSR value to load
; Clobbers: rdi, rsi, rdx, rcx and everything apply_mxcsr clobbers.
;-----------------------------------------------------------------------
%macro DIVIDE_DEMO 4
        mov     rdi, %1
        mov     rsi, %2
        mov     rdx, %3
        mov     ecx, [%4]
        call    apply_mxcsr
%endmacro

;-----------------------------------------------------------------------
; main
; Runs the same division under several MXCSR settings so the effect of
; the rounding-control bits can be compared in the printed output.
; The frame is set up with a plain push rbp / mov rbp,rsp: rbp is
; callee-saved, so it must be pushed BEFORE it is overwritten.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

        ; 10 / 2: exact quotient, default MXCSR
        DIVIDE_DEMO fmt1, ten, two, default_mxcsr

        ;----------------------------------------------
        ; 10 / 3: inexact quotient, default MXCSR
        DIVIDE_DEMO fmt1, ten, three, default_mxcsr

        ; 10 / 0: divide by zero, default MXCSR
        DIVIDE_DEMO fmt2, ten, zero, default_mxcsr

        ; 10 / 3: inexact quotient, round up
        DIVIDE_DEMO fmt4, ten, three, round_up

        ; 10 / 3: inexact quotient, round down
        DIVIDE_DEMO fmt5, ten, three, round_down

        ; 10 / 3: inexact quotient, truncate
        DIVIDE_DEMO fmt6, ten, three, truncate

        ;----------------------------------------------
        ; 11 / 3: inexact quotient, default MXCSR
        DIVIDE_DEMO fmt1, eleven, three, default_mxcsr

        ; 11 / 3: inexact quotient, round up
        DIVIDE_DEMO fmt4, eleven, three, round_up

        ; 11 / 3: inexact quotient, round down
        DIVIDE_DEMO fmt5, eleven, three, round_down

        ; 11 / 3: inexact quotient, truncate
        DIVIDE_DEMO fmt6, eleven, three, truncate

        leave
        ret
;-----------------------------------------------------------------------
; apply_mxcsr
; Prints a heading, loads a new MXCSR value, divides two doubles with
; divsd, then prints the quotient (decimal and hex) and the MXCSR value
; before and after the division.
; In:     rdi = address of the heading string (printed via printf)
;         rsi = address of the double-precision dividend
;         rdx = address of the double-precision divisor
;         ecx = value to load into MXCSR
; Writes: mxcsr_before, mxcsr_after, xmm
; Calls:  printf, print_xmm, print_mxcsr
; NOTE(review): the MXCSR value loaded here stays in effect for the
; calls below and after this function returns; the SysV ABI treats the
; MXCSR control bits as callee-saved - confirm this is acceptable for
; the demo.
;-----------------------------------------------------------------------
apply_mxcsr:
        push    rbp
        mov     rbp, rsp

        ; rsi/rdx/rcx are caller-saved, so park them across the printf call
        push    rsi
        push    rdx
        push    rcx
        sub     rsp, 8                  ; padding: keep rsp 16-byte aligned at the call
        xor     eax, eax                ; variadic call: al = 0 vector registers used
        call    printf                  ; heading (rdi set by the caller)
        add     rsp, 8
        pop     rcx
        pop     rdx
        pop     rsi

        mov     [mxcsr_before], ecx
        ldmxcsr [mxcsr_before]          ; activate the requested MXCSR value
        movsd   xmm2, [rsi]             ; xmm2 = dividend
        divsd   xmm2, [rdx]             ; xmm2 = dividend / divisor
        stmxcsr [mxcsr_after]           ; capture MXCSR, including any status flags raised
        movsd   [xmm], xmm2             ; keep the quotient for print_xmm

        mov     rdi, f_div
        movsd   xmm0, [rsi]             ; 1st double: dividend
        movsd   xmm1, [rdx]             ; 2nd double: divisor (3rd, the quotient, is already in xmm2)
        mov     eax, 3                  ; variadic call: three vector registers used
        call    printf
        call    print_xmm

        ; print both MXCSR values
        mov     rdi, f_before
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_before]     ; 32-bit load: the variable is a single dword
        call    print_mxcsr

        mov     rdi, f_after
        xor     eax, eax
        call    printf
        mov     edi, [mxcsr_after]      ; 32-bit load: avoids reading into the bytes of xmm
        call    print_mxcsr

        leave
        ret
;-----------------------------------------------------------------------
; print_xmm
; Prints "0x" and then passes the 8 bytes stored at xmm to print_hex one
; at a time, from the highest address down to the lowest (most
; significant byte first on little-endian x86).
; In:    none (reads the memory at label xmm)
; Calls: printf, print_hex
; Clobb: rax, rcx, rdi and whatever the callees clobber
;-----------------------------------------------------------------------
print_xmm:
        push    rbp
        mov     rbp, rsp

        mov     rdi, hex                ; print the "0x" prefix
        xor     eax, eax                ; variadic call: al = 0 vector registers used
        call    printf

        mov     ecx, 8                  ; rcx = bytes left to print
.loop:
        movzx   edi, byte [xmm + rcx - 1] ; next byte, zero-extended into rdi
        push    rcx                     ; rcx is caller-saved: keep the counter
        push    rcx                     ; second push keeps rsp 16-byte aligned
        call    print_hex
        pop     rcx
        pop     rcx
        dec     rcx
        jnz     .loop

        leave
        ret
结果如下:
Divide, default mxcsr:
10.0 divided by 2.0 is 5.0000000000000000, in hex: 0x4014000000000000
mxcsr before: 0001 1111 1000 0000
mxcsr after: 0001 1111 1000 0000
Divide, default mxcsr:
10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
mxcsr before: 0001 1111 1000 0000
mxcsr after: 0001 1111 1010 0000
Divide by zero, default mxcsr:
10.0 divided by 0.0 is inf, in hex: 0x7ff0000000000000
mxcsr before: 0001 1111 1000 0000
mxcsr after: 0001 1111 1000 0100
Divide, round up:
10.0 divided by 3.0 is 3.3333333333333335, in hex: 0x400aaaaaaaaaaaab
mxcsr before: 0101 1111 1000 0000
mxcsr after: 0101 1111 1010 0000
Divide, round down:
10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
mxcsr before: 0011 1111 1000 0000
mxcsr after: 0011 1111 1010 0000
Divide, truncate:
10.0 divided by 3.0 is 3.3333333333333330, in hex: 0x400aaaaaaaaaaaaa
mxcsr before: 0111 1111 1000 0000
mxcsr after: 0111 1111 1010 0000
Divide, default mxcsr:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before: 0001 1111 1000 0000
mxcsr after: 0001 1111 1010 0000
Divide, round up:
11.0 divided by 3.0 is 3.6666666666666670, in hex: 0x400d555555555556
mxcsr before: 0101 1111 1000 0000
mxcsr after: 0101 1111 1010 0000
Divide, round down:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before: 0011 1111 1000 0000
mxcsr after: 0011 1111 1010 0000
Divide, truncate:
11.0 divided by 3.0 is 3.6666666666666665, in hex: 0x400d555555555555
mxcsr before: 0111 1111 1000 0000
mxcsr after: 0111 1111 1010 0000
接下来再看一个SSE的例子:
这是一个内存没对齐的向量求和:
; sse_unaligned.asm
; Adds two packed single-precision and two packed double-precision vectors
; using the unaligned move instructions (movups / movupd).
extern printf
section .data
; no align directive in this section: the vectors are only accessed with
; unaligned moves
;single precision
; two vectors of four 32-bit floats each
spvector1 dd 1.1
dd 2.2
dd 3.3
dd 4.4
spvector2 dd 1.1
dd 2.2
dd 3.3
dd 4.4
;double precision
; two vectors of two 64-bit doubles each
dpvector1 dq 1.1
dq 2.2
dpvector2 dq 3.3
dq 4.4
; printf formats (10 = newline, 0 = string terminator)
fmt1 db "Single Precision Vector 1: %f, %f, %f, %f",10,0
fmt2 db "Single Precision Vector 2: %f, %f, %f, %f",10,0
fmt3 db "Sum of Single Precision Vector 1 and Vector 2: %f, %f, %f, %f",10,0
fmt4 db "Double Precision Vector 1: %f, %f",10,0
fmt5 db "Double Precision Vector 2: %f, %f",10,0
fmt6 db "Sum of Double Precision Vector 1 and Vector 2: %f, %f",10,0
section .bss
spvector_res resd 4 ; receives the four-float sum
dpvector_res resq 4 ; receives the two-double sum; only the first 16 bytes are written by main
section .text
global main

;-----------------------------------------------------------------------
; main
; Prints two four-float vectors and their element-wise sum, then two
; two-double vectors and their element-wise sum. All vector loads and
; stores use the unaligned forms (movups / movupd).
; The print helpers take rdi = printf format, rsi = vector address.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

        ; ---- single precision: show both inputs ----
        lea     rdi, [rel fmt1]
        lea     rsi, [rel spvector1]
        call    printspfp

        lea     rdi, [rel fmt2]
        lea     rsi, [rel spvector2]
        call    printspfp

        ; xmm0 = spvector1 + spvector2 (four lanes at once)
        movups  xmm1, [rel spvector2]
        movups  xmm0, [rel spvector1]
        addps   xmm0, xmm1
        movups  [rel spvector_res], xmm0

        lea     rdi, [rel fmt3]
        lea     rsi, [rel spvector_res]
        call    printspfp

        ; ---- double precision: show both inputs ----
        lea     rdi, [rel fmt4]
        lea     rsi, [rel dpvector1]
        call    printdpfp

        lea     rdi, [rel fmt5]
        lea     rsi, [rel dpvector2]
        call    printdpfp

        ; xmm0 = dpvector1 + dpvector2 (two lanes at once)
        movupd  xmm1, [rel dpvector2]
        movupd  xmm0, [rel dpvector1]
        addpd   xmm0, xmm1
        movupd  [rel dpvector_res], xmm0

        lea     rdi, [rel fmt6]
        lea     rsi, [rel dpvector_res]
        call    printdpfp

        mov     rsp, rbp
        pop     rbp
        ret
;-----------------------------------------------------------------------
; printspfp
; Prints four consecutive single-precision floats with printf.
; In:    rdi = printf format string
;        rsi = address of the first float
; Clobb: rax, xmm0-xmm3 and whatever printf clobbers
;-----------------------------------------------------------------------
printspfp:
        push    rbp
        mov     rbp, rsp

        ; load the four floats ...
        movss   xmm0, [rsi]
        movss   xmm1, [rsi + 4]
        movss   xmm2, [rsi + 8]
        movss   xmm3, [rsi + 12]

        ; ... and widen each one to double before handing it to printf
        cvtss2sd xmm0, xmm0
        cvtss2sd xmm1, xmm1
        cvtss2sd xmm2, xmm2
        cvtss2sd xmm3, xmm3

        mov     eax, 4                  ; variadic call: four vector registers used
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
;-----------------------------------------------------------------------
; printdpfp
; Prints two consecutive double-precision floats with printf.
; In:    rdi = printf format string
;        rsi = address of the first double
; Clobb: rax, xmm0, xmm1 and whatever printf clobbers
;-----------------------------------------------------------------------
printdpfp:
        push    rbp
        mov     rbp, rsp

        movsd   xmm1, [rsi + 8]         ; second element
        movsd   xmm0, [rsi]             ; first element
        mov     eax, 2                  ; variadic call: two vector registers used
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下:
Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
Double Precision Vector 1: 1.100000, 2.200000
Double Precision Vector 2: 3.300000, 4.400000
Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000
代码中的关键点如下:
movups(move unaligned packed single precision),将数据拷贝到xmm0或xmm1寄存器。
addps(add packed single precision): 将两个xmm寄存器的值按照单精度划分并相加。
movss(move scalar single precision)将内存数据拷贝到xmm寄存器。
cvtss2sd(convert scalar single to scalar double):将单精度转成双精度。
接下来看一个内存对齐的例子:
; sse_aligned.asm
; Adds two packed single-precision and two packed double-precision vectors
; using the aligned move instructions (movaps / movapd).
extern printf
section .data
; single byte ahead of the vectors; not referenced by the code in this
; listing - presumably there to push the location counter off a 16-byte
; boundary so that the align directive below has a visible effect
dummy db 13
; every vector below is 16 bytes long, so aligning the first one keeps all
; four of them on 16-byte boundaries
align 16
; two vectors of four 32-bit floats each
spvector1 dd 1.1
dd 2.2
dd 3.3
dd 4.4
spvector2 dd 1.1
dd 2.2
dd 3.3
dd 4.4
; two vectors of two 64-bit doubles each
dpvector1 dq 1.1
dq 2.2
dpvector2 dq 3.3
dq 4.4
; printf formats (10 = newline, 0 = string terminator)
fmt1 db "Single Precision Vector 1: %f, %f, %f, %f",10,0
fmt2 db "Single Precision Vector 2: %f, %f, %f, %f",10,0
fmt3 db "Sum of Single Precision Vector 1 and Vector 2: "
db "%f, %f, %f, %f",10,0
fmt4 db "Double Precision Vector 1: %f, %f",10,0
fmt5 db "Double Precision Vector 2: %f, %f",10,0
fmt6 db "Sum of Double Precision Vector 1 and Vector 2: "
db "%f, %f",10,0
section .bss
; alignb is the alignment directive used for reserved (uninitialized) space
alignb 16
spvector_res resd 4 ; receives the four-float sum (16 bytes)
dpvector_res resq 4 ; receives the two-double sum; only the first 16 bytes are written by main
section .text
global main

;-----------------------------------------------------------------------
; main
; Prints two four-float vectors and their element-wise sum, then two
; two-double vectors and their element-wise sum. All vector accesses use
; the aligned forms (movaps / movapd, and memory operands on addps /
; addpd), which require the operands to sit on 16-byte boundaries.
; The print helpers take rdi = printf format, rsi = vector address.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

        ; ---- single precision: show both inputs ----
        lea     rdi, [rel fmt1]
        lea     rsi, [rel spvector1]
        call    printspfp

        lea     rdi, [rel fmt2]
        lea     rsi, [rel spvector2]
        call    printspfp

        ; xmm0 = spvector1 + spvector2, second operand taken straight from memory
        movaps  xmm0, [rel spvector1]
        addps   xmm0, [rel spvector2]
        movaps  [rel spvector_res], xmm0

        lea     rdi, [rel fmt3]
        lea     rsi, [rel spvector_res]
        call    printspfp

        ; ---- double precision: show both inputs ----
        lea     rdi, [rel fmt4]
        lea     rsi, [rel dpvector1]
        call    printdpfp

        lea     rdi, [rel fmt5]
        lea     rsi, [rel dpvector2]
        call    printdpfp

        ; xmm0 = dpvector1 + dpvector2, second operand taken straight from memory
        movapd  xmm0, [rel dpvector1]
        addpd   xmm0, [rel dpvector2]
        movapd  [rel dpvector_res], xmm0

        lea     rdi, [rel fmt6]
        lea     rsi, [rel dpvector_res]
        call    printdpfp

        leave                           ; restore rsp and the saved rbp
        ret
;-----------------------------------------------------------------------
; printspfp
; Prints four consecutive single-precision floats with printf.
; In:    rdi = printf format string
;        rsi = address of the first float
; Clobb: rax, xmm0-xmm3 and whatever printf clobbers
;-----------------------------------------------------------------------
printspfp:
        push    rbp
        mov     rbp, rsp

        ; fetch the four elements one scalar at a time
        movss   xmm0, [rsi]
        movss   xmm1, [rsi + 4]
        movss   xmm2, [rsi + 8]
        movss   xmm3, [rsi + 12]

        ; printf expects double-precision arguments, so widen each float
        cvtss2sd xmm0, xmm0
        cvtss2sd xmm1, xmm1
        cvtss2sd xmm2, xmm2
        cvtss2sd xmm3, xmm3

        mov     eax, 4                  ; variadic call: four vector registers used
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
;-----------------------------------------------------------------------
; printdpfp
; Prints two consecutive double-precision floats with printf.
; In:    rdi = printf format string
;        rsi = address of the first double
; Clobb: rax, xmm0, xmm1 and whatever printf clobbers
;-----------------------------------------------------------------------
printdpfp:
        push    rbp
        mov     rbp, rsp

        movsd   xmm1, [rsi + 8]         ; second element
        movsd   xmm0, [rsi]             ; first element
        mov     eax, 2                  ; variadic call: two vector registers used
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下:
Single Precision Vector 1: 1.100000, 2.200000, 3.300000, 4.400000
Single Precision Vector 2: 1.100000, 2.200000, 3.300000, 4.400000
Sum of Single Precision Vector 1 and Vector 2: 2.200000, 4.400000, 6.600000, 8.800000
Double Precision Vector 1: 1.100000, 2.200000
Double Precision Vector 2: 3.300000, 4.400000
Sum of Double Precision Vector 1 and Vector 2: 4.400000, 6.600000
通过在.data段使用align 16、在.bss段使用alignb 16,保证这些向量按照16字节对齐。
这时候指令稍有差异:
movaps(move aligned packed single precision)
继续看一个SSE操作整数的例子:
; sse_integer.asm
; Adds two vectors of four 32-bit integers with paddd and reverses the
; element order of the sum with pextrd / pinsrd.
extern printf
section .data
; single byte ahead of the vectors; not referenced by the code in this
; listing - presumably there to push the location counter off a 16-byte
; boundary so that the align directive below has a visible effect
dummy db 13
; both vectors are 16 bytes long, so aligning the first one keeps both on
; 16-byte boundaries (they are accessed with movdqa / paddd memory operands)
align 16
; two vectors of four 32-bit integers each
pdivector1 dd 1
dd 2
dd 3
dd 4
pdivector2 dd 5
dd 6
dd 7
dd 8
; printf formats (10 = newline, 0 = string terminator)
fmt1 db "Packed Integer Vector 1: %d, %d, %d, %d",10,0
fmt2 db "Packed Integer Vector 2: %d, %d, %d, %d",10,0
fmt3 db "Sum Vector: %d, %d, %d, %d",10,0
fmt4 db "Reverse of Sum Vector: %d, %d, %d, %d",10,0
section .bss
; alignb is the alignment directive used for reserved (uninitialized) space
alignb 16
pdivector_res resd 4 ; receives the element-wise sum
pdivector_other resd 4 ; receives the sum with its elements reversed
section .text
global main

;-----------------------------------------------------------------------
; main
; Prints two vectors of four 32-bit integers, their element-wise sum
; (paddd) and the sum with its elements in reverse order (built with
; pextrd / pinsrd, which are SSE4.1 instructions).
; printpdi takes rdi = printf format, rsi = address of an aligned vector.
; Only caller-saved registers are used as scratch, so no callee-saved
; register (rbx in particular) needs to be preserved.
;-----------------------------------------------------------------------
main:
        push    rbp
        mov     rbp, rsp

        ; print vector 1
        lea     rdi, [rel fmt1]
        lea     rsi, [rel pdivector1]
        call    printpdi

        ; print vector 2
        lea     rdi, [rel fmt2]
        lea     rsi, [rel pdivector2]
        call    printpdi

        ; add the two aligned doubleword vectors and store the result
        movdqa  xmm0, [rel pdivector1]
        paddd   xmm0, [rel pdivector2]
        movdqa  [rel pdivector_res], xmm0

        ; print the sum from memory
        lea     rdi, [rel fmt3]
        lea     rsi, [rel pdivector_res]
        call    printpdi

        ; reload the sum: xmm registers are not preserved across the call
        movdqa  xmm3, [rel pdivector_res]

        ; extract the four packed elements into scratch registers
        pextrd  eax, xmm3, 0
        pextrd  r8d, xmm3, 1            ; r8 is caller-saved, unlike rbx
        pextrd  ecx, xmm3, 2
        pextrd  edx, xmm3, 3

        ; insert them into xmm0 in reverse order (all four lanes overwritten)
        pinsrd  xmm0, eax, 3
        pinsrd  xmm0, r8d, 2
        pinsrd  xmm0, ecx, 1
        pinsrd  xmm0, edx, 0

        ; print the reversed vector
        movdqa  [rel pdivector_other], xmm0
        lea     rdi, [rel fmt4]
        lea     rsi, [rel pdivector_other]
        call    printpdi

        ; exit
        mov     rsp, rbp
        pop     rbp
        ret
;-----------------------------------------------------------------------
; printpdi
; Prints four packed 32-bit integers with printf.
; In:    rdi = printf format string
;        rsi = address of the vector (loaded with movdqa, so it must be
;              16-byte aligned)
; Clobb: rax, rcx, rdx, rsi, r8, xmm0 and whatever printf clobbers
;-----------------------------------------------------------------------
printpdi:
        push    rbp
        mov     rbp, rsp

        movdqa  xmm0, [rsi]             ; whole vector in one aligned load

        ; spread the four lanes of xmm0 over printf's integer argument registers
        pextrd  r8d, xmm0, 3
        pextrd  ecx, xmm0, 2
        pextrd  edx, xmm0, 1
        pextrd  esi, xmm0, 0

        xor     eax, eax                ; variadic call: no vector registers used
        call    printf

        mov     rsp, rbp
        pop     rbp
        ret
结果如下:
Packed Integer Vector 1: 1, 2, 3, 4
Packed Integer Vector 2: 5, 6, 7, 8
Sum Vector: 6, 8, 10, 12
Reverse of Sum Vector: 12, 10, 8, 6
SSE中也有操作字符串比较的指令:
image.png image.png
指令的第三个参数imm8是一个立即数,含义如下:
image.png
网友评论