Marsaglia's XORSHIFT32 in PSIMD
Written 2018-08-03
Tags:PRNG XORSHIFT PSIMD
Here's a simple C program using PSIMD to implement Marsaglia's XORSHIFT32 PRNG with four parallel generators.
#include "psimd.h" #include <stdio.h> PSIMD_INTRINSIC psimd_u32 psimd_lsr_u32(psimd_u32 a, uint32_t b) {return a >> b;} PSIMD_INTRINSIC psimd_u32 psimd_lsl_u32(psimd_u32 a, uint32_t b) {return a << b;} PSIMD_INTRINSIC psimd_u32 psimd_xor_u32(psimd_u32 a, psimd_u32 b) {return a ^ b;} psimd_u32 xorshift32_4( psimd_u32 x ) { x = psimd_xor_u32( x, psimd_lsl_u32( x, 13 ) ); x = psimd_xor_u32( x, psimd_lsr_u32( x, 17 ) ); x = psimd_xor_u32( x, psimd_lsl_u32( x, 5 ) ); return x; } uint32_t xorshift32(uint32_t x) { x ^= x << 13; x ^= x >> 17; x ^= x << 5; return x; } #define print_psimd_u32( _x ) printf("%08x %08x %08x %08x\n", _x[0], _x[1], _x[2], _x[3] ) int main() { psimd_u32 state_start = { 1, 2, 3, 4 }; print_psimd_u32( state_start ); psimd_u32 state_vector = xorshift32_4( state_start ); print_psimd_u32( state_vector ); psimd_u32 state_scalar = { xorshift32(state_start[0]),xorshift32(state_start[1]),xorshift32(state_start[2]),xorshift32(state_start[3]) }; print_psimd_u32( state_vector ); return 0; }
And we get the same results for both PSIMD and scalar execution
0x00000001 | 0x00000002 | 0x00000003 | 0x00000004 | Starting States |
0x00042021 | 0x00084042 | 0x000c6063 | 0x00108084 | PSIMD States |
0x00042021 | 0x00084042 | 0x000c6063 | 0x00108084 | Scalar States |
With GCC6.3, -O3, assembly for the two algorithms comes out to the same number of instructions, even though one does four times the work.
xorshift32_4: .cfi_startproc movdqa %xmm0, %xmm1 pslld $13, %xmm1 pxor %xmm0, %xmm1 movdqa %xmm1, %xmm0 psrld $17, %xmm0 pxor %xmm1, %xmm0 movdqa %xmm0, %xmm1 pslld $5, %xmm1 pxor %xmm1, %xmm0 ret .cfi_endproc xorshift32: .cfi_startproc movl %edi, %eax sall $13, %edi xorl %edi, %eax movl %eax, %edi shrl $17, %edi xorl %edi, %eax movl %eax, %edi sall $5, %edi xorl %edi, %eax ret .cfi_endproc