Marsaglia's XORSHIFT32 in PSIMD
Written 2018-08-03
Tags:PRNG XORSHIFT PSIMD
Here's a simple C program using PSIMD to implement Marsaglia's XORSHIFT32 PRNG with four parallel generators.
#include "psimd.h"
#include <stdio.h>
PSIMD_INTRINSIC psimd_u32 psimd_lsr_u32(psimd_u32 a, uint32_t b) {return a >> b;}
PSIMD_INTRINSIC psimd_u32 psimd_lsl_u32(psimd_u32 a, uint32_t b) {return a << b;}
PSIMD_INTRINSIC psimd_u32 psimd_xor_u32(psimd_u32 a, psimd_u32 b) {return a ^ b;}
psimd_u32 xorshift32_4( psimd_u32 x )
{
x = psimd_xor_u32( x, psimd_lsl_u32( x, 13 ) );
x = psimd_xor_u32( x, psimd_lsr_u32( x, 17 ) );
x = psimd_xor_u32( x, psimd_lsl_u32( x, 5 ) );
return x;
}
uint32_t xorshift32(uint32_t x)
{
x ^= x << 13;
x ^= x >> 17;
x ^= x << 5;
return x;
}
#define print_psimd_u32( _x ) printf("%08x %08x %08x %08x\n", _x[0], _x[1], _x[2], _x[3] )
int main()
{
psimd_u32 state_start = { 1, 2, 3, 4 };
print_psimd_u32( state_start );
psimd_u32 state_vector = xorshift32_4( state_start );
print_psimd_u32( state_vector );
psimd_u32 state_scalar = { xorshift32(state_start[0]),xorshift32(state_start[1]),xorshift32(state_start[2]),xorshift32(state_start[3]) };
print_psimd_u32( state_vector );
return 0;
}
And we get the same results for both PSIMD and scalar execution
| 0x00000001 | 0x00000002 | 0x00000003 | 0x00000004 | Starting States |
| 0x00042021 | 0x00084042 | 0x000c6063 | 0x00108084 | PSIMD States |
| 0x00042021 | 0x00084042 | 0x000c6063 | 0x00108084 | Scalar States |
With GCC6.3, -O3, assembly for the two algorithms comes out to the same number of instructions, even though one does four times the work.
xorshift32_4:
.cfi_startproc
movdqa %xmm0, %xmm1
pslld $13, %xmm1
pxor %xmm0, %xmm1
movdqa %xmm1, %xmm0
psrld $17, %xmm0
pxor %xmm1, %xmm0
movdqa %xmm0, %xmm1
pslld $5, %xmm1
pxor %xmm1, %xmm0
ret
.cfi_endproc
xorshift32:
.cfi_startproc
movl %edi, %eax
sall $13, %edi
xorl %edi, %eax
movl %eax, %edi
shrl $17, %edi
xorl %edi, %eax
movl %eax, %edi
sall $5, %edi
xorl %edi, %eax
ret
.cfi_endproc