ArrogantHair
Member
Feb 6, 2025
I ain't a SIMD guy. How's my synthetic AVX-512 benchmark?
As noted in this post: http://www.portvapes.co.uk/?id=Latest-exam-1Z0-876-Dumps&exid=threads/rudi_float_bench-v0-02a.2628323/post-41412252
Kinda strange benchmark, I'm seeing almost 100% thread scaling on a 16-core Zen5 with SMT enabled/disabled.
Nice! Something to play with
A loop-carried dependency will not allow hitting optimal performance, since every iteration depends on the result of the previous one. Most CPUs can execute 2 FP adds per cycle, but with a single dependency chain the second add unit sits idle.
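For comparison, here's a minimal sketch of what such a single serial chain looks like (rudiserial is my hypothetical name, my guess at what the original inner loop reduces to, not the actual rudi_float_bench code):

#include <immintrin.h>

// One accumulator: every _mm512_add_ps waits on the previous one,
// so throughput is capped at 1 add per FADD latency regardless of
// how many add units the core has.
float rudiserial(int subLoop = 256)
{
    __m512 a = _mm512_set1_ps(1.0f);
    __m512 b = _mm512_setzero_ps();
    for (int i = 0; i < subLoop; ++i)
        b = _mm512_add_ps(b, a); // loop-carried dependency on b
    return _mm512_cvtss_f32(b);  // keep the result observable
}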
Well, if you read my post from March:
The optimal unroll factor is the number of execution units × the latency of the operation, provided you have enough architectural registers. For Zen5 that works out to 6 independent 512b adds (2 add pipes × 3 cycles of latency = 6 chains); for Zen4 it's 3 512b adds, since Zen4 double-pumps 512b ops through 256b datapaths, but 6 256b adds. The easiest way to write that would be inline assembly, if you ditch MSVC for clang; then you can also get rid of the volatile. Not necessary, though.
#include <immintrin.h>

// Keeps MAX_PARALLEL independent accumulator chains in flight so the
// FP add units can work in parallel instead of stalling on one chain.
template<unsigned int MAX_PARALLEL = 6>
int rudiparallel(int subLoop = 256)
{
    if (subLoop <= 0)
        return 0;

    __m512 a[MAX_PARALLEL];
    __m512 b[MAX_PARALLEL];
    volatile float va[MAX_PARALLEL]; // volatile stops the compiler from constant-folding the whole loop away

    unsigned int maxp = 0;
    do {
        va[maxp] = static_cast<float>(maxp + 1);
        a[maxp] = _mm512_set1_ps(va[maxp]);
        b[maxp] = _mm512_set1_ps(0.0f);
    } while (++maxp < MAX_PARALLEL);

    int subLooper = subLoop;
    do {
        maxp = 0;
        do {
            // MAX_PARALLEL independent adds per pass; no chain depends on another
            b[maxp] = _mm512_add_ps(b[maxp], a[maxp]);
        } while (++maxp < MAX_PARALLEL);
    } while (--subLooper);

    maxp = 0;
    do {
        // Write the low lane back through the volatile array so the sums aren't dead code
        va[maxp] = _mm512_cvtss_f32(b[maxp]);
    } while (++maxp < MAX_PARALLEL);

    return subLoop * MAX_PARALLEL; // number of 512b vector adds executed
}

int main()
{
    int numOps = rudiparallel<3>(); // numOps = 256 * 3
    return rudiparallel();          // returns 256 * 6 ops
}
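If you want an actual number out of it, a minimal timing wrapper could look like this. This is my sketch, not part of the post: it replaces the bare main() above, and the GFLOP/s math assumes 16 float lanes per 512b add. Build with something like clang++ -O2 -mavx512f.

#include <chrono>
#include <cstdio>

int main()
{
    constexpr int iters = 1000000;
    long long vecOps = 0;

    auto t0 = std::chrono::steady_clock::now();
    for (int i = 0; i < iters; ++i)
        vecOps += rudiparallel();           // 256 * 6 vector adds per call
    auto t1 = std::chrono::steady_clock::now();

    double secs = std::chrono::duration<double>(t1 - t0).count();
    double flops = static_cast<double>(vecOps) * 16.0; // 16 floats per zmm
    std::printf("%.2f GFLOP/s\n", flops / secs / 1e9);
    return 0;
}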
clang:
.LBB0_3:
vaddps zmm11, zmm1, zmm11
vaddps zmm10, zmm2, zmm10
vaddps zmm9, zmm3, zmm9
vaddps zmm7, zmm5, zmm7
vaddps zmm4, zmm6, zmm4
vaddps zmm0, zmm8, zmm0
vaddps zmm11, zmm1, zmm11
vaddps zmm10, zmm2, zmm10
vaddps zmm9, zmm3, zmm9
vaddps zmm7, zmm5, zmm7
vaddps zmm4, zmm6, zmm4
vaddps zmm0, zmm8, zmm0
add eax, 2
jne .LBB0_3
MSVC:
$LL7@rudiparall:
vaddps zmm5, zmm5, zmm11
vaddps zmm4, zmm4, zmm10
vaddps zmm3, zmm3, zmm9
vaddps zmm2, zmm2, zmm8
vaddps zmm1, zmm1, zmm7
vaddps zmm0, zmm0, zmm6
sub r8d, 1
jne SHORT $LL7@rudiparall
GCC:
.L2:
vaddps zmm0, zmm0, zmm11
vaddps zmm5, zmm5, zmm10
vaddps zmm4, zmm4, zmm9
vaddps zmm3, zmm3, zmm8
vaddps zmm2, zmm2, zmm7
vaddps zmm1, zmm1, zmm6
sub eax, 1
jne .L2
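Worth noting: all three compilers keep the six vaddps chains independent, exactly as written. clang additionally unrolls the loop body 2x (hence the add eax, 2), which doesn't add any parallelism but halves the loop-counter overhead.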