SSE4.1 langsamer als SSE3 bei 4x4-Matrixmultiplikation? (C++)

Programme in C++. Entwicklerforum
Anonymous
 SSE4.1 Langsamer als SSE3 auf Matrix 4x4 -Multiplikation?

Post by Anonymous »

Ich habe diese SSE3-Implementierung für die Matrixmultiplikation:
/**
 * 4x4 matrix multiplication with SSE3: result = affector * affected.
 *
 * All three matrices are 16 consecutive floats in row-major order.
 * The rows of `affected` are transposed in registers so that every
 * output element can be formed as one dot product of an `affector`
 * row with an `affected` column.
 *
 * Loop is unrolled for performance.
 *
 * @param affector left operand, 16 floats (must not alias result)
 * @param affected right operand, 16 floats (must not alias result)
 * @param result   receives the 16-float product
 *
 * NOTE(review): the pasted snippet had lost the _mm_loadu_ps loads of
 * a0..a3 / b0..b3; they are reconstructed here from the surviving
 * transpose comments.  `static` is added because a plain C `inline`
 * definition emits no callable symbol on its own, and the GCC/Clang
 * `target` attribute lets the SSE3 intrinsics compile without a
 * global -msse3 flag.
 */

/* Sum of the four lane-products of a and b (SSE3 horizontal adds). */
__attribute__((target("sse3"))) static inline float dot4_hadd(__m128 a, __m128 b)
{
    /* m = [a0*b0, a1*b1, a2*b2, a3*b3] */
    __m128 m = _mm_mul_ps(a, b);
    /* first hadd:  [m0+m1, m2+m3, m0+m1, m2+m3] */
    m = _mm_hadd_ps(m, m);
    /* second hadd: every lane now holds the full sum */
    m = _mm_hadd_ps(m, m);
    return _mm_cvtss_f32(m);
}

__attribute__((target("sse3")))
static inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
    /* Rows of the left-hand matrix. */
    __m128 a0 = _mm_loadu_ps(&affector[0]);
    __m128 a1 = _mm_loadu_ps(&affector[4]);
    __m128 a2 = _mm_loadu_ps(&affector[8]);
    __m128 a3 = _mm_loadu_ps(&affector[12]);

    /* Rows of the right-hand matrix.  Numbering the elements 1..16:
     * b0 = [1, 2, 3, 4] ... b3 = [13, 14, 15, 16] before the transpose,
     * b0 = [1, 5, 9, 13] ... b3 = [4, 8, 12, 16] (the columns) after it. */
    __m128 b0 = _mm_loadu_ps(&affected[0]);
    __m128 b1 = _mm_loadu_ps(&affected[4]);
    __m128 b2 = _mm_loadu_ps(&affected[8]);
    __m128 b3 = _mm_loadu_ps(&affected[12]);

    /* In-register 4x4 transpose (same shuffle sequence as the
     * _MM_TRANSPOSE4_PS macro). */
    __m128 tmp0 = _mm_unpacklo_ps(b0, b1); /* [1, 5, 2, 6]     */
    __m128 tmp1 = _mm_unpackhi_ps(b0, b1); /* [3, 7, 4, 8]     */
    __m128 tmp2 = _mm_unpacklo_ps(b2, b3); /* [9, 13, 10, 14]  */
    __m128 tmp3 = _mm_unpackhi_ps(b2, b3); /* [11, 15, 12, 16] */

    b0 = _mm_movelh_ps(tmp0, tmp2); /* [1, 5, 9, 13]  */
    b1 = _mm_movehl_ps(tmp2, tmp0); /* [2, 6, 10, 14] */
    b2 = _mm_movelh_ps(tmp1, tmp3); /* [3, 7, 11, 15] */
    b3 = _mm_movehl_ps(tmp3, tmp1); /* [4, 8, 12, 16] */

    /* result[4*i + j] = dot(row i of affector, column j of affected). */
    result[0]  = dot4_hadd(a0, b0);
    result[1]  = dot4_hadd(a0, b1);
    result[2]  = dot4_hadd(a0, b2);
    result[3]  = dot4_hadd(a0, b3);

    result[4]  = dot4_hadd(a1, b0);
    result[5]  = dot4_hadd(a1, b1);
    result[6]  = dot4_hadd(a1, b2);
    result[7]  = dot4_hadd(a1, b3);

    result[8]  = dot4_hadd(a2, b0);
    result[9]  = dot4_hadd(a2, b1);
    result[10] = dot4_hadd(a2, b2);
    result[11] = dot4_hadd(a2, b3);

    result[12] = dot4_hadd(a3, b0);
    result[13] = dot4_hadd(a3, b1);
    result[14] = dot4_hadd(a3, b2);
    result[15] = dot4_hadd(a3, b3);
}
< /code>
Das Ausführen dieser Funktion 1.000.000-mal ergibt eine Laufzeit von ~0,04 Sekunden.
Ich habe darüber nachgedacht, stattdessen eine Dot-Product-Instruktion zu verwenden, was die Sache beschleunigen sollte, da ich dann nicht mehr Folgendes tun muss:
1. Multiply
2. Do horizontal addition
3. Do another horizontal addition
< /code>
Aber stattdessen nur: < /p>
1. Single Dot product
< /code>
Hier ist die SSE4.1-Implementierung:
/**
 * 4x4 matrix multiplication with SSE4.1: result = affector * affected.
 *
 * Identical contract to the SSE3 version: all three matrices are 16
 * consecutive floats in row-major order, the rows of `affected` are
 * transposed in registers, and each output element is one dot product
 * of an `affector` row with an `affected` column — here computed with
 * the SSE4.1 _mm_dp_ps instruction instead of two horizontal adds.
 *
 * Loop is unrolled for performance.
 *
 * @param affector left operand, 16 floats (must not alias result)
 * @param affected right operand, 16 floats (must not alias result)
 * @param result   receives the 16-float product
 *
 * NOTE(review): the pasted snippet had lost the _mm_loadu_ps loads of
 * a0..a3 / b0..b3; they are reconstructed here.  `static` is added
 * because a plain C `inline` definition emits no callable symbol on
 * its own, and the GCC/Clang `target` attribute lets the SSE4.1
 * intrinsic compile without a global -msse4.1 flag.
 */
__attribute__((target("sse4.1")))
static inline void multiply(const float *__restrict affector, const float *__restrict affected, float *__restrict result)
{
    /* Rows of the left-hand matrix. */
    __m128 a0 = _mm_loadu_ps(&affector[0]);
    __m128 a1 = _mm_loadu_ps(&affector[4]);
    __m128 a2 = _mm_loadu_ps(&affector[8]);
    __m128 a3 = _mm_loadu_ps(&affector[12]);

    /* Rows of the right-hand matrix; transposed below into columns. */
    __m128 b0 = _mm_loadu_ps(&affected[0]);
    __m128 b1 = _mm_loadu_ps(&affected[4]);
    __m128 b2 = _mm_loadu_ps(&affected[8]);
    __m128 b3 = _mm_loadu_ps(&affected[12]);

    /* In-register 4x4 transpose (same shuffle sequence as the
     * _MM_TRANSPOSE4_PS macro).  Numbering the elements 1..16: */
    __m128 tmp0 = _mm_unpacklo_ps(b0, b1); /* [1, 5, 2, 6]     */
    __m128 tmp1 = _mm_unpackhi_ps(b0, b1); /* [3, 7, 4, 8]     */
    __m128 tmp2 = _mm_unpacklo_ps(b2, b3); /* [9, 13, 10, 14]  */
    __m128 tmp3 = _mm_unpackhi_ps(b2, b3); /* [11, 15, 12, 16] */

    b0 = _mm_movelh_ps(tmp0, tmp2); /* [1, 5, 9, 13]  */
    b1 = _mm_movehl_ps(tmp2, tmp0); /* [2, 6, 10, 14] */
    b2 = _mm_movelh_ps(tmp1, tmp3); /* [3, 7, 11, 15] */
    b3 = _mm_movehl_ps(tmp3, tmp1); /* [4, 8, 12, 16] */

    /* result[4*i + j] = dot(row i of affector, column j of affected).
     * _mm_dp_ps mask 0xF1: high nibble F multiplies all four lanes,
     * low nibble 1 writes the sum to lane 0 only, which
     * _mm_cvtss_f32 extracts. */
    result[0]  = _mm_cvtss_f32(_mm_dp_ps(a0, b0, 0xF1));
    result[1]  = _mm_cvtss_f32(_mm_dp_ps(a0, b1, 0xF1));
    result[2]  = _mm_cvtss_f32(_mm_dp_ps(a0, b2, 0xF1));
    result[3]  = _mm_cvtss_f32(_mm_dp_ps(a0, b3, 0xF1));

    result[4]  = _mm_cvtss_f32(_mm_dp_ps(a1, b0, 0xF1));
    result[5]  = _mm_cvtss_f32(_mm_dp_ps(a1, b1, 0xF1));
    result[6]  = _mm_cvtss_f32(_mm_dp_ps(a1, b2, 0xF1));
    result[7]  = _mm_cvtss_f32(_mm_dp_ps(a1, b3, 0xF1));

    result[8]  = _mm_cvtss_f32(_mm_dp_ps(a2, b0, 0xF1));
    result[9]  = _mm_cvtss_f32(_mm_dp_ps(a2, b1, 0xF1));
    result[10] = _mm_cvtss_f32(_mm_dp_ps(a2, b2, 0xF1));
    result[11] = _mm_cvtss_f32(_mm_dp_ps(a2, b3, 0xF1));

    result[12] = _mm_cvtss_f32(_mm_dp_ps(a3, b0, 0xF1));
    result[13] = _mm_cvtss_f32(_mm_dp_ps(a3, b1, 0xF1));
    result[14] = _mm_cvtss_f32(_mm_dp_ps(a3, b2, 0xF1));
    result[15] = _mm_cvtss_f32(_mm_dp_ps(a3, b3, 0xF1));
}
< /code>
Das Ergebnis war: ~0,15 Sekunden! Das ist sogar langsamer als meine Implementierung ohne Intrinsics (~0,11–0,12 Sekunden) und als die SSE2-Variante (~0,09–0,10 Sekunden). Was ist hier los? Liegt es daran, wie das Dot-Produkt auf der unteren Ebene implementiert ist, oder mache ich etwas falsch?

Quick Reply

Change Text Case: 
   
  • Similar Topics
    Replies
    Views
    Last post