From 50d491e22d615925bb772cb91f89745acd4de286 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Fri, 10 Feb 2017 14:56:57 -0800 Subject: [PATCH] swr: [rasterizer core] Finish SIMD16 PA OPT including tesselation Reviewed-by: Bruce Cherniak --- src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 268 +++++++++++++++++++-- 1 file changed, 247 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index 22643bdcc34..6fb37e5d7e1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -361,18 +361,35 @@ void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m1 /// @todo Optimize this +#if USE_SIMD16_FRONTEND + if (pa.useAlternateOffset) + { + primIndex += KNOB_SIMD_WIDTH; + } + +#endif float* pOutVec = (float*)verts; for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) { uint32_t input_cp = primIndex * TotalControlPoints + cp; +#if USE_SIMD16_FRONTEND + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; + +#else uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; +#endif // Loop over all components of the attribute for (uint32_t i = 0; i < 4; ++i) { +#if USE_SIMD16_FRONTEND + const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); +#else const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); +#endif pOutVec[cp * 4 + i] = pInputVec[input_lane]; } } @@ -398,6 +415,15 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) /// @todo Optimize this +#if USE_SIMD16_FRONTEND + uint32_t lane_offset = 0; + + if (pa.useAlternateOffset) + { + lane_offset = KNOB_SIMD_WIDTH; + } + +#endif // Loop over all components of the attribute for (uint32_t i = 0; i < 4; ++i) { @@ -406,11 +432,19 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) float vec[KNOB_SIMD_WIDTH]; for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) { +#if USE_SIMD16_FRONTEND + uint32_t input_cp = (lane + lane_offset) * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; + + const float* pInputVec = (const float*)(&PaGetSimdVector_simd16(pa, input_vec, slot)[i]); +#else uint32_t input_cp = lane * TotalControlPoints + cp; uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); +#endif vec[lane] = pInputVec[input_lane]; } verts[cp][i] = _simd_loadu_ps(vec); @@ -428,6 +462,58 @@ static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) return true; } +#if ENABLE_AVX512_SIMD16 +template +static bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ + SetNextPaState_simd16( + pa, + PaPatchList_simd16, + PaPatchListSingle); + + return false; +} + +template +static bool PaPatchListTerm_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ + // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output + // KNOB_SIMD16_WIDTH * 1 patch. This function is called once per attribute. + // Each attribute has 4 components. + + /// @todo Optimize this + + // Loop over all components of the attribute + for (uint32_t i = 0; i < 4; ++i) + { + for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) + { + float vec[KNOB_SIMD16_WIDTH]; + for (uint32_t lane = 0; lane < KNOB_SIMD16_WIDTH; ++lane) + { + uint32_t input_cp = lane * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD16_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD16_WIDTH; + + const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); + vec[lane] = pInputVec[input_lane]; + } + verts[cp][i] = _simd16_loadu_ps(vec); + } + } + + SetNextPaState_simd16( + pa, + PaPatchList_simd16, + PaPatchListSingle, + 0, + KNOB_SIMD16_WIDTH, + true); + + return true; +} + +#endif #define PA_PATCH_LIST_TERMINATOR(N) \ template<> bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ { return PaPatchListTerm(pa, slot, verts); } @@ -465,6 +551,45 @@ PA_PATCH_LIST_TERMINATOR(31) PA_PATCH_LIST_TERMINATOR(32) #undef PA_PATCH_LIST_TERMINATOR +#if ENABLE_AVX512_SIMD16 +#define PA_PATCH_LIST_TERMINATOR_SIMD16(N) \ + template<> bool PaPatchList_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])\ + { return PaPatchListTerm_simd16(pa, slot, verts); } +PA_PATCH_LIST_TERMINATOR_SIMD16(1) +PA_PATCH_LIST_TERMINATOR_SIMD16(2) +PA_PATCH_LIST_TERMINATOR_SIMD16(3) +PA_PATCH_LIST_TERMINATOR_SIMD16(4) +PA_PATCH_LIST_TERMINATOR_SIMD16(5) +PA_PATCH_LIST_TERMINATOR_SIMD16(6) +PA_PATCH_LIST_TERMINATOR_SIMD16(7) +PA_PATCH_LIST_TERMINATOR_SIMD16(8) +PA_PATCH_LIST_TERMINATOR_SIMD16(9) +PA_PATCH_LIST_TERMINATOR_SIMD16(10) +PA_PATCH_LIST_TERMINATOR_SIMD16(11) +PA_PATCH_LIST_TERMINATOR_SIMD16(12) +PA_PATCH_LIST_TERMINATOR_SIMD16(13) +PA_PATCH_LIST_TERMINATOR_SIMD16(14) +PA_PATCH_LIST_TERMINATOR_SIMD16(15) +PA_PATCH_LIST_TERMINATOR_SIMD16(16) +PA_PATCH_LIST_TERMINATOR_SIMD16(17) +PA_PATCH_LIST_TERMINATOR_SIMD16(18) +PA_PATCH_LIST_TERMINATOR_SIMD16(19) +PA_PATCH_LIST_TERMINATOR_SIMD16(20) +PA_PATCH_LIST_TERMINATOR_SIMD16(21) +PA_PATCH_LIST_TERMINATOR_SIMD16(22) +PA_PATCH_LIST_TERMINATOR_SIMD16(23) +PA_PATCH_LIST_TERMINATOR_SIMD16(24) +PA_PATCH_LIST_TERMINATOR_SIMD16(25) +PA_PATCH_LIST_TERMINATOR_SIMD16(26) +PA_PATCH_LIST_TERMINATOR_SIMD16(27) +PA_PATCH_LIST_TERMINATOR_SIMD16(28) +PA_PATCH_LIST_TERMINATOR_SIMD16(29) +PA_PATCH_LIST_TERMINATOR_SIMD16(30) +PA_PATCH_LIST_TERMINATOR_SIMD16(31) +PA_PATCH_LIST_TERMINATOR_SIMD16(32) +#undef PA_PATCH_LIST_TERMINATOR_SIMD16 + +#endif bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) { SetNextPaState(pa, PaTriList1, PaTriListSingle0); @@ -2324,44 +2449,49 @@ bool PaRectList1_simd16( } } - __m256 tmp0, tmp1, tmp2; + simd16vector &v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + simd16vector &v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + simd16vector &v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } // Loop over each component in the simdvector. for (int i = 0; i < 4; i += 1) { - simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + simdscalar v0_lo; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + simdscalar v1_lo; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + simdscalar v2_lo; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + + __m256 tmp0, tmp1, tmp2; + tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } - v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. - tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } - v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } - v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + v0_lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + tmp1 = _mm256_permute_ps(v0_lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } + v0_lo = _mm256_permute_ps(v0_lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0_lo = _mm256_blend_ps(tmp1, v0_lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. /// AVX2 should make this much cheaper. - simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } - v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + v1_lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } - tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } + tmp2 = _mm256_blend_ps(v1_lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } - v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } - v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } - v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + v1_lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1_lo = _mm256_blend_ps(tmp2, v1_lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1_lo = _mm256_blend_ps(v1_lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } // verts[2] = { v2, w, v5, x, v8, y, v11, z } - simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } - v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + v2_lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } - v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0); + v2_lo = _mm256_blend_ps(tmp1, v2_lo, 0xF0); // Need to compute 4th implied vertex for the rectangle. - tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo); - tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * } + tmp2 = _mm256_sub_ps(v0_lo, v1_lo); + tmp2 = _mm256_add_ps(tmp2, v2_lo); // tmp2 = { w, *, x, *, y, *, z, * } tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } - v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + v2_lo = _mm256_blend_ps(v2_lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } - v0[i].hi = _simd_setzero_ps(); - v1[i].hi = _simd_setzero_ps(); - v2[i].hi = _simd_setzero_ps(); + v0[i] = _simd16_insert_ps(_simd16_setzero_ps(), v0_lo, 0); + v1[i] = _simd16_insert_ps(_simd16_setzero_ps(), v1_lo, 0); + v2[i] = _simd16_insert_ps(_simd16_setzero_ps(), v2_lo, 0); } SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); @@ -2542,99 +2672,195 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* case TOP_PATCHLIST_1: this->pfnPaFunc = PaPatchList<1>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<1>; +#endif break; case TOP_PATCHLIST_2: this->pfnPaFunc = PaPatchList<2>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<2>; +#endif break; case TOP_PATCHLIST_3: this->pfnPaFunc = PaPatchList<3>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<3>; +#endif break; case TOP_PATCHLIST_4: this->pfnPaFunc = PaPatchList<4>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<4>; +#endif break; case TOP_PATCHLIST_5: this->pfnPaFunc = PaPatchList<5>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<5>; +#endif break; case TOP_PATCHLIST_6: this->pfnPaFunc = PaPatchList<6>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<6>; +#endif break; case TOP_PATCHLIST_7: this->pfnPaFunc = PaPatchList<7>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<7>; +#endif break; case TOP_PATCHLIST_8: this->pfnPaFunc = PaPatchList<8>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<8>; +#endif break; case TOP_PATCHLIST_9: this->pfnPaFunc = PaPatchList<9>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<9>; +#endif break; case TOP_PATCHLIST_10: this->pfnPaFunc = PaPatchList<10>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<10>; +#endif break; case TOP_PATCHLIST_11: this->pfnPaFunc = PaPatchList<11>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<11>; +#endif break; case TOP_PATCHLIST_12: this->pfnPaFunc = PaPatchList<12>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<12>; +#endif break; case TOP_PATCHLIST_13: this->pfnPaFunc = PaPatchList<13>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<13>; +#endif break; case TOP_PATCHLIST_14: this->pfnPaFunc = PaPatchList<14>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<14>; +#endif break; case TOP_PATCHLIST_15: this->pfnPaFunc = PaPatchList<15>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<15>; +#endif break; case TOP_PATCHLIST_16: this->pfnPaFunc = PaPatchList<16>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<16>; +#endif break; case TOP_PATCHLIST_17: this->pfnPaFunc = PaPatchList<17>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<17>; +#endif break; case TOP_PATCHLIST_18: this->pfnPaFunc = PaPatchList<18>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<18>; +#endif break; case TOP_PATCHLIST_19: this->pfnPaFunc = PaPatchList<19>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<19>; +#endif break; case TOP_PATCHLIST_20: this->pfnPaFunc = PaPatchList<20>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<20>; +#endif break; case TOP_PATCHLIST_21: this->pfnPaFunc = PaPatchList<21>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<21>; +#endif break; case TOP_PATCHLIST_22: this->pfnPaFunc = PaPatchList<22>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<22>; +#endif break; case TOP_PATCHLIST_23: this->pfnPaFunc = PaPatchList<23>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<23>; +#endif break; case TOP_PATCHLIST_24: this->pfnPaFunc = PaPatchList<24>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<24>; +#endif break; case TOP_PATCHLIST_25: this->pfnPaFunc = PaPatchList<25>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<25>; +#endif break; case TOP_PATCHLIST_26: this->pfnPaFunc = PaPatchList<26>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<26>; +#endif break; case TOP_PATCHLIST_27: this->pfnPaFunc = PaPatchList<27>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<27>; +#endif break; case TOP_PATCHLIST_28: this->pfnPaFunc = PaPatchList<28>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<28>; +#endif break; case TOP_PATCHLIST_29: this->pfnPaFunc = PaPatchList<29>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<29>; +#endif break; case TOP_PATCHLIST_30: this->pfnPaFunc = PaPatchList<30>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<30>; +#endif break; case TOP_PATCHLIST_31: this->pfnPaFunc = PaPatchList<31>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<31>; +#endif break; case TOP_PATCHLIST_32: this->pfnPaFunc = PaPatchList<32>; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaPatchList_simd16<32>; +#endif break; default: -- 2.11.0