From 21839274612ff8a7c6434ead104a142b875fcf65 Mon Sep 17 00:00:00 2001 From: DTL2020 <68707763+DTL2020@users.noreply.github.com> Date: Sat, 16 Oct 2021 19:24:03 +0300 Subject: [PATCH 1/3] Added different predictors proc and new AVX2 SAD Added different predictors proc and new AVX2 SAD for ExhaustiveSearch --- Sources/MVAnalyse.cpp | 2 + Sources/PlaneOfBlocks.cpp | 686 +++++++++++++++++++++++++++++++++++++- Sources/PlaneOfBlocks.h | 30 +- 3 files changed, 705 insertions(+), 13 deletions(-) diff --git a/Sources/MVAnalyse.cpp b/Sources/MVAnalyse.cpp index 7d353f7..7ec5627 100644 --- a/Sources/MVAnalyse.cpp +++ b/Sources/MVAnalyse.cpp @@ -258,6 +258,8 @@ MVAnalyse::MVAnalyse( ++nLevelsMax; } +// nLevelsMax = 2; + analysisData.nLvCount = (lv > 0) ? lv : nLevelsMax + lv; if (analysisData.nLvCount > nSuperLevels) { diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp index 33802b6..dc170d9 100644 --- a/Sources/PlaneOfBlocks.cpp +++ b/Sources/PlaneOfBlocks.cpp @@ -103,10 +103,10 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi freqArray[0].resize(8192 * _nPel * 2); freqArray[1].resize(8192 * _nPel * 2); // for nFlags, we use CPU_xxxx constants instead of Avisynth's CPUF_xxx values, because there are extra bits here - bool sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it - bool sse41 = (bool)(nFlags & CPU_SSE4); - bool avx = (bool)(nFlags & CPU_AVX); - bool avx2 = (bool)(nFlags & CPU_AVX2); + sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it + sse41 = (bool)(nFlags & CPU_SSE4); + avx = (bool)(nFlags & CPU_AVX); + avx2 = (bool)(nFlags & CPU_AVX2); // bool ssd = (bool)(nFlags & MOTION_USE_SSD); // bool satd = (bool)(nFlags & MOTION_USE_SATD); @@ -685,7 +685,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) { workarea.predictors[1] = ClipMV(workarea, zeroMVfieldShifted); // v1.11.1 - values instead of pointer } - // fixme note: // MAnalyze mt-inconsistency reason #1 // this is _not_ internal mt friendly, since here up or bottom predictors @@ -703,7 +702,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) { workarea.predictors[2] = ClipMV(workarea, zeroMVfieldShifted); } - // Original problem: random, small, rare, mostly irreproducible differences between multiple encodings. // In all, I spent at least a week on the problem during a half year, losing hope // and restarting again four times. Nasty bug it was. 
@@ -747,7 +745,7 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) workarea.predictors[0].sad = workarea.predictors[1].sad; } - // if there are no other planes, predictor is the median + // if there are no other planes, predictor is the median if (smallestPlane) { workarea.predictor = workarea.predictors[0]; @@ -811,12 +809,23 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea) } break; case EXHAUSTIVE: { + // ExhaustiveSearch(nSearchParam); int mvx = workarea.bestMV.x; int mvy = workarea.bestMV.y; + if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) + { + ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); + break; + } + if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) + { + ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); + break; + } for (int i = 1; i <= nSearchParam; i++)// region is same as exhaustive, but ordered by radius (from near to far) { - ExpandingSearch(workarea, i, 1, mvx, mvy); + ExpandingSearch(workarea, i, 1, mvx, mvy); } } break; @@ -1097,6 +1106,113 @@ void PlaneOfBlocks::PseudoEPZSearch(WorkingArea& workarea) } +template +void PlaneOfBlocks::PseudoEPZSearch_no_pred(WorkingArea& workarea) +{ + typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t; + // FetchPredictors(workarea); + + sad_t sad; + sad_t saduv; + + + // We treat zero alone + // Do we bias zero with not taking into account distorsion ? + workarea.bestMV.x = zeroMVfieldShifted.x; + workarea.bestMV.y = zeroMVfieldShifted.y; + saduv = (chroma) ? + ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y)); + sad += saduv; + workarea.bestMV.sad = sad; + workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2 + + // then, we refine, according to the search type + Refine(workarea); + + // we store the result + vectors[workarea.blkIdx].x = workarea.bestMV.x; + vectors[workarea.blkIdx].y = workarea.bestMV.y; + vectors[workarea.blkIdx].sad = workarea.bestMV.sad; + + workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used +} + +template +void PlaneOfBlocks::PseudoEPZSearch_glob_med_pred(WorkingArea& workarea) +{ + typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t; + FetchPredictors(workarea); + + sad_t sad; + sad_t saduv; + + + // We treat zero alone + // Do we bias zero with not taking into account distorsion ? + workarea.bestMV.x = zeroMVfieldShifted.x; + workarea.bestMV.y = zeroMVfieldShifted.y; + saduv = (chroma) ? + ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y)); + sad += saduv; + workarea.bestMV.sad = sad; + workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2 + + + // Global MV predictor - added by Fizick + workarea.globalMVPredictor = ClipMV(workarea, workarea.globalMVPredictor); + + // if ( workarea.IsVectorOK(workarea.globalMVPredictor.x, workarea.globalMVPredictor.y ) ) + { + saduv = (chroma) ? 
+ ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y)); + sad += saduv; + sad_t cost = sad + ((pglobal * (safe_sad_t)sad) >> 8); + + if (cost < workarea.nMinCost || tryMany) + { + workarea.bestMV.x = workarea.globalMVPredictor.x; + workarea.bestMV.y = workarea.globalMVPredictor.y; + workarea.bestMV.sad = sad; + workarea.nMinCost = cost; + } + + // } + // Then, the predictor : + // if ( (( workarea.predictor.x != zeroMVfieldShifted.x ) || ( workarea.predictor.y != zeroMVfieldShifted.y )) + // && (( workarea.predictor.x != workarea.globalMVPredictor.x ) || ( workarea.predictor.y != workarea.globalMVPredictor.y ))) + // { + saduv = (chroma) ? ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.predictor.x, workarea.predictor.y)); + sad += saduv; + + cost = sad; + if (cost < workarea.nMinCost || tryMany) + { + workarea.bestMV.x = workarea.predictor.x; + workarea.bestMV.y = workarea.predictor.y; + workarea.bestMV.sad = sad; + workarea.nMinCost = cost; + } + + } + + // then, we refine, according to the search type + Refine(workarea); + + // we store the result + vectors[workarea.blkIdx].x = workarea.bestMV.x; + vectors[workarea.blkIdx].y = workarea.bestMV.y; + vectors[workarea.blkIdx].sad = workarea.bestMV.sad; + + workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used +} template void PlaneOfBlocks::DiamondSearch(WorkingArea &workarea, int length) @@ -1323,9 +1439,8 @@ void PlaneOfBlocks::OneTimeSearch(WorkingArea &workarea, int length) template void PlaneOfBlocks::ExpandingSearch(WorkingArea &workarea, int r, int s, int mvx, int mvy) // diameter = 2*r + 1, step=s { // part of true enhaustive search (thin expanding square) around mvx, mvy - int i, j; - // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1 - + int i, j; + // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1 // sides of square without corners for (i = -r + s; i < r; i += s) // without corners! - v2.1 { @@ -3055,7 +3170,10 @@ void PlaneOfBlocks::search_mv_slice(Slicer::TaskData &td) workarea.predictors[4] = ClipMV(workarea, zeroMV); } - PseudoEPZSearch(workarea); + // Possible point of placement selection of 'predictiors control' + PseudoEPZSearch(workarea); // all predictors +// PseudoEPZSearch_glob_med_pred(workarea); // partial predictors +// PseudoEPZSearch_no_pred(workarea); // no predictiors // workarea.bestMV = zeroMV; // debug if (outfilebuf != NULL) // write vector to outfile @@ -3641,3 +3759,547 @@ PlaneOfBlocks::WorkingArea *PlaneOfBlocks::WorkingAreaFactory::do_create() } +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check ! 
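+  // What this computes (scalar reference sketch, assuming 8-bit luma samples and the Ref buffer
+  // padding noted below): SADs of the 8x8 Src block against every candidate offset in a
+  // -4..+3 horizontal by -4..+4 vertical window around (mvx, mvy), accumulated with
+  // _mm256_sad_epu8 over row pairs and stored as 16-bit values in ArrSADs, followed by a
+  // scalar pass that picks the minimum and applies the penaltyNew cost. Roughly:
+  //   for (int dy = -4; dy <= 4; dy++)
+  //     for (int dx = -4; dx <= 3; dx++) {
+  //       int sad = 0;
+  //       for (int y = 0; y < 8; y++)
+  //         for (int x = 0; x < 8; x++)
+  //           sad += std::abs((int)pucCurr[y * nSrcPitch[0] + x] - (int)pucRef[(dy + 4 + y) * nRefPitch[0] + (dx + 4) + x]);
+  //       ArrSADs[dx + 4][dy + 4] = (unsigned short)sad;
+  //     }
+  // The _sp2 variant below follows the same scheme over a smaller 5x5 window.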
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + int dbr01 = 0; + return; + } + + // array of sads 8x8 + // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search + unsigned short ArrSADs[8][9]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x16bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 9; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-4,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), 
ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -4,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -3,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = 
_mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[0,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[5][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + 
ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[6][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 3,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[7][i], _mm256_castsi256_si128(ymm4_tmp)); + } + + _mm256_zeroupper(); + + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -4; x < 4; x++) + { + for (int y = -4; y < 5; y++) + { + if (ArrSADs[x+4][y+4] < minsad) + { + minsad = ArrSADs[x+4][y+4]; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew*minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; + +} + +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check !! 
need to fix caller to now allow illegal vectors + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + + // array of sads 5x5 + unsigned short ArrSADs[5][5]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + + +// __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work +// __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work +// my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 5; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + 
ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-0,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 
51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp)); + } + + _mm256_zeroupper(); + + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -2; x < 2; x++) + { + for (int y = -2; y < 2; y++) + { + if (ArrSADs[x+2][y+2] < minsad) + { + minsad = ArrSADs[x+2][y+2]; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew*minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; +} diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h index 718e16f..5714cd9 100644 --- a/Sources/PlaneOfBlocks.h +++ b/Sources/PlaneOfBlocks.h @@ -15,6 +15,14 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit // http://www.gnu.org/copyleft/gpl.html . +#if defined (__GNUC__) && ! 
defined (__INTEL_COMPILER) +#include +// x86intrin.h includes header files for whatever instruction +// sets are specified on the compiler command line, such as: xopintrin.h, fma4intrin.h +#else +#include // MS version of immintrin.h covers AVX, AVX2 and FMA3 +#endif // __GNUC__ + #ifndef __POBLOCKS__ #define __POBLOCKS__ @@ -137,6 +145,11 @@ class PlaneOfBlocks bool isse; /* can we use isse asm code */ bool chroma; /* do we do chroma me */ + bool sse2; // make members now to use in SADs + bool sse41; + bool avx; + bool avx2; + int dctpitch; conc::ObjPool * // Set to 0 if not used @@ -315,13 +328,28 @@ class PlaneOfBlocks /* performs an epz search */ template - void PseudoEPZSearch(WorkingArea &workarea); + void PseudoEPZSearch(WorkingArea &workarea); // full predictors, slowest, max quality + + /* performs an epz search */ + template + void PseudoEPZSearch_no_pred(WorkingArea& workarea); // planes = 1 recommended + + /* performs an epz search */ + template + void PseudoEPZSearch_glob_med_pred(WorkingArea& workarea); // planes >=2 recommended // void PhaseShiftSearch(int vx, int vy); /* performs an exhaustive search */ template void ExpandingSearch(WorkingArea &workarea, int radius, int step, int mvx, int mvy); // diameter = 2*radius + 1 + + template + void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based + + template + void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based + template void Hex2Search(WorkingArea &workarea, int i_me_range); From c9d7636d6676613261ff32262ea10375d093881d Mon Sep 17 00:00:00 2001 From: DTL2020 <68707763+DTL2020@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:29:53 +0300 Subject: [PATCH 2/3] Added new more AVX2 Esa search sp2 and sp4 Added new more AVX2 Esa search sp2 and sp4. One pass only without immediate array of SADs. --- Sources/PlaneOfBlocks.cpp | 657 +++++++++++++++++++++++++++++++++++++- Sources/PlaneOfBlocks.h | 6 + 2 files changed, 661 insertions(+), 2 deletions(-) diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp index dc170d9..c34135e 100644 --- a/Sources/PlaneOfBlocks.cpp +++ b/Sources/PlaneOfBlocks.cpp @@ -815,12 +815,14 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea) int mvy = workarea.bestMV.y; if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) { - ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); +// ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); // old version with Arr of SADs for second pass + ExhaustiveSearch8x8_sp2_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search break; } if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) { - ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); +// ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); + ExhaustiveSearch8x8_sp4_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search break; } for (int i = 1; i <= nSearchParam; i++)// region is same as exhaustive, but ordered by radius (from near to far) @@ -4074,6 +4076,387 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, } +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check ! 
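+  // Same -4..+3 by -4..+4 window as ExhaustiveSearch8x8_sp4_avx2, but in a single pass: instead
+  // of an intermediate ArrSADs array, the running minimum is kept packed in ymm14_yx_minsad
+  // (SAD in the low 32-bit element, the x/y candidate counters in the 16-bit words above it,
+  // copied from ymm15_yx_cnt) and updated after every candidate with a compare/blend.
+  // Scalar equivalent of that per-candidate update (sketch):
+  //   if (cur_sad < min_sad) { min_sad = cur_sad; min_x = cnt_x; min_y = cnt_y; }
+  //   cnt_x++; // and cnt_x is cleared / cnt_y incremented after the last column of each row
+  // The best vector is finally read back from SIMD256Res as mvx + min_x - 4, mvy + min_y - 4.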
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) + { + int dbr01 = 0; + return; + } +/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + int dbr01 = 0; + return; + } + */ // need to check if it is non needed cheks + + // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search + alignas(256) unsigned short SIMD256Res[16]; + + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); + __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit + __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 9; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] 
* (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-4,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -4,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -3,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if 
needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp 
= _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[0,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-4 
ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, 
ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 3,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord + + // clear x cord to 0 + ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); + ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); + + } // rows counter + + // store result + _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); + + _mm256_zeroupper(); // check if it may be done _after_ all searches ??? +/* + SIMD256Res[0]; // minsad; + SIMD256Res[2] - 2; // x of minsad vector + SIMD256Res[3] - 2; // y of minsad vector + )*/ + + unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + SIMD256Res[2] - 4; + workarea.bestMV.y = mvy + SIMD256Res[3] - 4; + workarea.nMinCost = cost; + workarea.bestMV.sad = SIMD256Res[0]; + +} + template void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) { @@ -4303,3 +4686,273 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, workarea.nMinCost = cost; workarea.bestMV.sad = minsad; } + + +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check !! 
need to fix caller to now allow illegal vectors + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + int dbr01 = 0; + return; + } +/* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + */ // - check if it is OK to skip these cheks + + alignas(256) unsigned short SIMD256Res[16]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); + __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit + __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 5; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = 
_mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = 
_mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-0,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-2 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 
32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-2 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord + + // clear x cord to 0 + ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); + ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); + + } // rows counter + + // store result + _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); + + _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
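+ // _mm256_zeroupper() here avoids the AVX->SSE transition penalty in any non-VEX code that follows;
+ // moving it to after all searches (as the note above asks) may be a small optimisation.
+ // Result layout in ymm14_yx_minsad / SIMD256Res (16-bit words): words 0..1 = 32-bit minimum SAD
+ // (word 1 stays zero for 8x8 8-bit SAD), word 2 = x step counter, word 3 = y step counter,
+ // both counted from 0 starting at the upper left corner (mvx - 2, mvy - 2) of the 5x5 scan,
+ // hence the "- 2" offsets applied below when bestMV is updated.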
+/*
+ SIMD256Res[0]; // minsad;
+ SIMD256Res[2] - 2; // x of minsad vector
+ SIMD256Res[3] - 2; // y of minsad vector
+ */
+
+ unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8);
+ if (cost >= workarea.nMinCost) return;
+
+ workarea.bestMV.x = mvx + SIMD256Res[2] - 2;
+ workarea.bestMV.y = mvy + SIMD256Res[3] - 2;
+ workarea.nMinCost = cost;
+ workarea.bestMV.sad = SIMD256Res[0];
+}
diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h
index 5714cd9..5608084 100644
--- a/Sources/PlaneOfBlocks.h
+++ b/Sources/PlaneOfBlocks.h
@@ -347,9 +347,15 @@ class PlaneOfBlocks
 template
 void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsics based
+ template
+ void ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsics based
+
 template
 void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsics based
+ template
+ void ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsics based
+
 template
 void Hex2Search(WorkingArea &workarea, int i_me_range);
From c12a9047835745ac022bc06e00ecd4a853e8509d Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Wed, 20 Oct 2021 20:01:41 +0300
Subject: [PATCH 3/3] Added C-ref functions, fixed some bugs, +
 Added C-ref functions (using SAD() as the old code path), fixed a bug with the end bounds of the ArrSADs scan for sp2, added a prefetch attempt.
---
 Sources/MVAnalyse.cpp | 2 +-
 Sources/PlaneOfBlocks.cpp | 772 ++++++--------------------------------
 Sources/PlaneOfBlocks.h | 6 +-
 3 files changed, 111 insertions(+), 669 deletions(-)
diff --git a/Sources/MVAnalyse.cpp b/Sources/MVAnalyse.cpp
index 7ec5627..fe68925 100644
--- a/Sources/MVAnalyse.cpp
+++ b/Sources/MVAnalyse.cpp
@@ -258,7 +258,7 @@ MVAnalyse::MVAnalyse(
 ++nLevelsMax;
 }
-// nLevelsMax = 2;
+ nLevelsMax = 2;
 analysisData.nLvCount = (lv > 0) ?
lv : nLevelsMax + lv;
 if (analysisData.nLvCount > nSuperLevels)
diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp
index c34135e..4e19490 100644
--- a/Sources/PlaneOfBlocks.cpp
+++ b/Sources/PlaneOfBlocks.cpp
@@ -815,14 +815,14 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea)
 int mvy = workarea.bestMV.y;
 if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
 {
-// ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); // old version with Arr of SADs for second pass
- ExhaustiveSearch8x8_sp2_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search
+ ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy);
+ //ExhaustiveSearch8x8_sp2_C_ref(workarea, mvx, mvy);
 break;
 }
 if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
 {
-// ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy);
- ExhaustiveSearch8x8_sp4_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search
+ ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy);
+ //ExhaustiveSearch8x8_sp4_C_ref(workarea, mvx, mvy);
 break;
 }
 for (int i = 1; i <= nSearchParam; i++)// region is the same as exhaustive, but ordered by radius (from near to far)
@@ -3173,8 +3173,8 @@ void PlaneOfBlocks::search_mv_slice(Slicer::TaskData &td)
 }
 // Possible point of placement for the 'predictors control' selection
- PseudoEPZSearch(workarea); // all predictors
-// PseudoEPZSearch_glob_med_pred(workarea); // partial predictors
+// PseudoEPZSearch(workarea); // all predictors
+ PseudoEPZSearch_glob_med_pred(workarea); // partial predictors
 // PseudoEPZSearch_no_pred(workarea); // no predictors
 // workarea.bestMV = zeroMV; // debug
@@ -3760,6 +3760,54 @@ PlaneOfBlocks::WorkingArea *PlaneOfBlocks::WorkingAreaFactory::do_create()
 ));
 }
+template
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy)
+{
+ // debug check !
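+ // Plain C reference for the radius-4 exhaustive search, intended to cross-check the AVX2 path:
+ // it scans x in [-4..+3] and y in [-4..+4] (the same window the AVX2 version covers because of
+ // the 256-bit register width limit), computes the luma SAD() for every position, keeps the
+ // minimum, applies the penaltyNew cost and updates bestMV only if the cost beats workarea.nMinCost.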
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 4)) + { + return; + } + /* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + return; + } + */ + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -4; x < 4; x++) + { + for (int y = -4; y < 5; y++) + { + int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]); + if (sad < minsad) + { + minsad = sad; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew * minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; + +} template void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy) @@ -3768,25 +3816,21 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) { - int dbr01 = 0; return; } - if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) +/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) { - int dbr01 = 0; return; } - + */ // array of sads 8x8 // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search unsigned short ArrSADs[8][9]; @@ -3828,6 +3872,7 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + _mm_prefetch(const_cast(reinterpret_cast(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row // process sad[-4,i-4] ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); @@ -4077,386 +4122,55 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, } template -void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy) +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy) { - // debug check ! 
- // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) - if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) - { - int dbr01 = 0; - return; - } -/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) - { - int dbr01 = 0; - return; - } - */ // need to check if it is non needed cheks - - // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search - alignas(256) unsigned short SIMD256Res[16]; - - const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner - const uint8_t* pucCurr = workarea.pSrc[0]; + // debug check ! + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + return; + } + /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + return; + } + */ + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -2; x < 3; x++) + { + for (int y = -2; y < 3; y++) + { + int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]); + if (sad < minsad) + { + minsad = sad; + x_minsad = x; + y_minsad = y; + } + } + } - __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; - - __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm - __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; - __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); - __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); - __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; - __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit - __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 - - // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes - - xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); - xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); - ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); - - xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); - xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); - ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); - - xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); - xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); - ymm12_Src_45 = 
_mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); - - xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); - xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); - ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); - // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms - - for (int i = 0; i < 9; i++) - { - ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); - ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); - ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); - ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); - // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 - - // process sad[-4,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -4,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-3,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = 
_mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -3,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-2,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -2,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-1,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), 
ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -1,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[0,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 0,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // 
process sad[1,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 1,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[2,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 2,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - 
// rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[3,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 3,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord - - // clear x cord to 0 - ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); - ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); - - } // rows counter - - // store result - _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); - - _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
-/* - SIMD256Res[0]; // minsad; - SIMD256Res[2] - 2; // x of minsad vector - SIMD256Res[3] - 2; // y of minsad vector - )*/ - - unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); - if (cost >= workarea.nMinCost) return; + unsigned short cost = minsad + ((penaltyNew * minsad) >> 8); + if (cost >= workarea.nMinCost) return; - workarea.bestMV.x = mvx + SIMD256Res[2] - 4; - workarea.bestMV.y = mvy + SIMD256Res[3] - 4; - workarea.nMinCost = cost; - workarea.bestMV.sad = SIMD256Res[0]; + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; } + template void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) { @@ -4464,30 +4178,27 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) { - int dbr01 = 0; return; } - if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + // still not sure if it good for speed or not + /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) { - int dbr01 = 0; return; } - + */ // array of sads 5x5 unsigned short ArrSADs[5][5]; const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner const uint8_t* pucCurr = workarea.pSrc[0]; - + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm @@ -4524,6 +4235,7 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + _mm_prefetch(const_cast(reinterpret_cast(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row // process sad[-2,i-2] ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); @@ -4665,9 +4377,9 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, unsigned short minsad = 65535; int x_minsad = 0; int y_minsad = 0; - for (int x = -2; x < 2; x++) + for (int x = -2; x < 3; x++) { - for (int y = -2; y < 2; y++) + for (int y = -2; y < 3; y++) { if (ArrSADs[x+2][y+2] < minsad) { @@ -4686,273 +4398,3 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, workarea.nMinCost = cost; workarea.bestMV.sad = minsad; } - - -template -void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy) -{ - // debug check !! 
need to fix caller to now allow illegal vectors - // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) - if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) - { - int dbr01 = 0; - return; - } -/* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) - { - int dbr01 = 0; - return; - } - */ // - check if it is OK to skip these cheks - - alignas(256) unsigned short SIMD256Res[16]; - const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner - const uint8_t* pucCurr = workarea.pSrc[0]; - - __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; - - __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm - __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; - __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); - __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); - __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; - __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit - __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 - - // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes - - xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); - xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); - ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); - - xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); - xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); - ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); - - xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); - xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); - ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); - - xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); - xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); - ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); - // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms - - for (int i = 0; i < 5; i++) - { - ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); - ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); - ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); - ymm3_Ref_67 = 
_mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); - // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 - - // process sad[-2,i-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -2,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-1,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -1,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = 
_mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-0,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 0,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[1,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 1,i-2 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 
32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[2,i-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 2,i-2 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord - - // clear x cord to 0 - ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); - ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); - - } // rows counter - - // store result - _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); - - _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
-/* - SIMD256Res[0]; // minsad; - SIMD256Res[2] - 2; // x of minsad vector - SIMD256Res[3] - 2; // y of minsad vector - )*/ - - unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); - if (cost >= workarea.nMinCost) return; - - workarea.bestMV.x = mvx + SIMD256Res[2] - 2; - workarea.bestMV.y = mvy + SIMD256Res[3] - 2; - workarea.nMinCost = cost; - workarea.bestMV.sad = SIMD256Res[0]; -} diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h index 5608084..fd7d0d8 100644 --- a/Sources/PlaneOfBlocks.h +++ b/Sources/PlaneOfBlocks.h @@ -346,15 +346,15 @@ class PlaneOfBlocks template void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based - + template - void ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based + void ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 C-ref template void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based template - void ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based + void ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search radius 2 C-ref template