diff --git a/Sources/MVAnalyse.cpp b/Sources/MVAnalyse.cpp
index 7d353f7..fe68925 100644
--- a/Sources/MVAnalyse.cpp
+++ b/Sources/MVAnalyse.cpp
@@ -258,6 +258,8 @@ MVAnalyse::MVAnalyse(
     ++nLevelsMax;
   }
 
+  nLevelsMax = 2; // force the computed maximum pyramid level count to 2
+
   analysisData.nLvCount = (lv > 0) ? lv : nLevelsMax + lv;
   if (analysisData.nLvCount > nSuperLevels)
   {
diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp
index 33802b6..4e19490 100644
--- a/Sources/PlaneOfBlocks.cpp
+++ b/Sources/PlaneOfBlocks.cpp
@@ -103,10 +103,10 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi
   freqArray[0].resize(8192 * _nPel * 2);
   freqArray[1].resize(8192 * _nPel * 2);
   // for nFlags, we use CPU_xxxx constants instead of Avisynth's CPUF_xxx values, because there are extra bits here
-  bool sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it
-  bool sse41 = (bool)(nFlags & CPU_SSE4);
-  bool avx = (bool)(nFlags & CPU_AVX);
-  bool avx2 = (bool)(nFlags & CPU_AVX2);
+  sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it
+  sse41 = (bool)(nFlags & CPU_SSE4);
+  avx = (bool)(nFlags & CPU_AVX);
+  avx2 = (bool)(nFlags & CPU_AVX2);
 
   // bool ssd = (bool)(nFlags & MOTION_USE_SSD);
   // bool satd = (bool)(nFlags & MOTION_USE_SATD);
@@ -685,7 +685,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea)
   {
     workarea.predictors[1] = ClipMV(workarea, zeroMVfieldShifted); // v1.11.1 - values instead of pointer
   }
-
   // fixme note:
   // MAnalyze mt-inconsistency reason #1
   // this is _not_ internal mt friendly, since here up or bottom predictors
@@ -703,7 +702,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea)
   {
     workarea.predictors[2] = ClipMV(workarea, zeroMVfieldShifted);
   }
-
   // Original problem: random, small, rare, mostly irreproducible differences between multiple encodings.
   // In all, I spent at least a week on the problem during a half year, losing hope
   // and restarting again four times. Nasty bug it was.
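Context for the MVAnalyse.cpp hunk at the top of this diff: nLevelsMax counts how many pyramid levels fit the frame, and the new hard assignment nLevelsMax = 2 caps that value before nLvCount is derived from the user's lv parameter. A minimal standalone sketch of that derivation, for illustration only (ComputeLvCount is a hypothetical helper, not part of the patch):

// Sketch only: mirrors the ternary in the MVAnalyse.cpp hunk above.
static int ComputeLvCount(int lv, int nLevelsMax)
{
  // lv > 0 requests exactly that many levels;
  // lv <= 0 means "all levels that fit, minus |lv|", i.e. nLevelsMax + lv.
  return (lv > 0) ? lv : nLevelsMax + lv;
}

With the override in place, e.g. lv = 0 now always yields 2 levels regardless of how many the frame size would allow; the following check against nSuperLevels is unchanged.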
@@ -747,7 +745,7 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea)
     workarea.predictors[0].sad = workarea.predictors[1].sad;
   }
 
-  // if there are no other planes, predictor is the median
+  // if there are no other planes, predictor is the median
   if (smallestPlane)
   {
     workarea.predictor = workarea.predictors[0];
@@ -811,12 +809,25 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea)
   }
   break;
   case EXHAUSTIVE:
   {
+    // ExhaustiveSearch(nSearchParam);
     int mvx = workarea.bestMV.x;
     int mvy = workarea.bestMV.y;
+    if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
+    {
+      ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy);
+      //ExhaustiveSearch8x8_sp2_C_ref(workarea, mvx, mvy);
+      break;
+    }
+    if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
+    {
+      ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy);
+      //ExhaustiveSearch8x8_sp4_C_ref(workarea, mvx, mvy);
+      break;
+    }
     for (int i = 1; i <= nSearchParam; i++)// region is same as exhaustive, but ordered by radius (from near to far)
     {
-      ExpandingSearch(workarea, i, 1, mvx, mvy);
+      ExpandingSearch(workarea, i, 1, mvx, mvy);
     }
   }
   break;
@@ -1097,6 +1108,113 @@ void PlaneOfBlocks::PseudoEPZSearch(WorkingArea& workarea)
 }
 
+template<typename pixel_t>
+void PlaneOfBlocks::PseudoEPZSearch_no_pred(WorkingArea& workarea)
+{
+  typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t;
+  // FetchPredictors(workarea);
+
+  sad_t sad;
+  sad_t saduv;
+
+
+  // We treat zero alone
+  // Do we bias zero with not taking into account distortion ?
+  workarea.bestMV.x = zeroMVfieldShifted.x;
+  workarea.bestMV.y = zeroMVfieldShifted.y;
+  saduv = (chroma) ?
+    ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) +
+      SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0;
+  sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y));
+  sad += saduv;
+  workarea.bestMV.sad = sad;
+  workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2
+
+  // then, we refine, according to the search type
+  Refine(workarea);
+
+  // we store the result
+  vectors[workarea.blkIdx].x = workarea.bestMV.x;
+  vectors[workarea.blkIdx].y = workarea.bestMV.y;
+  vectors[workarea.blkIdx].sad = workarea.bestMV.sad;
+
+  workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used
+}
+
+template<typename pixel_t>
+void PlaneOfBlocks::PseudoEPZSearch_glob_med_pred(WorkingArea& workarea)
+{
+  typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t;
+  FetchPredictors(workarea);
+
+  sad_t sad;
+  sad_t saduv;
+
+
+  // We treat zero alone
+  // Do we bias zero with not taking into account distortion ?
+  workarea.bestMV.x = zeroMVfieldShifted.x;
+  workarea.bestMV.y = zeroMVfieldShifted.y;
+  saduv = (chroma) ?
+    ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) +
+      SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0;
+  sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y));
+  sad += saduv;
+  workarea.bestMV.sad = sad;
+  workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2
+
+
+  // Global MV predictor - added by Fizick
+  workarea.globalMVPredictor = ClipMV(workarea, workarea.globalMVPredictor);
+
+  // if ( workarea.IsVectorOK(workarea.globalMVPredictor.x, workarea.globalMVPredictor.y ) )
+  {
+    saduv = (chroma) ?
+      ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[1]) +
+        SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[2]), effective_chromaSADscale) : 0;
+    sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y));
+    sad += saduv;
+    sad_t cost = sad + ((pglobal * (safe_sad_t)sad) >> 8);
+
+    if (cost < workarea.nMinCost || tryMany)
+    {
+      workarea.bestMV.x = workarea.globalMVPredictor.x;
+      workarea.bestMV.y = workarea.globalMVPredictor.y;
+      workarea.bestMV.sad = sad;
+      workarea.nMinCost = cost;
+    }
+
+    // }
+    // Then, the predictor :
+    // if ( (( workarea.predictor.x != zeroMVfieldShifted.x ) || ( workarea.predictor.y != zeroMVfieldShifted.y ))
+    //   && (( workarea.predictor.x != workarea.globalMVPredictor.x ) || ( workarea.predictor.y != workarea.globalMVPredictor.y )))
+    // {
+    saduv = (chroma) ? ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[1]) +
+      SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[2]), effective_chromaSADscale) : 0;
+    sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.predictor.x, workarea.predictor.y));
+    sad += saduv;
+
+    cost = sad;
+    if (cost < workarea.nMinCost || tryMany)
+    {
+      workarea.bestMV.x = workarea.predictor.x;
+      workarea.bestMV.y = workarea.predictor.y;
+      workarea.bestMV.sad = sad;
+      workarea.nMinCost = cost;
+    }
+
+  }
+
+  // then, we refine, according to the search type
+  Refine(workarea);
+
+  // we store the result
+  vectors[workarea.blkIdx].x = workarea.bestMV.x;
+  vectors[workarea.blkIdx].y = workarea.bestMV.y;
+  vectors[workarea.blkIdx].sad = workarea.bestMV.sad;
+
+  workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used
+}
 
 template<typename pixel_t>
 void PlaneOfBlocks::DiamondSearch(WorkingArea &workarea, int length)
@@ -1323,9 +1441,8 @@ void PlaneOfBlocks::OneTimeSearch(WorkingArea &workarea, int length)
 template<typename pixel_t>
 void PlaneOfBlocks::ExpandingSearch(WorkingArea &workarea, int r, int s, int mvx, int mvy) // diameter = 2*r + 1, step=s
 { // part of true enhaustive search (thin expanding square) around mvx, mvy
-  int i, j;
-  // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1
-
+  int i, j;
+  // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1
   // sides of square without corners
   for (i = -r + s; i < r; i += s) // without corners! - v2.1
   {
@@ -3055,7 +3172,10 @@ void PlaneOfBlocks::search_mv_slice(Slicer::TaskData &td)
       workarea.predictors[4] = ClipMV(workarea, zeroMV);
     }
 
-    PseudoEPZSearch(workarea);
+    // Possible placement point for 'predictors control' selection
+//    PseudoEPZSearch(workarea); // all predictors
+    PseudoEPZSearch_glob_med_pred(workarea); // partial predictors
+//    PseudoEPZSearch_no_pred(workarea); // no predictors
     // workarea.bestMV = zeroMV; // debug
 
     if (outfilebuf != NULL) // write vector to outfile
@@ -3640,4 +3760,641 @@ PlaneOfBlocks::WorkingArea *PlaneOfBlocks::WorkingAreaFactory::do_create()
   ));
 }
 
+template<typename pixel_t>
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy)
+{
+  // debug check !
+  // idea - maybe not all 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run after end of buffer - need check/test)
+  if (!workarea.IsVectorOK(mvx - 4, mvy - 4))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 3, mvy + 4))
+  {
+    return;
+  }
+  /* if (!workarea.IsVectorOK(mvx - 4, mvy + 3))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 3, mvy - 4))
+  {
+    return;
+  }
+  */
+  unsigned short minsad = 65535;
+  int x_minsad = 0;
+  int y_minsad = 0;
+  for (int x = -4; x < 4; x++)
+  {
+    for (int y = -4; y < 5; y++)
+    {
+      int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]);
+      if (sad < minsad)
+      {
+        minsad = sad;
+        x_minsad = x;
+        y_minsad = y;
+      }
+    }
+  }
+
+  unsigned short cost = minsad + ((penaltyNew * minsad) >> 8);
+  if (cost >= workarea.nMinCost) return;
+
+  workarea.bestMV.x = mvx + x_minsad;
+  workarea.bestMV.y = mvy + y_minsad;
+  workarea.nMinCost = cost;
+  workarea.bestMV.sad = minsad;
+
+}
+
+template<typename pixel_t>
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy)
+{
+  // debug check !
+  // idea - maybe not all 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run after end of buffer - need check/test)
+  if (!workarea.IsVectorOK(mvx - 4, mvy - 4))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 3, mvy + 3))
+  {
+    return;
+  }
+/*  if (!workarea.IsVectorOK(mvx - 4, mvy + 3))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 3, mvy - 4))
+  {
+    return;
+  }
+  */
+  // array of sads 8x8
+  // due to the 256bit register limit it is actually a -4..+3 H search around zero, but a full -4..+4 V search
+  unsigned short ArrSADs[8][9];
+  const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner
+  const uint8_t* pucCurr = workarea.pSrc[0];
+
+  __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67;
+
+  __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x16bytes store, require buf padding to allow 16bytes reads to xmm
+  __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67;
+
+  __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp;
+
+
+  // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work
+  // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work
+
+  xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr);
+  xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0])));
+  ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220);
+
+  xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2));
+  xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3)));
+  ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220);
+
+  xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4));
+  xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5)));
+  ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220);
+
+  xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6));
+  xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7)));
+  ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220);
+  // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms
+
+  for (int i = 0; i < 9; i++)
+  {
+    ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0)));
+    ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2)));
+    ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4)));
+    ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6)));
+    // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3
+    _mm_prefetch(const_cast<const char*>(reinterpret_cast<const char*>(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row
+
+    // process sad[-4,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -4,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[-3,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -3,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[-2,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -2,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[-1,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -1,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[0,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 0,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[1,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 1,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[5][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[2,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 2,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[6][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[3,i-4]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 3,i-4 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[7][i], _mm256_castsi256_si128(ymm4_tmp));
+  }
+
+  _mm256_zeroupper();
+
+  unsigned short minsad = 65535;
+  int x_minsad = 0;
+  int y_minsad = 0;
+  for (int x = -4; x < 4; x++)
+  {
+    for (int y = -4; y < 5; y++)
+    {
+      if (ArrSADs[x+4][y+4] < minsad)
+      {
+        minsad = ArrSADs[x+4][y+4];
+        x_minsad = x;
+        y_minsad = y;
+      }
+    }
+  }
+
+  unsigned short cost = minsad + ((penaltyNew*minsad) >> 8);
+  if (cost >= workarea.nMinCost) return;
+
+  workarea.bestMV.x = mvx + x_minsad;
+  workarea.bestMV.y = mvy + y_minsad;
+  workarea.nMinCost = cost;
+  workarea.bestMV.sad = minsad;
+
+}
+
+template<typename pixel_t>
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy)
+{
+  // debug check !
+  // idea - maybe not all 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run after end of buffer - need check/test)
+  if (!workarea.IsVectorOK(mvx - 2, mvy - 2))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 2, mvy + 2))
+  {
+    return;
+  }
+  /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 2, mvy - 2))
+  {
+    return;
+  }
+  */
+  unsigned short minsad = 65535;
+  int x_minsad = 0;
+  int y_minsad = 0;
+  for (int x = -2; x < 3; x++)
+  {
+    for (int y = -2; y < 3; y++)
+    {
+      int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]);
+      if (sad < minsad)
+      {
+        minsad = sad;
+        x_minsad = x;
+        y_minsad = y;
+      }
+    }
+  }
+
+  unsigned short cost = minsad + ((penaltyNew * minsad) >> 8);
+  if (cost >= workarea.nMinCost) return;
+
+  workarea.bestMV.x = mvx + x_minsad;
+  workarea.bestMV.y = mvy + y_minsad;
+  workarea.nMinCost = cost;
+  workarea.bestMV.sad = minsad;
+
+}
+
+
+template<typename pixel_t>
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy)
+{
+  // debug check !! need to fix caller to not allow illegal vectors
+  // idea - maybe not all 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run after end of buffer - need check/test)
+  if (!workarea.IsVectorOK(mvx - 2, mvy - 2))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 2, mvy + 2))
+  {
+    return;
+  }
+  // still not sure if it is good for speed or not
+  /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2))
+  {
+    return;
+  }
+  if (!workarea.IsVectorOK(mvx + 2, mvy - 2))
+  {
+    return;
+  }
+  */
+  // array of sads 5x5
+  unsigned short ArrSADs[5][5];
+  const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner
+  const uint8_t* pucCurr = workarea.pSrc[0];
+
+  __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67;
+
+  __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm
+  __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67;
+
+  __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp;
+
+
+// __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work
+// __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work
+// my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes
+
+  xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr);
+  xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0])));
+  ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220);
+
+  xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2));
+  xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3)));
+  ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220);
+
+  xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4));
+  xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5)));
+  ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220);
+
+  xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6));
+  xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7)));
+  ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220);
+  // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms
+
+  for (int i = 0; i < 5; i++)
+  {
+    ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0)));
+    ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2)));
+    ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4)));
+    ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6)));
+    // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3
+    _mm_prefetch(const_cast<const char*>(reinterpret_cast<const char*>(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row
+
+    // process sad[-2,i-2]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -2,i-2 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[-1,i-2]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad -1,i-2 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[0,i-2]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 0,i-2 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[1,i-2]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 1,i-2 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp));
+
+    // rotate Ref to 1 samples
+    ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1);
+    ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1);
+    ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1);
+    ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1);
+
+    // process sad[2,i-2]
+    ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51);
+    ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51);
+    ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51);
+    ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51);
+
+    ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01);
+    ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23);
+    ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45);
+    ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp);
+
+    ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65);
+
+    ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp);
+    // sad 2,i-2 ready in low of mm4
+    _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp));
+  }
+
+  _mm256_zeroupper();
+
+  unsigned short minsad = 65535;
+  int x_minsad = 0;
+  int y_minsad = 0;
+  for (int x = -2; x < 3; x++)
+  {
+    for (int y = -2; y < 3; y++)
+    {
+      if (ArrSADs[x+2][y+2] < minsad)
+      {
+        minsad = ArrSADs[x+2][y+2];
+        x_minsad = x;
+        y_minsad = y;
+      }
+    }
+  }
+
+  unsigned short cost = minsad + ((penaltyNew*minsad) >> 8);
+  if (cost >= workarea.nMinCost) return;
+
+  workarea.bestMV.x = mvx + x_minsad;
+  workarea.bestMV.y = mvy + y_minsad;
+  workarea.nMinCost = cost;
+  workarea.bestMV.sad = minsad;
+}
diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h
index 718e16f..fd7d0d8 100644
--- a/Sources/PlaneOfBlocks.h
+++ b/Sources/PlaneOfBlocks.h
@@ -15,6 +15,14 @@
 // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit
 // http://www.gnu.org/copyleft/gpl.html .
 
+#if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
+#include <x86intrin.h>
+// x86intrin.h includes header files for whatever instruction
+// sets are specified on the compiler command line, such as: xopintrin.h, fma4intrin.h
+#else
+#include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3
+#endif // __GNUC__
+
 #ifndef __POBLOCKS__
 #define __POBLOCKS__
 
@@ -137,6 +145,11 @@ class PlaneOfBlocks
   bool isse;   /* can we use isse asm code */
   bool chroma; /* do we do chroma me */
 
+  bool sse2;   // make members now to use in SADs
+  bool sse41;
+  bool avx;
+  bool avx2;
+
   int dctpitch;
 
   conc::ObjPool <DCTClass> * // Set to 0 if not used
@@ -315,13 +328,34 @@ class PlaneOfBlocks
 
   /* performs an epz search */
   template<typename pixel_t>
-  void PseudoEPZSearch(WorkingArea &workarea);
+  void PseudoEPZSearch(WorkingArea &workarea); // full predictors, slowest, max quality
+
+  /* performs an epz search */
+  template<typename pixel_t>
+  void PseudoEPZSearch_no_pred(WorkingArea& workarea); // planes = 1 recommended
+
+  /* performs an epz search */
+  template<typename pixel_t>
+  void PseudoEPZSearch_glob_med_pred(WorkingArea& workarea); // planes >=2 recommended
 
   // void PhaseShiftSearch(int vx, int vy);
 
   /* performs an exhaustive search */
   template<typename pixel_t>
   void ExpandingSearch(WorkingArea &workarea, int radius, int step, int mvx, int mvy); // diameter = 2*radius + 1
+
+  template<typename pixel_t>
+  void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsics based
+
+  template<typename pixel_t>
+  void ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 C-ref
+
+  template<typename pixel_t>
+  void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsics based
+
+  template<typename pixel_t>
+  void ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search radius 2 C-ref
+
   template<typename pixel_t>
   void Hex2Search(WorkingArea &workarea, int i_me_range);
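For orientation, here is a hedged scalar sketch of what each AVX2 kernel above computes per candidate offset (dx, dy): the plain 8x8 luma SAD between the Src block and the Ref block shifted by (dx, dy), which the kernels store into ArrSADs[dx + r][dy + r] and then scan for the minimum with the penaltyNew cost test, just like the *_C_ref functions. Sad8x8_C is a hypothetical helper written only for illustration; 8-bit samples and byte pitches are assumed.

#include <cstdint>
#include <cstdlib>

// Illustration only: scalar equivalent of one ArrSADs entry produced by the AVX2 kernels.
static unsigned int Sad8x8_C(const uint8_t* pSrc, int nSrcPitch, const uint8_t* pRef, int nRefPitch)
{
  unsigned int sad = 0;
  for (int y = 0; y < 8; ++y)
  {
    for (int x = 0; x < 8; ++x)
      sad += (unsigned int)std::abs((int)pSrc[x] - (int)pRef[x]); // per-pixel absolute difference
    pSrc += nSrcPitch;
    pRef += nRefPitch;
  }
  return sad;
}

In the intrinsics versions each ymm register holds two 8-byte source rows (one per 128-bit lane), the _mm256_blend_epi32 with mask 51 keeps only the first 8 reference bytes of each lane, and _mm256_sad_epu8 plus the add/permute chain reduces the per-row SADs of all 8 rows to the single 16-bit value stored into ArrSADs.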