From 21839274612ff8a7c6434ead104a142b875fcf65 Mon Sep 17 00:00:00 2001 From: DTL2020 <68707763+DTL2020@users.noreply.github.com> Date: Sat, 16 Oct 2021 19:24:03 +0300 Subject: [PATCH 1/3] Added different predictors proc and new AVX2 SAD Added different predictors proc and new AVX2 SAD for ExhaustiveSearch --- Sources/MVAnalyse.cpp | 2 + Sources/PlaneOfBlocks.cpp | 686 +++++++++++++++++++++++++++++++++++++- Sources/PlaneOfBlocks.h | 30 +- 3 files changed, 705 insertions(+), 13 deletions(-) diff --git a/Sources/MVAnalyse.cpp b/Sources/MVAnalyse.cpp index 7d353f7..7ec5627 100644 --- a/Sources/MVAnalyse.cpp +++ b/Sources/MVAnalyse.cpp @@ -258,6 +258,8 @@ MVAnalyse::MVAnalyse( ++nLevelsMax; } +// nLevelsMax = 2; + analysisData.nLvCount = (lv > 0) ? lv : nLevelsMax + lv; if (analysisData.nLvCount > nSuperLevels) { diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp index 33802b6..dc170d9 100644 --- a/Sources/PlaneOfBlocks.cpp +++ b/Sources/PlaneOfBlocks.cpp @@ -103,10 +103,10 @@ PlaneOfBlocks::PlaneOfBlocks(int _nBlkX, int _nBlkY, int _nBlkSizeX, int _nBlkSi freqArray[0].resize(8192 * _nPel * 2); freqArray[1].resize(8192 * _nPel * 2); // for nFlags, we use CPU_xxxx constants instead of Avisynth's CPUF_xxx values, because there are extra bits here - bool sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it - bool sse41 = (bool)(nFlags & CPU_SSE4); - bool avx = (bool)(nFlags & CPU_AVX); - bool avx2 = (bool)(nFlags & CPU_AVX2); + sse2 = (bool)(nFlags & CPU_SSE2); // no tricks for really old processors. If SSE2 is reported, use it + sse41 = (bool)(nFlags & CPU_SSE4); + avx = (bool)(nFlags & CPU_AVX); + avx2 = (bool)(nFlags & CPU_AVX2); // bool ssd = (bool)(nFlags & MOTION_USE_SSD); // bool satd = (bool)(nFlags & MOTION_USE_SATD); @@ -685,7 +685,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) { workarea.predictors[1] = ClipMV(workarea, zeroMVfieldShifted); // v1.11.1 - values instead of pointer } - // fixme note: // MAnalyze mt-inconsistency reason #1 // this is _not_ internal mt friendly, since here up or bottom predictors @@ -703,7 +702,6 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) { workarea.predictors[2] = ClipMV(workarea, zeroMVfieldShifted); } - // Original problem: random, small, rare, mostly irreproducible differences between multiple encodings. // In all, I spent at least a week on the problem during a half year, losing hope // and restarting again four times. Nasty bug it was. 
@@ -747,7 +745,7 @@ void PlaneOfBlocks::FetchPredictors(WorkingArea &workarea) workarea.predictors[0].sad = workarea.predictors[1].sad; } - // if there are no other planes, predictor is the median + // if there are no other planes, predictor is the median if (smallestPlane) { workarea.predictor = workarea.predictors[0]; @@ -811,12 +809,23 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea) } break; case EXHAUSTIVE: { + // ExhaustiveSearch(nSearchParam); int mvx = workarea.bestMV.x; int mvy = workarea.bestMV.y; + if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) + { + ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); + break; + } + if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) + { + ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); + break; + } for (int i = 1; i <= nSearchParam; i++)// region is same as exhaustive, but ordered by radius (from near to far) { - ExpandingSearch(workarea, i, 1, mvx, mvy); + ExpandingSearch(workarea, i, 1, mvx, mvy); } } break; @@ -1097,6 +1106,113 @@ void PlaneOfBlocks::PseudoEPZSearch(WorkingArea& workarea) } +template +void PlaneOfBlocks::PseudoEPZSearch_no_pred(WorkingArea& workarea) +{ + typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t; + // FetchPredictors(workarea); + + sad_t sad; + sad_t saduv; + + + // We treat zero alone + // Do we bias zero with not taking into account distorsion ? + workarea.bestMV.x = zeroMVfieldShifted.x; + workarea.bestMV.y = zeroMVfieldShifted.y; + saduv = (chroma) ? + ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y)); + sad += saduv; + workarea.bestMV.sad = sad; + workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2 + + // then, we refine, according to the search type + Refine(workarea); + + // we store the result + vectors[workarea.blkIdx].x = workarea.bestMV.x; + vectors[workarea.blkIdx].y = workarea.bestMV.y; + vectors[workarea.blkIdx].sad = workarea.bestMV.sad; + + workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used +} + +template +void PlaneOfBlocks::PseudoEPZSearch_glob_med_pred(WorkingArea& workarea) +{ + typedef typename std::conditional < sizeof(pixel_t) == 1, sad_t, bigsad_t >::type safe_sad_t; + FetchPredictors(workarea); + + sad_t sad; + sad_t saduv; + + + // We treat zero alone + // Do we bias zero with not taking into account distorsion ? + workarea.bestMV.x = zeroMVfieldShifted.x; + workarea.bestMV.y = zeroMVfieldShifted.y; + saduv = (chroma) ? + ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, 0, 0), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, 0, 0), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, 0, zeroMVfieldShifted.y)); + sad += saduv; + workarea.bestMV.sad = sad; + workarea.nMinCost = sad + ((penaltyZero * (safe_sad_t)sad) >> 8); // v.1.11.0.2 + + + // Global MV predictor - added by Fizick + workarea.globalMVPredictor = ClipMV(workarea, workarea.globalMVPredictor); + + // if ( workarea.IsVectorOK(workarea.globalMVPredictor.x, workarea.globalMVPredictor.y ) ) + { + saduv = (chroma) ? 
+ ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.globalMVPredictor.x, workarea.globalMVPredictor.y)); + sad += saduv; + sad_t cost = sad + ((pglobal * (safe_sad_t)sad) >> 8); + + if (cost < workarea.nMinCost || tryMany) + { + workarea.bestMV.x = workarea.globalMVPredictor.x; + workarea.bestMV.y = workarea.globalMVPredictor.y; + workarea.bestMV.sad = sad; + workarea.nMinCost = cost; + } + + // } + // Then, the predictor : + // if ( (( workarea.predictor.x != zeroMVfieldShifted.x ) || ( workarea.predictor.y != zeroMVfieldShifted.y )) + // && (( workarea.predictor.x != workarea.globalMVPredictor.x ) || ( workarea.predictor.y != workarea.globalMVPredictor.y ))) + // { + saduv = (chroma) ? ScaleSadChroma(SADCHROMA(workarea.pSrc[1], nSrcPitch[1], GetRefBlockU(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[1]) + + SADCHROMA(workarea.pSrc[2], nSrcPitch[2], GetRefBlockV(workarea, workarea.predictor.x, workarea.predictor.y), nRefPitch[2]), effective_chromaSADscale) : 0; + sad = LumaSAD(workarea, GetRefBlock(workarea, workarea.predictor.x, workarea.predictor.y)); + sad += saduv; + + cost = sad; + if (cost < workarea.nMinCost || tryMany) + { + workarea.bestMV.x = workarea.predictor.x; + workarea.bestMV.y = workarea.predictor.y; + workarea.bestMV.sad = sad; + workarea.nMinCost = cost; + } + + } + + // then, we refine, according to the search type + Refine(workarea); + + // we store the result + vectors[workarea.blkIdx].x = workarea.bestMV.x; + vectors[workarea.blkIdx].y = workarea.bestMV.y; + vectors[workarea.blkIdx].sad = workarea.bestMV.sad; + + workarea.planeSAD += workarea.bestMV.sad; // for debug, plus fixme outer planeSAD is not used +} template void PlaneOfBlocks::DiamondSearch(WorkingArea &workarea, int length) @@ -1323,9 +1439,8 @@ void PlaneOfBlocks::OneTimeSearch(WorkingArea &workarea, int length) template void PlaneOfBlocks::ExpandingSearch(WorkingArea &workarea, int r, int s, int mvx, int mvy) // diameter = 2*r + 1, step=s { // part of true enhaustive search (thin expanding square) around mvx, mvy - int i, j; - // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1 - + int i, j; + // VECTOR mv = workarea.bestMV; // bug: it was pointer assignent, not values, so iterative! - v2.1 // sides of square without corners for (i = -r + s; i < r; i += s) // without corners! - v2.1 { @@ -3055,7 +3170,10 @@ void PlaneOfBlocks::search_mv_slice(Slicer::TaskData &td) workarea.predictors[4] = ClipMV(workarea, zeroMV); } - PseudoEPZSearch(workarea); + // Possible point of placement selection of 'predictiors control' + PseudoEPZSearch(workarea); // all predictors +// PseudoEPZSearch_glob_med_pred(workarea); // partial predictors +// PseudoEPZSearch_no_pred(workarea); // no predictiors // workarea.bestMV = zeroMV; // debug if (outfilebuf != NULL) // write vector to outfile @@ -3641,3 +3759,547 @@ PlaneOfBlocks::WorkingArea *PlaneOfBlocks::WorkingAreaFactory::do_create() } +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check ! 
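+  // What this computes (scalar reference sketch, assuming 8-bit luma samples and the Ref buffer
+  // padding noted below): SADs of the 8x8 Src block against every candidate offset in a
+  // -4..+3 horizontal by -4..+4 vertical window around (mvx, mvy), accumulated with
+  // _mm256_sad_epu8 over row pairs and stored as 16-bit values in ArrSADs, followed by a
+  // scalar pass that picks the minimum and applies the penaltyNew cost. Roughly:
+  //   for (int dy = -4; dy <= 4; dy++)
+  //     for (int dx = -4; dx <= 3; dx++) {
+  //       int sad = 0;
+  //       for (int y = 0; y < 8; y++)
+  //         for (int x = 0; x < 8; x++)
+  //           sad += std::abs((int)pucCurr[y * nSrcPitch[0] + x] - (int)pucRef[(dy + 4 + y) * nRefPitch[0] + (dx + 4) + x]);
+  //       ArrSADs[dx + 4][dy + 4] = (unsigned short)sad;
+  //     }
+  // The _sp2 variant below follows the same scheme over a smaller 5x5 window.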
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + int dbr01 = 0; + return; + } + + // array of sads 8x8 + // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search + unsigned short ArrSADs[8][9]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x16bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 9; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-4,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), 
ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -4,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -3,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = 
_mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[0,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[5][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + 
ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[6][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 3,i-4 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[7][i], _mm256_castsi256_si128(ymm4_tmp)); + } + + _mm256_zeroupper(); + + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -4; x < 4; x++) + { + for (int y = -4; y < 5; y++) + { + if (ArrSADs[x+4][y+4] < minsad) + { + minsad = ArrSADs[x+4][y+4]; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew*minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; + +} + +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check !! 
need to fix caller to now allow illegal vectors + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + + // array of sads 5x5 + unsigned short ArrSADs[5][5]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + + +// __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work +// __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work +// my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 5; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + 
ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[0][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[1][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-0,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[2][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 
51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[3][i], _mm256_castsi256_si128(ymm4_tmp)); + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-2 ready in low of mm4 + _mm_storeu_si16(&ArrSADs[4][i], _mm256_castsi256_si128(ymm4_tmp)); + } + + _mm256_zeroupper(); + + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -2; x < 2; x++) + { + for (int y = -2; y < 2; y++) + { + if (ArrSADs[x+2][y+2] < minsad) + { + minsad = ArrSADs[x+2][y+2]; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew*minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; +} diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h index 718e16f..5714cd9 100644 --- a/Sources/PlaneOfBlocks.h +++ b/Sources/PlaneOfBlocks.h @@ -15,6 +15,14 @@ // Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA, or visit // http://www.gnu.org/copyleft/gpl.html . +#if defined (__GNUC__) && ! 
defined (__INTEL_COMPILER) +#include +// x86intrin.h includes header files for whatever instruction +// sets are specified on the compiler command line, such as: xopintrin.h, fma4intrin.h +#else +#include // MS version of immintrin.h covers AVX, AVX2 and FMA3 +#endif // __GNUC__ + #ifndef __POBLOCKS__ #define __POBLOCKS__ @@ -137,6 +145,11 @@ class PlaneOfBlocks bool isse; /* can we use isse asm code */ bool chroma; /* do we do chroma me */ + bool sse2; // make members now to use in SADs + bool sse41; + bool avx; + bool avx2; + int dctpitch; conc::ObjPool * // Set to 0 if not used @@ -315,13 +328,28 @@ class PlaneOfBlocks /* performs an epz search */ template - void PseudoEPZSearch(WorkingArea &workarea); + void PseudoEPZSearch(WorkingArea &workarea); // full predictors, slowest, max quality + + /* performs an epz search */ + template + void PseudoEPZSearch_no_pred(WorkingArea& workarea); // planes = 1 recommended + + /* performs an epz search */ + template + void PseudoEPZSearch_glob_med_pred(WorkingArea& workarea); // planes >=2 recommended // void PhaseShiftSearch(int vx, int vy); /* performs an exhaustive search */ template void ExpandingSearch(WorkingArea &workarea, int radius, int step, int mvx, int mvy); // diameter = 2*radius + 1 + + template + void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based + + template + void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based + template void Hex2Search(WorkingArea &workarea, int i_me_range); From c9d7636d6676613261ff32262ea10375d093881d Mon Sep 17 00:00:00 2001 From: DTL2020 <68707763+DTL2020@users.noreply.github.com> Date: Tue, 19 Oct 2021 15:29:53 +0300 Subject: [PATCH 2/3] Added new more AVX2 Esa search sp2 and sp4 Added new more AVX2 Esa search sp2 and sp4. One pass only without immediate array of SADs. --- Sources/PlaneOfBlocks.cpp | 657 +++++++++++++++++++++++++++++++++++++- Sources/PlaneOfBlocks.h | 6 + 2 files changed, 661 insertions(+), 2 deletions(-) diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp index dc170d9..c34135e 100644 --- a/Sources/PlaneOfBlocks.cpp +++ b/Sources/PlaneOfBlocks.cpp @@ -815,12 +815,14 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea) int mvy = workarea.bestMV.y; if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) { - ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); +// ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); // old version with Arr of SADs for second pass + ExhaustiveSearch8x8_sp2_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search break; } if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma) { - ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); +// ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy); + ExhaustiveSearch8x8_sp4_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search break; } for (int i = 1; i <= nSearchParam; i++)// region is same as exhaustive, but ordered by radius (from near to far) @@ -4074,6 +4076,387 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, } +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check ! 
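+  // Same -4..+3 by -4..+4 window as ExhaustiveSearch8x8_sp4_avx2, but in a single pass: instead
+  // of an intermediate ArrSADs array, the running minimum is kept packed in ymm14_yx_minsad
+  // (SAD in the low 32-bit element, the x/y candidate counters in the 16-bit words above it,
+  // copied from ymm15_yx_cnt) and updated after every candidate with a compare/blend.
+  // Scalar equivalent of that per-candidate update (sketch):
+  //   if (cur_sad < min_sad) { min_sad = cur_sad; min_x = cnt_x; min_y = cnt_y; }
+  //   cnt_x++; // and cnt_x is cleared / cnt_y incremented after the last column of each row
+  // The best vector is finally read back from SIMD256Res as mvx + min_x - 4, mvy + min_y - 4.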
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) + { + int dbr01 = 0; + return; + } +/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + int dbr01 = 0; + return; + } + */ // need to check if it is non needed cheks + + // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search + alignas(256) unsigned short SIMD256Res[16]; + + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); + __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit + __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 9; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] 
* (i + 4))); + ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-4,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -4,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -3,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if 
needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-4 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp 
= _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[0,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-4 
ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[3,i-4] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, 
ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 3,i-4 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord + + // clear x cord to 0 + ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); + ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); + + } // rows counter + + // store result + _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); + + _mm256_zeroupper(); // check if it may be done _after_ all searches ??? +/* + SIMD256Res[0]; // minsad; + SIMD256Res[2] - 2; // x of minsad vector + SIMD256Res[3] - 2; // y of minsad vector + )*/ + + unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + SIMD256Res[2] - 4; + workarea.bestMV.y = mvy + SIMD256Res[3] - 4; + workarea.nMinCost = cost; + workarea.bestMV.sad = SIMD256Res[0]; + +} + template void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) { @@ -4303,3 +4686,273 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, workarea.nMinCost = cost; workarea.bestMV.sad = minsad; } + + +template +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy) +{ + // debug check !! 
need to fix caller to now allow illegal vectors + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + int dbr01 = 0; + return; + } +/* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + int dbr01 = 0; + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + int dbr01 = 0; + return; + } + */ // - check if it is OK to skip these cheks + + alignas(256) unsigned short SIMD256Res[16]; + const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner + const uint8_t* pucCurr = workarea.pSrc[0]; + + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; + + __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm + __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; + __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); + __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); + __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; + __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit + __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 + + // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work + // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes + + xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); + xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); + ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); + + xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); + xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); + ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); + + xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); + xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); + ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); + + xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); + xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); + ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); + // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms + + for (int i = 0; i < 5; i++) + { + ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); + ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); + ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); + ymm3_Ref_67 = 
_mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); + // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + + // process sad[-2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -2,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad -1,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = 
_mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[-0,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 0,i-2 ready in low of ymm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[1,-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 1,i-2 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 
32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord + + // rotate Ref to 1 samples + ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); + ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); + ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); + ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); + + // process sad[2,i-2] + ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); + ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); + ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); + ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); + + ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); + ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); + ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); + ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); + + ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); + + ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); + // sad 2,i-2 ready in low of mm4 + //check if min and replace in ymm14_yx_minsad + ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed + ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist + ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than + ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed + ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than + + ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord + + // clear x cord to 0 + ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); + ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); + + } // rows counter + + // store result + _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); + + _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
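+ // _mm256_zeroupper() here avoids the AVX->SSE transition penalty in any non-VEX code that follows;
+ // moving it to after all searches (as the note above asks) may be a small optimisation.
+ // Result layout in ymm14_yx_minsad / SIMD256Res (16-bit words): words 0..1 = 32-bit minimum SAD
+ // (word 1 stays zero for 8x8 8-bit SAD), word 2 = x step counter, word 3 = y step counter,
+ // both counted from 0 starting at the upper left corner (mvx - 2, mvy - 2) of the 5x5 scan,
+ // hence the "- 2" offsets applied below when bestMV is updated.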
+/*
+ SIMD256Res[0]; // minsad;
+ SIMD256Res[2] - 2; // x of minsad vector
+ SIMD256Res[3] - 2; // y of minsad vector
+ */
+
+ unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8);
+ if (cost >= workarea.nMinCost) return;
+
+ workarea.bestMV.x = mvx + SIMD256Res[2] - 2;
+ workarea.bestMV.y = mvy + SIMD256Res[3] - 2;
+ workarea.nMinCost = cost;
+ workarea.bestMV.sad = SIMD256Res[0];
+}
diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h
index 5714cd9..5608084 100644
--- a/Sources/PlaneOfBlocks.h
+++ b/Sources/PlaneOfBlocks.h
@@ -347,9 +347,15 @@ class PlaneOfBlocks
 template
 void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsics based
+ template
+ void ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsics based
+
 template
 void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsics based
+ template
+ void ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsics based
+
 template
 void Hex2Search(WorkingArea &workarea, int i_me_range);
From c12a9047835745ac022bc06e00ecd4a853e8509d Mon Sep 17 00:00:00 2001
From: DTL2020 <68707763+DTL2020@users.noreply.github.com>
Date: Wed, 20 Oct 2021 20:01:41 +0300
Subject: [PATCH 3/3] Added C-ref functions, fixed some bugs, +
 Added C-ref functions (using SAD() as the old code path), fixed a bug with the end bounds of the ArrSADs scan for sp2, added a prefetch attempt.
---
 Sources/MVAnalyse.cpp | 2 +-
 Sources/PlaneOfBlocks.cpp | 772 ++++++--------------------------------
 Sources/PlaneOfBlocks.h | 6 +-
 3 files changed, 111 insertions(+), 669 deletions(-)
diff --git a/Sources/MVAnalyse.cpp b/Sources/MVAnalyse.cpp
index 7ec5627..fe68925 100644
--- a/Sources/MVAnalyse.cpp
+++ b/Sources/MVAnalyse.cpp
@@ -258,7 +258,7 @@ MVAnalyse::MVAnalyse(
 ++nLevelsMax;
 }
-// nLevelsMax = 2;
+ nLevelsMax = 2;
 analysisData.nLvCount = (lv > 0) ?
lv : nLevelsMax + lv;
 if (analysisData.nLvCount > nSuperLevels)
diff --git a/Sources/PlaneOfBlocks.cpp b/Sources/PlaneOfBlocks.cpp
index c34135e..4e19490 100644
--- a/Sources/PlaneOfBlocks.cpp
+++ b/Sources/PlaneOfBlocks.cpp
@@ -815,14 +815,14 @@ void PlaneOfBlocks::Refine(WorkingArea &workarea)
 int mvy = workarea.bestMV.y;
 if (nSearchParam == 2 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
 {
-// ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy); // old version with Arr of SADs for second pass
- ExhaustiveSearch8x8_sp2_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search
+ ExhaustiveSearch8x8_sp2_avx2(workarea, mvx, mvy);
+ //ExhaustiveSearch8x8_sp2_C_ref(workarea, mvx, mvy);
 break;
 }
 if (nSearchParam == 4 && nBlkSizeX == 8 && nBlkSizeY == 8 && avx2 && !chroma)
 {
-// ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy);
- ExhaustiveSearch8x8_sp4_avx2_2(workarea, mvx, mvy); // test of new fully-AVX2 one-pass Exa search
+ ExhaustiveSearch8x8_sp4_avx2(workarea, mvx, mvy);
+ //ExhaustiveSearch8x8_sp4_C_ref(workarea, mvx, mvy);
 break;
 }
 for (int i = 1; i <= nSearchParam; i++)// region is the same as exhaustive, but ordered by radius (from near to far)
@@ -3173,8 +3173,8 @@ void PlaneOfBlocks::search_mv_slice(Slicer::TaskData &td)
 }
 // Possible point of placement for the 'predictors control' selection
- PseudoEPZSearch(workarea); // all predictors
-// PseudoEPZSearch_glob_med_pred(workarea); // partial predictors
+// PseudoEPZSearch(workarea); // all predictors
+ PseudoEPZSearch_glob_med_pred(workarea); // partial predictors
 // PseudoEPZSearch_no_pred(workarea); // no predictors
 // workarea.bestMV = zeroMV; // debug
@@ -3760,6 +3760,54 @@ PlaneOfBlocks::WorkingArea *PlaneOfBlocks::WorkingAreaFactory::do_create()
 ));
 }
+template
+void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy)
+{
+ // debug check !
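+ // Plain C reference for the radius-4 exhaustive search, intended to cross-check the AVX2 path:
+ // it scans x in [-4..+3] and y in [-4..+4] (the same window the AVX2 version covers because of
+ // the 256-bit register width limit), computes the luma SAD() for every position, keeps the
+ // minimum, applies the penaltyNew cost and updates bestMV only if the cost beats workarea.nMinCost.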
+ // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy + 4)) + { + return; + } + /* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) + { + return; + } + */ + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -4; x < 4; x++) + { + for (int y = -4; y < 5; y++) + { + int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]); + if (sad < minsad) + { + minsad = sad; + x_minsad = x; + y_minsad = y; + } + } + } + + unsigned short cost = minsad + ((penaltyNew * minsad) >> 8); + if (cost >= workarea.nMinCost) return; + + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; + +} template void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy) @@ -3768,25 +3816,21 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) { - int dbr01 = 0; return; } - if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) +/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) { - int dbr01 = 0; return; } - + */ // array of sads 8x8 // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search unsigned short ArrSADs[8][9]; @@ -3828,6 +3872,7 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + _mm_prefetch(const_cast(reinterpret_cast(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row // process sad[-4,i-4] ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); @@ -4077,386 +4122,55 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, } template -void PlaneOfBlocks::ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy) +void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy) { - // debug check ! 
- // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) - if (!workarea.IsVectorOK(mvx - 4, mvy - 4)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 3, mvy + 3)) - { - int dbr01 = 0; - return; - } -/* if (!workarea.IsVectorOK(mvx - 4, mvy + 3)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 3, mvy - 4)) - { - int dbr01 = 0; - return; - } - */ // need to check if it is non needed cheks - - // due to 256bit registers limit is actually -4..+3 H search around zero, but full -4 +4 V search - alignas(256) unsigned short SIMD256Res[16]; - - const uint8_t* pucRef = GetRefBlock(workarea, mvx - 4, mvy - 4); // upper left corner - const uint8_t* pucCurr = workarea.pSrc[0]; + // debug check ! + // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) + if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) + { + return; + } + /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + { + return; + } + if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) + { + return; + } + */ + unsigned short minsad = 65535; + int x_minsad = 0; + int y_minsad = 0; + for (int x = -2; x < 3; x++) + { + for (int y = -2; y < 3; y++) + { + int sad = SAD(workarea.pSrc[0], nSrcPitch[0], GetRefBlock(workarea, mvx + x, mvy + y), nRefPitch[0]); + if (sad < minsad) + { + minsad = sad; + x_minsad = x; + y_minsad = y; + } + } + } - __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; - - __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm - __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; - __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); - __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); - __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; - __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit - __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 - - // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes - - xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); - xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); - ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); - - xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); - xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); - ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); - - xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); - xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); - ymm12_Src_45 = 
_mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); - - xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); - xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); - ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); - // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms - - for (int i = 0; i < 9; i++) - { - ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); - ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); - ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); - ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); - // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 - - // process sad[-4,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -4,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-3,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = 
_mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -3,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-2,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -2,i-4 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-1,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), 
ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -1,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[0,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 0,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // 
process sad[1,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 1,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[2,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 2,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - 
// rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[3,i-4] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 3,i-4 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord - - // clear x cord to 0 - ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); - ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); - - } // rows counter - - // store result - _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); - - _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
-/* - SIMD256Res[0]; // minsad; - SIMD256Res[2] - 2; // x of minsad vector - SIMD256Res[3] - 2; // y of minsad vector - )*/ - - unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); - if (cost >= workarea.nMinCost) return; + unsigned short cost = minsad + ((penaltyNew * minsad) >> 8); + if (cost >= workarea.nMinCost) return; - workarea.bestMV.x = mvx + SIMD256Res[2] - 4; - workarea.bestMV.y = mvy + SIMD256Res[3] - 4; - workarea.nMinCost = cost; - workarea.bestMV.sad = SIMD256Res[0]; + workarea.bestMV.x = mvx + x_minsad; + workarea.bestMV.y = mvy + y_minsad; + workarea.nMinCost = cost; + workarea.bestMV.sad = minsad; } + template void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy) { @@ -4464,30 +4178,27 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) { - int dbr01 = 0; return; } - if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) + // still not sure if it good for speed or not + /* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) { - int dbr01 = 0; return; } if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) { - int dbr01 = 0; return; } - + */ // array of sads 5x5 unsigned short ArrSADs[5][5]; const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner const uint8_t* pucCurr = workarea.pSrc[0]; - + __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm @@ -4524,6 +4235,7 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); ymm3_Ref_67 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 + _mm_prefetch(const_cast(reinterpret_cast(pucRef + nRefPitch[0] * (i + 8))), _MM_HINT_NTA); // prefetch next Ref row // process sad[-2,i-2] ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); @@ -4665,9 +4377,9 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, unsigned short minsad = 65535; int x_minsad = 0; int y_minsad = 0; - for (int x = -2; x < 2; x++) + for (int x = -2; x < 3; x++) { - for (int y = -2; y < 2; y++) + for (int y = -2; y < 3; y++) { if (ArrSADs[x+2][y+2] < minsad) { @@ -4686,273 +4398,3 @@ void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, workarea.nMinCost = cost; workarea.bestMV.sad = minsad; } - - -template -void PlaneOfBlocks::ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy) -{ - // debug check !! 
need to fix caller to now allow illegal vectors - // idea - may be not 4 checks are required - only upper left corner (starting addresses of buffer) and lower right (to not over-run atfer end of buffer - need check/test) - if (!workarea.IsVectorOK(mvx - 2, mvy - 2)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 2, mvy + 2)) - { - int dbr01 = 0; - return; - } -/* if (!workarea.IsVectorOK(mvx - 2, mvy + 2)) - { - int dbr01 = 0; - return; - } - if (!workarea.IsVectorOK(mvx + 2, mvy - 2)) - { - int dbr01 = 0; - return; - } - */ // - check if it is OK to skip these cheks - - alignas(256) unsigned short SIMD256Res[16]; - const uint8_t* pucRef = GetRefBlock(workarea, mvx - 2, mvy - 2); // upper left corner - const uint8_t* pucCurr = workarea.pSrc[0]; - - __m128i xmm10_Src_01, xmm11_Src_23, xmm12_Src_45, xmm13_Src_67; - - __m256i ymm0_Ref_01, ymm1_Ref_23, ymm2_Ref_45, ymm3_Ref_67; // 2x12bytes store, require buf padding to allow 16bytes reads to xmm - __m256i ymm4_tmp, ymm5_tmp, ymm6_tmp, ymm7_tmp; - __m256i ymm8_x1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0); - __m256i ymm9_y1 = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0); - __m256i ymm10_Src_01, ymm11_Src_23, ymm12_Src_45, ymm13_Src_67; - __m256i ymm14_yx_minsad = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 1073741824); // 2^30 large signed 32bit - __m256i ymm15_yx_cnt = _mm256_setzero_si256(); // y,x coords of search count from 0 - - // __m256i my_ymm_test = _mm256_set_epi8(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // __m256i my_ymm_test = _mm256_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);// -debug values for check how permutes work - // my_ymm_test = _mm256_alignr_epi8(my_ymm_test, my_ymm_test, 2); // rotate each half of ymm by number of bytes - - xmm10_Src_01 = _mm_loadu_si64((__m128i*)pucCurr); - xmm10_Src_01 = _mm_unpacklo_epi64(xmm10_Src_01, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0]))); - ymm10_Src_01 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm10_Src_01), 220); - - xmm11_Src_23 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 2)); - xmm11_Src_23 = _mm_unpacklo_epi64(xmm11_Src_23, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 3))); - ymm11_Src_23 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm11_Src_23), 220); - - xmm12_Src_45 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 4)); - xmm12_Src_45 = _mm_unpacklo_epi64(xmm12_Src_45, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 5))); - ymm12_Src_45 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm12_Src_45), 220); - - xmm13_Src_67 = _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 6)); - xmm13_Src_67 = _mm_unpacklo_epi64(xmm13_Src_67, _mm_loadu_si64((__m128i*)(pucCurr + nSrcPitch[0] * 7))); - ymm13_Src_67 = _mm256_permute4x64_epi64(_mm256_castsi128_si256(xmm13_Src_67), 220); - // loaded 8 rows of 8x8 Src block, now in low and high 128bits of ymms - - for (int i = 0; i < 5; i++) - { - ymm0_Ref_01 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 1)), (__m128i*)(pucRef + nRefPitch[0] * (i + 0))); - ymm1_Ref_23 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 3)), (__m128i*)(pucRef + nRefPitch[0] * (i + 2))); - ymm2_Ref_45 = _mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 5)), (__m128i*)(pucRef + nRefPitch[0] * (i + 4))); - ymm3_Ref_67 = 
_mm256_loadu2_m128i((__m128i*)(pucRef + nRefPitch[0] * (i + 7)), (__m128i*)(pucRef + nRefPitch[0] * (i + 6))); - // loaded 8 rows of Ref plane 16samples wide into ymm0..ymm3 - - // process sad[-2,i-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -2,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-1,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad -1,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = 
_mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[-0,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 0,i-2 ready in low of ymm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[1,-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 1,i-2 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 
32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm8_x1);//increment x coord - - // rotate Ref to 1 samples - ymm0_Ref_01 = _mm256_alignr_epi8(ymm0_Ref_01, ymm0_Ref_01, 1); - ymm1_Ref_23 = _mm256_alignr_epi8(ymm1_Ref_23, ymm1_Ref_23, 1); - ymm2_Ref_45 = _mm256_alignr_epi8(ymm2_Ref_45, ymm2_Ref_45, 1); - ymm3_Ref_67 = _mm256_alignr_epi8(ymm3_Ref_67, ymm3_Ref_67, 1); - - // process sad[2,i-2] - ymm4_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm0_Ref_01, 51); - ymm5_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm1_Ref_23, 51); - ymm6_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm2_Ref_45, 51); - ymm7_tmp = _mm256_blend_epi32(_mm256_setzero_si256(), ymm3_Ref_67, 51); - - ymm4_tmp = _mm256_sad_epu8(ymm4_tmp, ymm10_Src_01); - ymm5_tmp = _mm256_sad_epu8(ymm5_tmp, ymm11_Src_23); - ymm6_tmp = _mm256_sad_epu8(ymm6_tmp, ymm12_Src_45); - ymm7_tmp = _mm256_sad_epu8(ymm7_tmp, ymm13_Src_67); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - ymm6_tmp = _mm256_add_epi32(ymm6_tmp, ymm7_tmp); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm6_tmp); - - ymm5_tmp = _mm256_permute2f128_si256(ymm4_tmp, ymm4_tmp, 65); - - ymm4_tmp = _mm256_add_epi32(ymm4_tmp, ymm5_tmp); - // sad 2,i-2 ready in low of mm4 - //check if min and replace in ymm14_yx_minsad - ymm5_tmp = _mm256_cmpgt_epi32(ymm14_yx_minsad, ymm4_tmp); // mask in 0 16bit word for update minsad if needed - ymm5_tmp = _mm256_slli_epi64(ymm5_tmp, 32);// clear higher bits of mask - may be better method exist - ymm5_tmp = _mm256_srli_epi64(ymm5_tmp, 32);// clear higher bits of mask - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm4_tmp, ymm5_tmp); // blend minsad if not greater than - ymm5_tmp = _mm256_shufflelo_epi16(ymm5_tmp, 10); // move/broadcast mask from 0 to 2 and 3 16bit word for update y,x if needed - ymm14_yx_minsad = _mm256_blendv_epi8(ymm14_yx_minsad, ymm15_yx_cnt, ymm5_tmp); // blend y,x of minsad if not greater than - - ymm15_yx_cnt = _mm256_add_epi16(ymm15_yx_cnt, ymm9_y1);//increment y coord - - // clear x cord to 0 - ymm15_yx_cnt = _mm256_srli_epi64(ymm15_yx_cnt, 48); - ymm15_yx_cnt = _mm256_slli_epi64(ymm15_yx_cnt, 48); - - } // rows counter - - // store result - _mm256_store_si256((__m256i*)&SIMD256Res[0], ymm14_yx_minsad); - - _mm256_zeroupper(); // check if it may be done _after_ all searches ??? 
-/* - SIMD256Res[0]; // minsad; - SIMD256Res[2] - 2; // x of minsad vector - SIMD256Res[3] - 2; // y of minsad vector - )*/ - - unsigned short cost = SIMD256Res[0] + ((penaltyNew * SIMD256Res[0]) >> 8); - if (cost >= workarea.nMinCost) return; - - workarea.bestMV.x = mvx + SIMD256Res[2] - 2; - workarea.bestMV.y = mvy + SIMD256Res[3] - 2; - workarea.nMinCost = cost; - workarea.bestMV.sad = SIMD256Res[0]; -} diff --git a/Sources/PlaneOfBlocks.h b/Sources/PlaneOfBlocks.h index 5608084..fd7d0d8 100644 --- a/Sources/PlaneOfBlocks.h +++ b/Sources/PlaneOfBlocks.h @@ -346,15 +346,15 @@ class PlaneOfBlocks template void ExhaustiveSearch8x8_sp4_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based - + template - void ExhaustiveSearch8x8_sp4_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 intrinsincs based + void ExhaustiveSearch8x8_sp4_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 4 C-ref template void ExhaustiveSearch8x8_sp2_avx2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based template - void ExhaustiveSearch8x8_sp2_avx2_2(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search AVX2 radius 2 intrinsincs based + void ExhaustiveSearch8x8_sp2_C_ref(WorkingArea& workarea, int mvx, int mvy); // 8x8 esa search radius 2 C-ref template