/* The copyright in this software is being made available under the BSD
 * License, included below. This software may be subject to other third party
 * and contributor rights, including patent rights, and no such rights are
 * granted under this license.
 *
 * Copyright (c) 2010-2023, ITU/ISO/IEC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */
/** \file     RdCostX86.cpp
    \brief    RD cost computation class, SIMD version
*/

#include <math.h>
#include <limits>

#include "CommonDefX86.h"
#include "../Rom.h"
#include "../RdCost.h"

#ifdef TARGET_SIMD_X86

typedef Pel Torg;
typedef Pel Tcur;

#if !RExt__HIGH_BIT_DEPTH_SUPPORT
inline __m128i getSse1(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2, const ptrdiff_t strideSrc2,
                       const int rows, const int shift)
{
  static_assert(sizeof(Pel) == 2, "Pel must be 16-bit wide");

  uint32_t sum = 0;

  for (int y = 0; y < rows; y++)
  {
    const uint16_t v1   = pSrc1[y * strideSrc1];
    const uint16_t v2   = pSrc2[y * strideSrc2];
    const int16_t  diff = v1 - v2;
    const uint32_t res  = diff * diff >> shift;

    sum += res;
  }

  return _mm_cvtsi32_si128(sum);
}

inline __m128i getSse2(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2, const ptrdiff_t strideSrc2,
                       const int rows, const int shift)
{
  static_assert(sizeof(Pel) == 2, "Pel must be 16-bit wide");

  __m128i sum = _mm_setzero_si128();

  for (int y = 0; y < rows; y += 2)
  {
    const uint32_t v1a = *(uint32_t *) (pSrc1 + y * strideSrc1);
    const uint32_t v1b = *(uint32_t *) (pSrc1 + y * strideSrc1 + strideSrc1);
    const uint32_t v2a = *(uint32_t *) (pSrc2 + y * strideSrc2);
    const uint32_t v2b = *(uint32_t *) (pSrc2 + y * strideSrc2 + strideSrc2);

    const __m128i src1 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(v1a), _mm_cvtsi32_si128(v1b));
    const __m128i src2 = _mm_unpacklo_epi64(_mm_cvtsi32_si128(v2a), _mm_cvtsi32_si128(v2b));

    const __m128i diff = _mm_sub_epi16(src1, src2);
    const __m128i res  = _mm_sra_epi32(_mm_madd_epi16(diff, diff), _mm_cvtsi32_si128(shift));

    sum = _mm_add_epi32(sum, res);
  }

  return sum;
}

inline __m128i getSse4(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2, const ptrdiff_t strideSrc2,
                       const int rows, const int shift)
{
  static_assert(sizeof(Pel) == 2, "Pel must be 16-bit wide");

  __m128i sum = _mm_setzero_si128();

  for (int y = 0; y < rows; y++)
  {
    const __m128i src1 = _mm_loadl_epi64((const __m128i *) (pSrc1 + y * strideSrc1));
    const __m128i src2 = _mm_loadl_epi64((const __m128i *) (pSrc2 + y * strideSrc2));

    const __m128i diff = _mm_sub_epi16(src1, src2);
    const __m128i res  = _mm_sra_epi32(_mm_madd_epi16(diff, diff), _mm_cvtsi32_si128(shift));

    sum = _mm_add_epi32(sum, res);
  }

  return _mm_cvtepu32_epi64(sum);
}

inline __m128i getSse8(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2, const ptrdiff_t strideSrc2,
                       const int rows, const int shift)
{
  static_assert(sizeof(Pel) == 2, "Pel must be 16-bit wide");

  __m128i sum = _mm_setzero_si128();

  for (int y = 0; y < rows; y++)
  {
    const __m128i src1 = _mm_loadu_si128((const __m128i *) (pSrc1 + y * strideSrc1));
    const __m128i src2 = _mm_loadu_si128((const __m128i *) (pSrc2 + y * strideSrc2));

    const __m128i diff = _mm_sub_epi16(src1, src2);
    const __m128i res  = _mm_sra_epi32(_mm_madd_epi16(diff, diff), _mm_cvtsi32_si128(shift));

    sum = _mm_add_epi32(sum, res);
  }

  return _mm_add_epi64(_mm_cvtepu32_epi64(sum), _mm_unpackhi_epi32(sum, _mm_setzero_si128()));
}
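// Illustrative scalar reference sketch (hypothetical helper, not used by the encoder): the getSseN
// helpers above accumulate, for an N-sample-wide column strip, the sum over all rows of
// ((org - cur)^2 >> shift). Note that getSse1 applies the shift per sample, while the wider kernels
// shift the per-pair sums produced by _mm_madd_epi16, so results can differ marginally when
// shift > 0. A plain-C reference for the per-sample variant:
static inline uint64_t getSseScalarRef(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2,
                                       const ptrdiff_t strideSrc2, const int cols, const int rows, const int shift)
{
  uint64_t sum = 0;
  for (int y = 0; y < rows; y++)
  {
    for (int x = 0; x < cols; x++)
    {
      const int diff = pSrc1[y * strideSrc1 + x] - pSrc2[y * strideSrc2 + x];
      // square first, then apply the bit-depth dependent shift, as in getSse1
      sum += uint32_t(diff * diff) >> shift;
    }
  }
  return sum;
}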
#ifdef USE_AVX2
inline __m128i getSse16(const Pel *pSrc1, const ptrdiff_t strideSrc1, const Pel *pSrc2, const ptrdiff_t strideSrc2,
                        const int rows, const int shift)
{
  static_assert(sizeof(Pel) == 2, "Pel must be 16-bit wide");

  __m256i sum = _mm256_setzero_si256();

  for (int y = 0; y < rows; y++)
  {
    const __m256i src1 = _mm256_loadu_si256((const __m256i *) (pSrc1 + y * strideSrc1));
    const __m256i src2 = _mm256_loadu_si256((const __m256i *) (pSrc2 + y * strideSrc2));

    const __m256i diff = _mm256_sub_epi16(src1, src2);
    const __m256i res  = _mm256_sra_epi32(_mm256_madd_epi16(diff, diff), _mm_cvtsi32_si128(shift));

    sum = _mm256_add_epi32(sum, res);
  }

  sum = _mm256_add_epi64(_mm256_unpacklo_epi32(sum, _mm256_setzero_si256()),
                         _mm256_unpackhi_epi32(sum, _mm256_setzero_si256()));

  return _mm_add_epi64(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
}
#endif

template<X86_VEXT vext>
Distortion RdCost::xGetSSE_SIMD( const DistParam &rcDtParam )
{
  if (rcDtParam.applyWeight)
  {
    return RdCostWeightPrediction::xGetSSEw(rcDtParam);
  }

  const int rows = rcDtParam.org.height;
  const int cols = rcDtParam.org.width;

  const Pel *pSrc1 = rcDtParam.org.buf;
  const Pel *pSrc2 = rcDtParam.cur.buf;

  const ptrdiff_t strideSrc1 = rcDtParam.org.stride;
  const ptrdiff_t strideSrc2 = rcDtParam.cur.stride;

  const uint32_t shift = 2 * DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  __m128i sum = _mm_setzero_si128();

  if ((cols & 1) != 0)
  {
    for (int x = 0; x < cols; x += 1)
    {
      sum = _mm_add_epi64(sum, getSse1(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
    }
  }
  else if ((cols & 2) != 0)
  {
    for (int x = 0; x < cols; x += 2)
    {
      sum = _mm_add_epi64(sum, getSse2(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
    }
  }
  else if ((cols & 4) != 0)
  {
    for (int x = 0; x < cols; x += 4)
    {
      sum = _mm_add_epi64(sum, getSse4(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
    }
  }
  else
  {
#ifdef USE_AVX2
    if (vext >= AVX2 && (cols & 15) == 0)
    {
      for (int x = 0; x < cols; x += 16)
      {
        sum = _mm_add_epi64(sum, getSse16(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
      }
    }
    else
#endif
    {
      for (int x = 0; x < cols; x += 8)
      {
        sum = _mm_add_epi64(sum, getSse8(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
      }
    }
  }

  sum = _mm_add_epi64(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));

  return _mm_cvtsi128_si64(sum);
}

template<X86_VEXT vext, int WIDTH>
Distortion RdCost::xGetSSE_NxN_SIMD(const DistParam &rcDtParam)
{
  if (rcDtParam.applyWeight)
  {
    return RdCostWeightPrediction::xGetSSEw(rcDtParam);
  }

  const Pel *pSrc1 = rcDtParam.org.buf;
  const Pel *pSrc2 = rcDtParam.cur.buf;

  const int rows = rcDtParam.org.height;

  const ptrdiff_t strideSrc1 = rcDtParam.org.stride;
  const ptrdiff_t strideSrc2 = rcDtParam.cur.stride;

  const uint32_t shift = 2 * DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);

  __m128i sum = _mm_setzero_si128();

  if (2 == WIDTH)
  {
    sum = getSse2(pSrc1, strideSrc1, pSrc2, strideSrc2, rows, shift);
  }
  else if (4 == WIDTH)
  {
    sum = getSse4(pSrc1, strideSrc1, pSrc2, strideSrc2, rows, shift);
  }
  else
  {
#ifdef USE_AVX2
    if (vext >= AVX2 && WIDTH >= 16)
    {
      for (int x = 0; x < WIDTH; x += 16)
      {
        sum = _mm_add_epi64(sum, getSse16(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
      }
    }
    else
#endif
    {
      for (int x = 0; x < WIDTH; x += 8)
      {
        sum = _mm_add_epi64(sum, getSse8(pSrc1 + x, strideSrc1, pSrc2 + x, strideSrc2, rows, shift));
      }
    }
  }

  sum = _mm_add_epi64(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2)));

  return _mm_cvtsi128_si64(sum);
}
#endif
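// Illustrative scalar sketch of what the SAD kernels below compute (hypothetical helper, not used
// by the encoder): rows are subsampled by subStep = 1 << subShift, the absolute differences are
// accumulated, the partial sum is scaled back up by subShift, and the bit-depth dependent
// precision adjustment is applied at the end.
static inline Distortion getSadScalarRef(const Pel *org, const ptrdiff_t strideOrg, const Pel *cur,
                                         const ptrdiff_t strideCur, const int width, const int height,
                                         const int subShift, const int bitDepth)
{
  const int  subStep = 1 << subShift;
  Distortion sum     = 0;
  for (int y = 0; y < height; y += subStep)
  {
    for (int x = 0; x < width; x++)
    {
      const int diff = org[y * strideOrg + x] - cur[y * strideCur + x];
      sum += diff < 0 ? -diff : diff;
    }
  }
  sum <<= subShift;
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(bitDepth);
}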
template< X86_VEXT vext >
Distortion RdCost::xGetSAD_SIMD( const DistParam &rcDtParam )
{
  if( rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight )
  {
    return RdCost::xGetSAD( rcDtParam );
  }

  const short* pSrc1 = (const short*)rcDtParam.org.buf;
  const short* pSrc2 = (const short*)rcDtParam.cur.buf;

  int rows     = rcDtParam.org.height;
  int cols     = rcDtParam.org.width;
  int subShift = rcDtParam.subShift;
  int subStep  = (1 << subShift);

  const ptrdiff_t strideSrc1 = rcDtParam.org.stride * subStep;
  const ptrdiff_t strideSrc2 = rcDtParam.cur.stride * subStep;

  uint32_t sum = 0;

  if (vext >= AVX2 && (cols & 15) == 0)
  {
#ifdef USE_AVX2
    // Process widths that are a multiple of 16
    __m256i vzero  = _mm256_setzero_si256();
    __m256i vsum32 = vzero;
    for (int y = 0; y < rows; y += subStep)
    {
      __m256i vsum16 = vzero;
      for (int x = 0; x < cols; x += 16)
      {
        __m256i vsrc1 = _mm256_lddqu_si256((__m256i *) (&pSrc1[x]));
        __m256i vsrc2 = _mm256_lddqu_si256((__m256i *) (&pSrc2[x]));
        vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
      }
      __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) );
      vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
      pSrc1 += strideSrc1;
      pSrc2 += strideSrc2;
    }
    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
    vsum32 = _mm256_hadd_epi32( vsum32, vzero );
    sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32))
          + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
#endif
  }
  else if ((cols & 7) == 0)
  {
    // Process in steps of 8
    __m128i vzero  = _mm_setzero_si128();
    __m128i vsum32 = vzero;
    for (int y = 0; y < rows; y += subStep)
    {
      __m128i vsum16 = vzero;
      for (int x = 0; x < cols; x += 8)
      {
        __m128i vsrc1 = _mm_loadu_si128((const __m128i *) (&pSrc1[x]));
        __m128i vsrc2 = _mm_lddqu_si128((const __m128i *) (&pSrc2[x]));
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }
      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
      pSrc1 += strideSrc1;
      pSrc2 += strideSrc2;
    }
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    sum = _mm_cvtsi128_si32(vsum32);
  }
  else
  {
    // Process in steps of 4
    CHECK((cols & 3) != 0, "Not divisible by 4: " << cols);
    __m128i vzero  = _mm_setzero_si128();
    __m128i vsum32 = vzero;
    for (int y = 0; y < rows; y += subStep)
    {
      __m128i vsum16 = vzero;
      for (int x = 0; x < cols; x += 4)
      {
        __m128i vsrc1 = _mm_loadl_epi64((const __m128i *) &pSrc1[x]);
        __m128i vsrc2 = _mm_loadl_epi64((const __m128i *) &pSrc2[x]);
        vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      }
      __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
      vsum32 = _mm_add_epi32( vsum32, vsumtemp );
      pSrc1 += strideSrc1;
      pSrc2 += strideSrc2;
    }
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    sum = _mm_cvtsi128_si32(vsum32);
  }

  sum <<= subShift;
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
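// The kernels in this file fold their 32-bit partial sums with two _mm_hadd_epi32 steps.
// A minimal sketch of that reduction in isolation (hypothetical helper, not used by the encoder):
// {a, b, c, d} -> {a+b, c+d, 0, 0} -> {a+b+c+d, 0, 0, 0}, after which lane 0 holds the total.
static inline uint32_t horizontalSum32SketchX86(__m128i v)
{
  const __m128i vzero = _mm_setzero_si128();
  v = _mm_hadd_epi32(v, vzero);   // pairwise sums in the two low lanes
  v = _mm_hadd_epi32(v, vzero);   // grand total in lane 0
  return (uint32_t) _mm_cvtsi128_si32(v);
}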
template< X86_VEXT vext >
Distortion RdCost::xGetSAD_IBD_SIMD(const DistParam &rcDtParam)
{
  if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight)
  {
    return RdCost::xGetSAD(rcDtParam);
  }

  const short* src0 = (const short*)rcDtParam.org.buf;
  const short* src1 = (const short*)rcDtParam.cur.buf;

  int width    = rcDtParam.org.height;
  int height   = rcDtParam.org.width;
  int subShift = rcDtParam.subShift;
  int subStep  = (1 << subShift);

  const ptrdiff_t src0Stride = rcDtParam.org.stride * subStep;
  const ptrdiff_t src1Stride = rcDtParam.cur.stride * subStep;

  __m128i vtotalsum32 = _mm_setzero_si128();
  __m128i vzero       = _mm_setzero_si128();

  for (int y = 0; y < height; y += subStep)
  {
    for (int x = 0; x < width; x += 4)
    {
      __m128i vsrc1 = _mm_loadl_epi64((const __m128i*)(src0 + x));
      __m128i vsrc2 = _mm_loadl_epi64((const __m128i*)(src1 + x));
      vsrc1 = _mm_cvtepi16_epi32(vsrc1);
      vsrc2 = _mm_cvtepi16_epi32(vsrc2);
      vtotalsum32 = _mm_add_epi32(vtotalsum32, _mm_abs_epi32(_mm_sub_epi32(vsrc1, vsrc2)));
    }
    src0 += src0Stride;
    src1 += src1Stride;
  }

  vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero);
  vtotalsum32 = _mm_hadd_epi32(vtotalsum32, vzero);

  Distortion sum = _mm_cvtsi128_si32(vtotalsum32);

  sum <<= subShift;
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}

template<X86_VEXT vext, int width>
Distortion RdCost::xGetSAD_NxN_SIMD(const DistParam &rcDtParam)
{
  if( rcDtParam.bitDepth > 10 || rcDtParam.applyWeight )
  {
    return RdCost::xGetSAD( rcDtParam );
  }

  //  assert( rcDtParam.cols == width);
  const short* pSrc1 = (const short*)rcDtParam.org.buf;
  const short* pSrc2 = (const short*)rcDtParam.cur.buf;

  int rows     = rcDtParam.org.height;
  int subShift = rcDtParam.subShift;
  int subStep  = (1 << subShift);

  const ptrdiff_t strideSrc1 = rcDtParam.org.stride * subStep;
  const ptrdiff_t strideSrc2 = rcDtParam.cur.stride * subStep;

  uint32_t sum = 0;

  if (width == 4)
  {
    if (rows == 4 && subShift == 0)
    {
      __m128i vzero = _mm_setzero_si128();
      __m128i vsum  = vzero;
      __m128i vsrc1 = _mm_loadl_epi64( ( const __m128i* )pSrc1 );
      vsrc1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(vsrc1), (__m64 *) &pSrc1[strideSrc1]));
      __m128i vsrc2 = _mm_loadl_epi64( ( const __m128i* )pSrc2 );
      vsrc2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(vsrc2), (__m64 *) &pSrc2[strideSrc2]));
      vsum = _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) );

      vsrc1 = _mm_loadl_epi64((const __m128i *) &pSrc1[2 * strideSrc1]);
      vsrc1 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(vsrc1), (__m64 *) &pSrc1[3 * strideSrc1]));
      vsrc2 = _mm_loadl_epi64((const __m128i *) &pSrc2[2 * strideSrc2]);
      vsrc2 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(vsrc2), (__m64 *) &pSrc2[3 * strideSrc2]));
      vsum = _mm_hadd_epi16( vsum, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
      vsum = _mm_hadd_epi16( vsum, vzero );
      vsum = _mm_hadd_epi16( vsum, vzero );
      vsum = _mm_hadd_epi16( vsum, vzero );
      sum = _mm_cvtsi128_si32(vsum);
    }
    else
    {
      __m128i vzero  = _mm_setzero_si128();
      __m128i vsum32 = vzero;
      for (int y = 0; y < rows; y += subStep)
      {
        __m128i vsum16 = vzero;
        {
          __m128i vsrc1 = _mm_loadl_epi64( ( const __m128i* )pSrc1 );
          __m128i vsrc2 = _mm_loadl_epi64( ( const __m128i* )pSrc2 );
          vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) );
        }
        __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) );
        vsum32 = _mm_add_epi32( vsum32, vsumtemp );
        pSrc1 += strideSrc1;
        pSrc2 += strideSrc2;
      }
      vsum32 = _mm_hadd_epi32( vsum32, vzero );
      vsum32 = _mm_hadd_epi32( vsum32, vzero );
      sum = _mm_cvtsi128_si32(vsum32);
    }
  }
  else
  {
    if (vext >= AVX2 && width >= 16)
    {
#ifdef USE_AVX2
      // Process widths that are a multiple of 16
      __m256i vzero  = _mm256_setzero_si256();
      __m256i vsum32 = vzero;
      for (int y = 0; y < rows; y += subStep)
      {
        __m256i vsum16 = vzero;
        for (int x = 0; x < width; x += 16)
        {
          __m256i vsrc1 = _mm256_lddqu_si256((__m256i *) (&pSrc1[x]));
          __m256i vsrc2 = _mm256_lddqu_si256((__m256i *) (&pSrc2[x]));
          vsum16 = _mm256_add_epi16( vsum16, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) );
        }
        __m256i vsumtemp = _mm256_add_epi32( _mm256_unpacklo_epi16( vsum16, vzero ), _mm256_unpackhi_epi16( vsum16, vzero ) );
        vsum32 = _mm256_add_epi32( vsum32, vsumtemp );
        pSrc1 += strideSrc1;
        pSrc2 += strideSrc2;
      }
      vsum32 = _mm256_hadd_epi32( vsum32, vzero );
      vsum32 = _mm256_hadd_epi32( vsum32, vzero );
      sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32))
            + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11)));
#endif
    }
    else
    {
      // Process widths that are a multiple of 8
      __m128i vzero  = _mm_setzero_si128();
      __m128i vsum32 = vzero;
      for (int y = 0; y < rows; y +=
subStep) { __m128i vsum16 = vzero; for (int x = 0; x < width; x += 8) { __m128i vsrc1 = _mm_loadu_si128((const __m128i *) (&pSrc1[x])); __m128i vsrc2 = _mm_lddqu_si128((const __m128i *) (&pSrc2[x])); vsum16 = _mm_add_epi16( vsum16, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ); } __m128i vsumtemp = _mm_add_epi32( _mm_unpacklo_epi16( vsum16, vzero ), _mm_unpackhi_epi16( vsum16, vzero ) ); vsum32 = _mm_add_epi32( vsum32, vsumtemp ); pSrc1 += strideSrc1; pSrc2 += strideSrc2; } vsum32 = _mm_hadd_epi32( vsum32, vzero ); vsum32 = _mm_hadd_epi32( vsum32, vzero ); sum = _mm_cvtsi128_si32(vsum32); } } sum <<= subShift; return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } #if RExt__HIGH_BIT_DEPTH_SUPPORT static Distortion xCalcHAD2x2_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[2], m2[2]; for (int k = 0; k < 2; k++) { m1[k] = _mm_sub_epi32(_mm_loadl_epi64((const __m128i*)piOrg), _mm_loadl_epi64((const __m128i*)piCur)); piOrg += strideOrg; piCur += strideCur; } // vertical m2[0] = _mm_add_epi32(m1[0], m1[1]); m2[1] = _mm_sub_epi32(m1[0], m1[1]); // transpose m1[0] = _mm_unpacklo_epi32(m2[0], m2[1]); m1[1] = _mm_shuffle_epi32(m1[0], 0xee); // horizontal m2[0] = _mm_abs_epi32(_mm_add_epi32(m1[0], m1[1])); m2[1] = _mm_abs_epi32(_mm_sub_epi32(m1[0], m1[1])); Distortion absDc = _mm_cvtsi128_si32(m2[0]); // abs __m128i Sum = _mm_add_epi32(m2[0], m2[1]); Sum = _mm_hadd_epi32(Sum, Sum); Distortion sad = _mm_cvtsi128_si32(Sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif return sad; } static Distortion xCalcHAD4x4_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i r0 = _mm_lddqu_si128((const __m128i*)&piOrg[0]); __m128i r1 = _mm_lddqu_si128((const __m128i *) &piOrg[strideOrg]); __m128i r2 = _mm_lddqu_si128((const __m128i *) &piOrg[2 * strideOrg]); __m128i r3 = _mm_lddqu_si128((const __m128i *) &piOrg[3 * strideOrg]); __m128i r4 = _mm_lddqu_si128((const __m128i*)&piCur[0]); __m128i r5 = _mm_lddqu_si128((const __m128i *) &piCur[strideCur]); __m128i r6 = _mm_lddqu_si128((const __m128i *) &piCur[2 * strideCur]); __m128i r7 = _mm_lddqu_si128((const __m128i *) &piCur[3 * strideCur]); r0 = _mm_sub_epi32(r0, r4); r1 = _mm_sub_epi32(r1, r5); r2 = _mm_sub_epi32(r2, r6); r3 = _mm_sub_epi32(r3, r7); // first stage r4 = r0; r5 = r1; r0 = _mm_add_epi32(r0, r3); r1 = _mm_add_epi32(r1, r2); r4 = _mm_sub_epi32(r4, r3); r5 = _mm_sub_epi32(r5, r2); r2 = r0; r3 = r4; r0 = _mm_add_epi32(r0, r1); r2 = _mm_sub_epi32(r2, r1); r3 = _mm_sub_epi32(r3, r5); r5 = _mm_add_epi32(r5, r4); // shuffle - flip matrix for vertical transform r4 = _mm_unpacklo_epi32(r0, r5); r5 = _mm_unpackhi_epi32(r0, r5); r6 = _mm_unpacklo_epi32(r2, r3); r7 = _mm_unpackhi_epi32(r2, r3); r0 = _mm_unpacklo_epi64(r4, r6); r1 = _mm_unpackhi_epi64(r4, r6); r2 = _mm_unpacklo_epi64(r5, r7); r3 = _mm_unpackhi_epi64(r5, r7); // second stage r4 = r0; r5 = r1; r0 = _mm_add_epi32(r0, r3); r1 = _mm_add_epi32(r1, r2); r4 = _mm_sub_epi32(r4, r3); r5 = _mm_sub_epi32(r5, r2); r2 = r0; r3 = r4; r0 = _mm_add_epi32(r0, r1); r2 = _mm_sub_epi32(r2, r1); r3 = _mm_sub_epi32(r3, r5); r5 = _mm_add_epi32(r5, r4); // abs __m128i Sum = _mm_abs_epi32(r0); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(Sum); #endif Sum = _mm_add_epi32(Sum, _mm_abs_epi32(r2)); Sum = _mm_add_epi32(Sum, _mm_abs_epi32(r3)); Sum = _mm_add_epi32(Sum, _mm_abs_epi32(r5)); Sum = _mm_hadd_epi32(Sum, Sum); Sum = 
_mm_hadd_epi32(Sum, Sum); Distortion sad = _mm_cvtsi128_si32(Sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = ((sad + 1) >> 1); return sad; } static Distortion xCalcHAD8x8_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[8][2], m2[8][2]; for (int k = 0; k < 8; k++) { m2[k][0] = _mm_sub_epi32(_mm_lddqu_si128((__m128i *) piOrg), _mm_lddqu_si128((__m128i *) piCur)); m2[k][1] = _mm_sub_epi32(_mm_lddqu_si128((__m128i *)(piOrg + 4)), _mm_lddqu_si128((__m128i *)(piCur + 4))); piCur += strideCur; piOrg += strideOrg; } for (int i = 0; i < 2; i++) { // vertical m1[0][i] = _mm_add_epi32(m2[0][i], m2[4][i]); m1[1][i] = _mm_add_epi32(m2[1][i], m2[5][i]); m1[2][i] = _mm_add_epi32(m2[2][i], m2[6][i]); m1[3][i] = _mm_add_epi32(m2[3][i], m2[7][i]); m1[4][i] = _mm_sub_epi32(m2[0][i], m2[4][i]); m1[5][i] = _mm_sub_epi32(m2[1][i], m2[5][i]); m1[6][i] = _mm_sub_epi32(m2[2][i], m2[6][i]); m1[7][i] = _mm_sub_epi32(m2[3][i], m2[7][i]); m2[0][i] = _mm_add_epi32(m1[0][i], m1[2][i]); m2[1][i] = _mm_add_epi32(m1[1][i], m1[3][i]); m2[2][i] = _mm_sub_epi32(m1[0][i], m1[2][i]); m2[3][i] = _mm_sub_epi32(m1[1][i], m1[3][i]); m2[4][i] = _mm_add_epi32(m1[4][i], m1[6][i]); m2[5][i] = _mm_add_epi32(m1[5][i], m1[7][i]); m2[6][i] = _mm_sub_epi32(m1[4][i], m1[6][i]); m2[7][i] = _mm_sub_epi32(m1[5][i], m1[7][i]); m1[0][i] = _mm_add_epi32(m2[0][i], m2[1][i]); m1[1][i] = _mm_sub_epi32(m2[0][i], m2[1][i]); m1[2][i] = _mm_add_epi32(m2[2][i], m2[3][i]); m1[3][i] = _mm_sub_epi32(m2[2][i], m2[3][i]); m1[4][i] = _mm_add_epi32(m2[4][i], m2[5][i]); m1[5][i] = _mm_sub_epi32(m2[4][i], m2[5][i]); m1[6][i] = _mm_add_epi32(m2[6][i], m2[7][i]); m1[7][i] = _mm_sub_epi32(m2[6][i], m2[7][i]); // transpose m2[0][i] = _mm_unpacklo_epi32(m1[0][i], m1[1][i]); m2[1][i] = _mm_unpacklo_epi32(m1[2][i], m1[3][i]); m2[2][i] = _mm_unpackhi_epi32(m1[0][i], m1[1][i]); m2[3][i] = _mm_unpackhi_epi32(m1[2][i], m1[3][i]); m2[4][i] = _mm_unpacklo_epi32(m1[4][i], m1[5][i]); m2[5][i] = _mm_unpacklo_epi32(m1[6][i], m1[7][i]); m2[6][i] = _mm_unpackhi_epi32(m1[4][i], m1[5][i]); m2[7][i] = _mm_unpackhi_epi32(m1[6][i], m1[7][i]); m1[0][i] = _mm_unpacklo_epi64(m2[0][i], m2[1][i]); m1[1][i] = _mm_unpackhi_epi64(m2[0][i], m2[1][i]); m1[2][i] = _mm_unpacklo_epi64(m2[2][i], m2[3][i]); m1[3][i] = _mm_unpackhi_epi64(m2[2][i], m2[3][i]); m1[4][i] = _mm_unpacklo_epi64(m2[4][i], m2[5][i]); m1[5][i] = _mm_unpackhi_epi64(m2[4][i], m2[5][i]); m1[6][i] = _mm_unpacklo_epi64(m2[6][i], m2[7][i]); m1[7][i] = _mm_unpackhi_epi64(m2[6][i], m2[7][i]); } // transpose __m128i n1[8][2]; __m128i n2[8][2]; for (int i = 0; i < 8; i++) { int ii = i % 4; int ij = i >> 2; n2[i][0] = m1[ii][ij]; n2[i][1] = m1[ii + 4][ij]; } for (int i = 0; i < 2; i++) { // horizontal n1[0][i] = _mm_add_epi32(n2[0][i], n2[4][i]); n1[1][i] = _mm_add_epi32(n2[1][i], n2[5][i]); n1[2][i] = _mm_add_epi32(n2[2][i], n2[6][i]); n1[3][i] = _mm_add_epi32(n2[3][i], n2[7][i]); n1[4][i] = _mm_sub_epi32(n2[0][i], n2[4][i]); n1[5][i] = _mm_sub_epi32(n2[1][i], n2[5][i]); n1[6][i] = _mm_sub_epi32(n2[2][i], n2[6][i]); n1[7][i] = _mm_sub_epi32(n2[3][i], n2[7][i]); n2[0][i] = _mm_add_epi32(n1[0][i], n1[2][i]); n2[1][i] = _mm_add_epi32(n1[1][i], n1[3][i]); n2[2][i] = _mm_sub_epi32(n1[0][i], n1[2][i]); n2[3][i] = _mm_sub_epi32(n1[1][i], n1[3][i]); n2[4][i] = _mm_add_epi32(n1[4][i], n1[6][i]); n2[5][i] = _mm_add_epi32(n1[5][i], n1[7][i]); n2[6][i] = _mm_sub_epi32(n1[4][i], n1[6][i]); n2[7][i] = _mm_sub_epi32(n1[5][i], n1[7][i]); n1[0][i] = 
_mm_abs_epi32(_mm_add_epi32(n2[0][i], n2[1][i])); n1[1][i] = _mm_abs_epi32(_mm_sub_epi32(n2[0][i], n2[1][i])); n1[2][i] = _mm_abs_epi32(_mm_add_epi32(n2[2][i], n2[3][i])); n1[3][i] = _mm_abs_epi32(_mm_sub_epi32(n2[2][i], n2[3][i])); n1[4][i] = _mm_abs_epi32(_mm_add_epi32(n2[4][i], n2[5][i])); n1[5][i] = _mm_abs_epi32(_mm_sub_epi32(n2[4][i], n2[5][i])); n1[6][i] = _mm_abs_epi32(_mm_add_epi32(n2[6][i], n2[7][i])); n1[7][i] = _mm_abs_epi32(_mm_sub_epi32(n2[6][i], n2[7][i])); } for (int i = 0; i < 8; i++) { m1[i][0] = _mm_add_epi32(n1[i][0], n1[i][1]); } m1[0][0] = _mm_add_epi32(m1[0][0], m1[1][0]); m1[2][0] = _mm_add_epi32(m1[2][0], m1[3][0]); m1[4][0] = _mm_add_epi32(m1[4][0], m1[5][0]); m1[6][0] = _mm_add_epi32(m1[6][0], m1[7][0]); m1[0][0] = _mm_add_epi32(m1[0][0], m1[2][0]); m1[4][0] = _mm_add_epi32(m1[4][0], m1[6][0]); __m128i sum = _mm_add_epi32(m1[0][0], m1[4][0]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(n1[0][0]); sad -= absDc; sad += absDc >> 2; #endif sad = ((sad + 2) >> 2); return sad; } static Distortion xCalcHAD4x8_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[8], m2[8]; for (int k = 0; k < 8; k++) { m2[k] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*)piOrg), _mm_lddqu_si128((__m128i*)piCur)); piCur += strideCur; piOrg += strideOrg; } // vertical m1[0] = _mm_add_epi32(m2[0], m2[4]); m1[1] = _mm_add_epi32(m2[1], m2[5]); m1[2] = _mm_add_epi32(m2[2], m2[6]); m1[3] = _mm_add_epi32(m2[3], m2[7]); m1[4] = _mm_sub_epi32(m2[0], m2[4]); m1[5] = _mm_sub_epi32(m2[1], m2[5]); m1[6] = _mm_sub_epi32(m2[2], m2[6]); m1[7] = _mm_sub_epi32(m2[3], m2[7]); m2[0] = _mm_add_epi32(m1[0], m1[2]); m2[1] = _mm_add_epi32(m1[1], m1[3]); m2[2] = _mm_sub_epi32(m1[0], m1[2]); m2[3] = _mm_sub_epi32(m1[1], m1[3]); m2[4] = _mm_add_epi32(m1[4], m1[6]); m2[5] = _mm_add_epi32(m1[5], m1[7]); m2[6] = _mm_sub_epi32(m1[4], m1[6]); m2[7] = _mm_sub_epi32(m1[5], m1[7]); m1[0] = _mm_add_epi32(m2[0], m2[1]); m1[1] = _mm_sub_epi32(m2[0], m2[1]); m1[2] = _mm_add_epi32(m2[2], m2[3]); m1[3] = _mm_sub_epi32(m2[2], m2[3]); m1[4] = _mm_add_epi32(m2[4], m2[5]); m1[5] = _mm_sub_epi32(m2[4], m2[5]); m1[6] = _mm_add_epi32(m2[6], m2[7]); m1[7] = _mm_sub_epi32(m2[6], m2[7]); // transpose __m128i n1[4][2], n2[4][2]; n2[0][0] = _mm_unpacklo_epi32(m1[0], m1[1]); n2[0][1] = _mm_unpackhi_epi32(m1[0], m1[1]); n2[1][0] = _mm_unpacklo_epi32(m1[2], m1[3]); n2[1][1] = _mm_unpackhi_epi32(m1[2], m1[3]); n2[2][0] = _mm_unpacklo_epi32(m1[4], m1[5]); n2[2][1] = _mm_unpackhi_epi32(m1[4], m1[5]); n2[3][0] = _mm_unpacklo_epi32(m1[6], m1[7]); n2[3][1] = _mm_unpackhi_epi32(m1[6], m1[7]); n1[0][0] = _mm_unpacklo_epi64(n2[0][0], n2[1][0]); n1[0][1] = _mm_unpacklo_epi64(n2[2][0], n2[3][0]); n1[1][0] = _mm_unpackhi_epi64(n2[0][0], n2[1][0]); n1[1][1] = _mm_unpackhi_epi64(n2[2][0], n2[3][0]); n1[2][0] = _mm_unpacklo_epi64(n2[0][1], n2[1][1]); n1[2][1] = _mm_unpacklo_epi64(n2[2][1], n2[3][1]); n1[3][0] = _mm_unpackhi_epi64(n2[0][1], n2[1][1]); n1[3][1] = _mm_unpackhi_epi64(n2[2][1], n2[3][1]); // horizontal for (int i = 0; i < 2; i++) { n2[0][i] = _mm_add_epi32(n1[0][i], n1[2][i]); n2[1][i] = _mm_add_epi32(n1[1][i], n1[3][i]); n2[2][i] = _mm_sub_epi32(n1[0][i], n1[2][i]); n2[3][i] = _mm_sub_epi32(n1[1][i], n1[3][i]); n1[0][i] = _mm_abs_epi32(_mm_add_epi32(n2[0][i], n2[1][i])); n1[1][i] = _mm_abs_epi32(_mm_sub_epi32(n2[0][i], n2[1][i])); n1[2][i] = 
_mm_abs_epi32(_mm_add_epi32(n2[2][i], n2[3][i])); n1[3][i] = _mm_abs_epi32(_mm_sub_epi32(n2[2][i], n2[3][i])); } for (int i = 0; i < 4; i++) { m1[i] = _mm_add_epi32(n1[i][0], n1[i][1]); } Distortion absDc = _mm_cvtsi128_si32(n1[0][0]); m1[0] = _mm_add_epi32(m1[0], m1[1]); m1[2] = _mm_add_epi32(m1[2], m1[3]); __m128i sum = _mm_add_epi32(m1[0], m1[2]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(4.0 * 8) * 2); return sad; } static Distortion xCalcHAD8x4_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[8][2], m2[8][2]; for (int k = 0; k < 4; k++) { m1[k][0] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*) piOrg), _mm_lddqu_si128((__m128i*) piCur)); m1[k][1] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*)(piOrg + 4)), _mm_lddqu_si128((__m128i*)(piCur + 4))); piCur += strideCur; piOrg += strideOrg; } // vertical for (int i = 0; i < 2; i++) { m2[0][i] = _mm_add_epi32(m1[0][i], m1[2][i]); m2[1][i] = _mm_add_epi32(m1[1][i], m1[3][i]); m2[2][i] = _mm_sub_epi32(m1[0][i], m1[2][i]); m2[3][i] = _mm_sub_epi32(m1[1][i], m1[3][i]); m1[0][i] = _mm_add_epi32(m2[0][i], m2[1][i]); m1[1][i] = _mm_sub_epi32(m2[0][i], m2[1][i]); m1[2][i] = _mm_add_epi32(m2[2][i], m2[3][i]); m1[3][i] = _mm_sub_epi32(m2[2][i], m2[3][i]); } // transpose m2[0][0] = _mm_unpacklo_epi32(m1[0][0], m1[1][0]); m2[0][1] = _mm_unpacklo_epi32(m1[0][1], m1[1][1]); m2[1][0] = _mm_unpacklo_epi32(m1[2][0], m1[3][0]); m2[1][1] = _mm_unpacklo_epi32(m1[2][1], m1[3][1]); m2[2][0] = _mm_unpackhi_epi32(m1[0][0], m1[1][0]); m2[2][1] = _mm_unpackhi_epi32(m1[0][1], m1[1][1]); m2[3][0] = _mm_unpackhi_epi32(m1[2][0], m1[3][0]); m2[3][1] = _mm_unpackhi_epi32(m1[2][1], m1[3][1]); __m128i n1[8], n2[8]; n2[0] = _mm_unpacklo_epi64(m2[0][0], m2[1][0]); n2[1] = _mm_unpackhi_epi64(m2[0][0], m2[1][0]); n2[2] = _mm_unpacklo_epi64(m2[2][0], m2[3][0]); n2[3] = _mm_unpackhi_epi64(m2[2][0], m2[3][0]); n2[4] = _mm_unpacklo_epi64(m2[0][1], m2[1][1]); n2[5] = _mm_unpackhi_epi64(m2[0][1], m2[1][1]); n2[6] = _mm_unpacklo_epi64(m2[2][1], m2[3][1]); n2[7] = _mm_unpackhi_epi64(m2[2][1], m2[3][1]); // horizontal n1[0] = _mm_add_epi32(n2[0], n2[4]); n1[1] = _mm_add_epi32(n2[1], n2[5]); n1[2] = _mm_add_epi32(n2[2], n2[6]); n1[3] = _mm_add_epi32(n2[3], n2[7]); n1[4] = _mm_sub_epi32(n2[0], n2[4]); n1[5] = _mm_sub_epi32(n2[1], n2[5]); n1[6] = _mm_sub_epi32(n2[2], n2[6]); n1[7] = _mm_sub_epi32(n2[3], n2[7]); n2[0] = _mm_add_epi32(n1[0], n1[2]); n2[1] = _mm_add_epi32(n1[1], n1[3]); n2[2] = _mm_sub_epi32(n1[0], n1[2]); n2[3] = _mm_sub_epi32(n1[1], n1[3]); n2[4] = _mm_add_epi32(n1[4], n1[6]); n2[5] = _mm_add_epi32(n1[5], n1[7]); n2[6] = _mm_sub_epi32(n1[4], n1[6]); n2[7] = _mm_sub_epi32(n1[5], n1[7]); n1[0] = _mm_abs_epi32(_mm_add_epi32(n2[0], n2[1])); n1[1] = _mm_abs_epi32(_mm_sub_epi32(n2[0], n2[1])); n1[2] = _mm_abs_epi32(_mm_add_epi32(n2[2], n2[3])); n1[3] = _mm_abs_epi32(_mm_sub_epi32(n2[2], n2[3])); n1[4] = _mm_abs_epi32(_mm_add_epi32(n2[4], n2[5])); n1[5] = _mm_abs_epi32(_mm_sub_epi32(n2[4], n2[5])); n1[6] = _mm_abs_epi32(_mm_add_epi32(n2[6], n2[7])); n1[7] = _mm_abs_epi32(_mm_sub_epi32(n2[6], n2[7])); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(n1[0]); #endif n1[0] = _mm_add_epi32(n1[0], n1[1]); n1[1] = _mm_add_epi32(n1[2], n1[3]); n1[2] = _mm_add_epi32(n1[4], n1[5]); n1[3] = _mm_add_epi32(n1[6], n1[7]); n1[0] = _mm_add_epi32(n1[0], n1[1]); 
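// (The eight absolute-value vectors are folded pairwise into a single accumulator here; the final
//  SATD of this non-square 8x4 block is then scaled by 2 / sqrt(4 * 8) below, which keeps it on a
//  scale comparable to the square-block Hadamard costs.)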
n1[1] = _mm_add_epi32(n1[2], n1[3]); __m128i sum = _mm_add_epi32(n1[0], n1[1]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(4.0 * 8) * 2); return sad; } static Distortion xCalcHAD16x8_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[16][2][2], m2[16][2][2]; __m128i sum = _mm_setzero_si128(); for (int l = 0; l < 2; l++) { const Torg *piOrgPtr = piOrg + l * 8; const Tcur *piCurPtr = piCur + l * 8; for (int k = 0; k < 8; k++) { m2[k][l][0] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*) piOrgPtr), _mm_lddqu_si128((__m128i*) piCurPtr)); m2[k][l][1] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*) (piOrgPtr + 4)), _mm_lddqu_si128((__m128i*) (piCurPtr + 4))); piCurPtr += strideCur; piOrgPtr += strideOrg; } for (int i = 0; i < 2; i++) { //vertical m1[0][l][i] = _mm_add_epi32(m2[0][l][i], m2[4][l][i]); m1[1][l][i] = _mm_add_epi32(m2[1][l][i], m2[5][l][i]); m1[2][l][i] = _mm_add_epi32(m2[2][l][i], m2[6][l][i]); m1[3][l][i] = _mm_add_epi32(m2[3][l][i], m2[7][l][i]); m1[4][l][i] = _mm_sub_epi32(m2[0][l][i], m2[4][l][i]); m1[5][l][i] = _mm_sub_epi32(m2[1][l][i], m2[5][l][i]); m1[6][l][i] = _mm_sub_epi32(m2[2][l][i], m2[6][l][i]); m1[7][l][i] = _mm_sub_epi32(m2[3][l][i], m2[7][l][i]); m2[0][l][i] = _mm_add_epi32(m1[0][l][i], m1[2][l][i]); m2[1][l][i] = _mm_add_epi32(m1[1][l][i], m1[3][l][i]); m2[2][l][i] = _mm_sub_epi32(m1[0][l][i], m1[2][l][i]); m2[3][l][i] = _mm_sub_epi32(m1[1][l][i], m1[3][l][i]); m2[4][l][i] = _mm_add_epi32(m1[4][l][i], m1[6][l][i]); m2[5][l][i] = _mm_add_epi32(m1[5][l][i], m1[7][l][i]); m2[6][l][i] = _mm_sub_epi32(m1[4][l][i], m1[6][l][i]); m2[7][l][i] = _mm_sub_epi32(m1[5][l][i], m1[7][l][i]); m1[0][l][i] = _mm_add_epi32(m2[0][l][i], m2[1][l][i]); m1[1][l][i] = _mm_sub_epi32(m2[0][l][i], m2[1][l][i]); m1[2][l][i] = _mm_add_epi32(m2[2][l][i], m2[3][l][i]); m1[3][l][i] = _mm_sub_epi32(m2[2][l][i], m2[3][l][i]); m1[4][l][i] = _mm_add_epi32(m2[4][l][i], m2[5][l][i]); m1[5][l][i] = _mm_sub_epi32(m2[4][l][i], m2[5][l][i]); m1[6][l][i] = _mm_add_epi32(m2[6][l][i], m2[7][l][i]); m1[7][l][i] = _mm_sub_epi32(m2[6][l][i], m2[7][l][i]); } } // 4 x 8x4 blocks // 0 1 // 2 3 #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = 0; #endif // transpose and do horizontal in two steps for (int l = 0; l < 2; l++) { int off = l * 4; __m128i n1[16]; __m128i n2[16]; m2[0][0][0] = _mm_unpacklo_epi32(m1[0 + off][0][0], m1[1 + off][0][0]); m2[1][0][0] = _mm_unpacklo_epi32(m1[2 + off][0][0], m1[3 + off][0][0]); m2[2][0][0] = _mm_unpackhi_epi32(m1[0 + off][0][0], m1[1 + off][0][0]); m2[3][0][0] = _mm_unpackhi_epi32(m1[2 + off][0][0], m1[3 + off][0][0]); m2[0][0][1] = _mm_unpacklo_epi32(m1[0 + off][0][1], m1[1 + off][0][1]); m2[1][0][1] = _mm_unpacklo_epi32(m1[2 + off][0][1], m1[3 + off][0][1]); m2[2][0][1] = _mm_unpackhi_epi32(m1[0 + off][0][1], m1[1 + off][0][1]); m2[3][0][1] = _mm_unpackhi_epi32(m1[2 + off][0][1], m1[3 + off][0][1]); n1[0] = _mm_unpacklo_epi64(m2[0][0][0], m2[1][0][0]); n1[1] = _mm_unpackhi_epi64(m2[0][0][0], m2[1][0][0]); n1[2] = _mm_unpacklo_epi64(m2[2][0][0], m2[3][0][0]); n1[3] = _mm_unpackhi_epi64(m2[2][0][0], m2[3][0][0]); n1[4] = _mm_unpacklo_epi64(m2[0][0][1], m2[1][0][1]); n1[5] = _mm_unpackhi_epi64(m2[0][0][1], m2[1][0][1]); n1[6] = _mm_unpacklo_epi64(m2[2][0][1], m2[3][0][1]); n1[7] = _mm_unpackhi_epi64(m2[2][0][1], m2[3][0][1]); // transpose 8x4 -> 4x8, block 1(3) m2[8 
+ 0][0][0] = _mm_unpacklo_epi32(m1[0 + off][1][0], m1[1 + off][1][0]); m2[8 + 1][0][0] = _mm_unpacklo_epi32(m1[2 + off][1][0], m1[3 + off][1][0]); m2[8 + 2][0][0] = _mm_unpackhi_epi32(m1[0 + off][1][0], m1[1 + off][1][0]); m2[8 + 3][0][0] = _mm_unpackhi_epi32(m1[2 + off][1][0], m1[3 + off][1][0]); m2[8 + 0][0][1] = _mm_unpacklo_epi32(m1[0 + off][1][1], m1[1 + off][1][1]); m2[8 + 1][0][1] = _mm_unpacklo_epi32(m1[2 + off][1][1], m1[3 + off][1][1]); m2[8 + 2][0][1] = _mm_unpackhi_epi32(m1[0 + off][1][1], m1[1 + off][1][1]); m2[8 + 3][0][1] = _mm_unpackhi_epi32(m1[2 + off][1][1], m1[3 + off][1][1]); n1[8 + 0] = _mm_unpacklo_epi64(m2[8 + 0][0][0], m2[8 + 1][0][0]); n1[8 + 1] = _mm_unpackhi_epi64(m2[8 + 0][0][0], m2[8 + 1][0][0]); n1[8 + 2] = _mm_unpacklo_epi64(m2[8 + 2][0][0], m2[8 + 3][0][0]); n1[8 + 3] = _mm_unpackhi_epi64(m2[8 + 2][0][0], m2[8 + 3][0][0]); n1[8 + 4] = _mm_unpacklo_epi64(m2[8 + 0][0][1], m2[8 + 1][0][1]); n1[8 + 5] = _mm_unpackhi_epi64(m2[8 + 0][0][1], m2[8 + 1][0][1]); n1[8 + 6] = _mm_unpacklo_epi64(m2[8 + 2][0][1], m2[8 + 3][0][1]); n1[8 + 7] = _mm_unpackhi_epi64(m2[8 + 2][0][1], m2[8 + 3][0][1]); n2[0] = _mm_add_epi32(n1[0], n1[8]); n2[1] = _mm_add_epi32(n1[1], n1[9]); n2[2] = _mm_add_epi32(n1[2], n1[10]); n2[3] = _mm_add_epi32(n1[3], n1[11]); n2[4] = _mm_add_epi32(n1[4], n1[12]); n2[5] = _mm_add_epi32(n1[5], n1[13]); n2[6] = _mm_add_epi32(n1[6], n1[14]); n2[7] = _mm_add_epi32(n1[7], n1[15]); n2[8] = _mm_sub_epi32(n1[0], n1[8]); n2[9] = _mm_sub_epi32(n1[1], n1[9]); n2[10] = _mm_sub_epi32(n1[2], n1[10]); n2[11] = _mm_sub_epi32(n1[3], n1[11]); n2[12] = _mm_sub_epi32(n1[4], n1[12]); n2[13] = _mm_sub_epi32(n1[5], n1[13]); n2[14] = _mm_sub_epi32(n1[6], n1[14]); n2[15] = _mm_sub_epi32(n1[7], n1[15]); n1[0] = _mm_add_epi32(n2[0], n2[4]); n1[1] = _mm_add_epi32(n2[1], n2[5]); n1[2] = _mm_add_epi32(n2[2], n2[6]); n1[3] = _mm_add_epi32(n2[3], n2[7]); n1[4] = _mm_sub_epi32(n2[0], n2[4]); n1[5] = _mm_sub_epi32(n2[1], n2[5]); n1[6] = _mm_sub_epi32(n2[2], n2[6]); n1[7] = _mm_sub_epi32(n2[3], n2[7]); n1[8] = _mm_add_epi32(n2[8], n2[12]); n1[9] = _mm_add_epi32(n2[9], n2[13]); n1[10] = _mm_add_epi32(n2[10], n2[14]); n1[11] = _mm_add_epi32(n2[11], n2[15]); n1[12] = _mm_sub_epi32(n2[8], n2[12]); n1[13] = _mm_sub_epi32(n2[9], n2[13]); n1[14] = _mm_sub_epi32(n2[10], n2[14]); n1[15] = _mm_sub_epi32(n2[11], n2[15]); n2[0] = _mm_add_epi32(n1[0], n1[2]); n2[1] = _mm_add_epi32(n1[1], n1[3]); n2[2] = _mm_sub_epi32(n1[0], n1[2]); n2[3] = _mm_sub_epi32(n1[1], n1[3]); n2[4] = _mm_add_epi32(n1[4], n1[6]); n2[5] = _mm_add_epi32(n1[5], n1[7]); n2[6] = _mm_sub_epi32(n1[4], n1[6]); n2[7] = _mm_sub_epi32(n1[5], n1[7]); n2[8] = _mm_add_epi32(n1[8], n1[10]); n2[9] = _mm_add_epi32(n1[9], n1[11]); n2[10] = _mm_sub_epi32(n1[8], n1[10]); n2[11] = _mm_sub_epi32(n1[9], n1[11]); n2[12] = _mm_add_epi32(n1[12], n1[14]); n2[13] = _mm_add_epi32(n1[13], n1[15]); n2[14] = _mm_sub_epi32(n1[12], n1[14]); n2[15] = _mm_sub_epi32(n1[13], n1[15]); n1[0] = _mm_abs_epi32(_mm_add_epi32(n2[0], n2[1])); n1[1] = _mm_abs_epi32(_mm_sub_epi32(n2[0], n2[1])); n1[2] = _mm_abs_epi32(_mm_add_epi32(n2[2], n2[3])); n1[3] = _mm_abs_epi32(_mm_sub_epi32(n2[2], n2[3])); n1[4] = _mm_abs_epi32(_mm_add_epi32(n2[4], n2[5])); n1[5] = _mm_abs_epi32(_mm_sub_epi32(n2[4], n2[5])); n1[6] = _mm_abs_epi32(_mm_add_epi32(n2[6], n2[7])); n1[7] = _mm_abs_epi32(_mm_sub_epi32(n2[6], n2[7])); n1[8] = _mm_abs_epi32(_mm_add_epi32(n2[8], n2[9])); n1[9] = _mm_abs_epi32(_mm_sub_epi32(n2[8], n2[9])); n1[10] = _mm_abs_epi32(_mm_add_epi32(n2[10], n2[11])); n1[11] = 
_mm_abs_epi32(_mm_sub_epi32(n2[10], n2[11])); n1[12] = _mm_abs_epi32(_mm_add_epi32(n2[12], n2[13])); n1[13] = _mm_abs_epi32(_mm_sub_epi32(n2[12], n2[13])); n1[14] = _mm_abs_epi32(_mm_add_epi32(n2[14], n2[15])); n1[15] = _mm_abs_epi32(_mm_sub_epi32(n2[14], n2[15])); #if JVET_R0164_MEAN_SCALED_SATD if (l == 0) { absDc = _mm_cvtsi128_si32(n1[0]); } #endif // sum up n1[0] = _mm_add_epi32(n1[0], n1[1]); n1[2] = _mm_add_epi32(n1[2], n1[3]); n1[4] = _mm_add_epi32(n1[4], n1[5]); n1[6] = _mm_add_epi32(n1[6], n1[7]); n1[8] = _mm_add_epi32(n1[8], n1[9]); n1[10] = _mm_add_epi32(n1[10], n1[11]); n1[12] = _mm_add_epi32(n1[12], n1[13]); n1[14] = _mm_add_epi32(n1[14], n1[15]); n1[0] = _mm_add_epi32(n1[0], n1[2]); n1[4] = _mm_add_epi32(n1[4], n1[6]); n1[8] = _mm_add_epi32(n1[8], n1[10]); n1[12] = _mm_add_epi32(n1[12], n1[14]); n1[0] = _mm_add_epi32(n1[0], n1[4]); n1[8] = _mm_add_epi32(n1[8], n1[12]); n1[0] = _mm_add_epi32(n1[0], n1[8]); sum = _mm_add_epi32(sum, n1[0]); } sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(16.0 * 8) * 2); return sad; } static Distortion xCalcHAD8x16_HBD_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m128i m1[2][16], m2[2][16]; __m128i sum = _mm_setzero_si128(); for (int k = 0; k < 16; k++) { m1[0][k] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*) piOrg), _mm_lddqu_si128((__m128i*) piCur)); m1[1][k] = _mm_sub_epi32(_mm_lddqu_si128((__m128i*)(piOrg + 4)), _mm_lddqu_si128((__m128i*)(piCur + 4))); piCur += strideCur; piOrg += strideOrg; } for (int i = 0; i < 2; i++) { // vertical m2[i][0] = _mm_add_epi32(m1[i][0], m1[i][8]); m2[i][1] = _mm_add_epi32(m1[i][1], m1[i][9]); m2[i][2] = _mm_add_epi32(m1[i][2], m1[i][10]); m2[i][3] = _mm_add_epi32(m1[i][3], m1[i][11]); m2[i][4] = _mm_add_epi32(m1[i][4], m1[i][12]); m2[i][5] = _mm_add_epi32(m1[i][5], m1[i][13]); m2[i][6] = _mm_add_epi32(m1[i][6], m1[i][14]); m2[i][7] = _mm_add_epi32(m1[i][7], m1[i][15]); m2[i][8] = _mm_sub_epi32(m1[i][0], m1[i][8]); m2[i][9] = _mm_sub_epi32(m1[i][1], m1[i][9]); m2[i][10] = _mm_sub_epi32(m1[i][2], m1[i][10]); m2[i][11] = _mm_sub_epi32(m1[i][3], m1[i][11]); m2[i][12] = _mm_sub_epi32(m1[i][4], m1[i][12]); m2[i][13] = _mm_sub_epi32(m1[i][5], m1[i][13]); m2[i][14] = _mm_sub_epi32(m1[i][6], m1[i][14]); m2[i][15] = _mm_sub_epi32(m1[i][7], m1[i][15]); m1[i][0] = _mm_add_epi32(m2[i][0], m2[i][4]); m1[i][1] = _mm_add_epi32(m2[i][1], m2[i][5]); m1[i][2] = _mm_add_epi32(m2[i][2], m2[i][6]); m1[i][3] = _mm_add_epi32(m2[i][3], m2[i][7]); m1[i][4] = _mm_sub_epi32(m2[i][0], m2[i][4]); m1[i][5] = _mm_sub_epi32(m2[i][1], m2[i][5]); m1[i][6] = _mm_sub_epi32(m2[i][2], m2[i][6]); m1[i][7] = _mm_sub_epi32(m2[i][3], m2[i][7]); m1[i][8] = _mm_add_epi32(m2[i][8], m2[i][12]); m1[i][9] = _mm_add_epi32(m2[i][9], m2[i][13]); m1[i][10] = _mm_add_epi32(m2[i][10], m2[i][14]); m1[i][11] = _mm_add_epi32(m2[i][11], m2[i][15]); m1[i][12] = _mm_sub_epi32(m2[i][8], m2[i][12]); m1[i][13] = _mm_sub_epi32(m2[i][9], m2[i][13]); m1[i][14] = _mm_sub_epi32(m2[i][10], m2[i][14]); m1[i][15] = _mm_sub_epi32(m2[i][11], m2[i][15]); m2[i][0] = _mm_add_epi32(m1[i][0], m1[i][2]); m2[i][1] = _mm_add_epi32(m1[i][1], m1[i][3]); m2[i][2] = _mm_sub_epi32(m1[i][0], m1[i][2]); m2[i][3] = _mm_sub_epi32(m1[i][1], m1[i][3]); m2[i][4] = _mm_add_epi32(m1[i][4], m1[i][6]); m2[i][5] = _mm_add_epi32(m1[i][5], m1[i][7]); m2[i][6] = _mm_sub_epi32(m1[i][4], m1[i][6]); 
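// (Still inside the vertical 16-point Hadamard butterfly of xCalcHAD8x16_HBD_SSE: successive
//  stages combine rows at distances 8, 4, 2 and finally 1, separately for each 4-sample half of
//  the 8-wide block.)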
m2[i][7] = _mm_sub_epi32(m1[i][5], m1[i][7]); m2[i][8] = _mm_add_epi32(m1[i][8], m1[i][10]); m2[i][9] = _mm_add_epi32(m1[i][9], m1[i][11]); m2[i][10] = _mm_sub_epi32(m1[i][8], m1[i][10]); m2[i][11] = _mm_sub_epi32(m1[i][9], m1[i][11]); m2[i][12] = _mm_add_epi32(m1[i][12], m1[i][14]); m2[i][13] = _mm_add_epi32(m1[i][13], m1[i][15]); m2[i][14] = _mm_sub_epi32(m1[i][12], m1[i][14]); m2[i][15] = _mm_sub_epi32(m1[i][13], m1[i][15]); m1[i][0] = _mm_add_epi32(m2[i][0], m2[i][1]); m1[i][1] = _mm_sub_epi32(m2[i][0], m2[i][1]); m1[i][2] = _mm_add_epi32(m2[i][2], m2[i][3]); m1[i][3] = _mm_sub_epi32(m2[i][2], m2[i][3]); m1[i][4] = _mm_add_epi32(m2[i][4], m2[i][5]); m1[i][5] = _mm_sub_epi32(m2[i][4], m2[i][5]); m1[i][6] = _mm_add_epi32(m2[i][6], m2[i][7]); m1[i][7] = _mm_sub_epi32(m2[i][6], m2[i][7]); m1[i][8] = _mm_add_epi32(m2[i][8], m2[i][9]); m1[i][9] = _mm_sub_epi32(m2[i][8], m2[i][9]); m1[i][10] = _mm_add_epi32(m2[i][10], m2[i][11]); m1[i][11] = _mm_sub_epi32(m2[i][10], m2[i][11]); m1[i][12] = _mm_add_epi32(m2[i][12], m2[i][13]); m1[i][13] = _mm_sub_epi32(m2[i][12], m2[i][13]); m1[i][14] = _mm_add_epi32(m2[i][14], m2[i][15]); m1[i][15] = _mm_sub_epi32(m2[i][14], m2[i][15]); } // process horizontal in two steps ( 2 x 8x8 blocks ) for (int l = 0; l < 4; l++) { int off = l * 4; for (int i = 0; i < 2; i++) { // transpose 4x4 m2[i][0 + off] = _mm_unpacklo_epi32(m1[i][0 + off], m1[i][1 + off]); m2[i][1 + off] = _mm_unpackhi_epi32(m1[i][0 + off], m1[i][1 + off]); m2[i][2 + off] = _mm_unpacklo_epi32(m1[i][2 + off], m1[i][3 + off]); m2[i][3 + off] = _mm_unpackhi_epi32(m1[i][2 + off], m1[i][3 + off]); m1[i][0 + off] = _mm_unpacklo_epi64(m2[i][0 + off], m2[i][2 + off]); m1[i][1 + off] = _mm_unpackhi_epi64(m2[i][0 + off], m2[i][2 + off]); m1[i][2 + off] = _mm_unpacklo_epi64(m2[i][1 + off], m2[i][3 + off]); m1[i][3 + off] = _mm_unpackhi_epi64(m2[i][1 + off], m2[i][3 + off]); } } #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = 0; #endif for (int l = 0; l < 2; l++) { int off = l * 8; __m128i n1[2][8]; __m128i n2[2][8]; for (int i = 0; i < 8; i++) { int ii = i % 4; int ij = i >> 2; n2[0][i] = m1[ij][off + ii]; n2[1][i] = m1[ij][off + ii + 4]; } for (int i = 0; i < 2; i++) { n1[i][0] = _mm_add_epi32(n2[i][0], n2[i][4]); n1[i][1] = _mm_add_epi32(n2[i][1], n2[i][5]); n1[i][2] = _mm_add_epi32(n2[i][2], n2[i][6]); n1[i][3] = _mm_add_epi32(n2[i][3], n2[i][7]); n1[i][4] = _mm_sub_epi32(n2[i][0], n2[i][4]); n1[i][5] = _mm_sub_epi32(n2[i][1], n2[i][5]); n1[i][6] = _mm_sub_epi32(n2[i][2], n2[i][6]); n1[i][7] = _mm_sub_epi32(n2[i][3], n2[i][7]); n2[i][0] = _mm_add_epi32(n1[i][0], n1[i][2]); n2[i][1] = _mm_add_epi32(n1[i][1], n1[i][3]); n2[i][2] = _mm_sub_epi32(n1[i][0], n1[i][2]); n2[i][3] = _mm_sub_epi32(n1[i][1], n1[i][3]); n2[i][4] = _mm_add_epi32(n1[i][4], n1[i][6]); n2[i][5] = _mm_add_epi32(n1[i][5], n1[i][7]); n2[i][6] = _mm_sub_epi32(n1[i][4], n1[i][6]); n2[i][7] = _mm_sub_epi32(n1[i][5], n1[i][7]); n1[i][0] = _mm_abs_epi32(_mm_add_epi32(n2[i][0], n2[i][1])); n1[i][1] = _mm_abs_epi32(_mm_sub_epi32(n2[i][0], n2[i][1])); n1[i][2] = _mm_abs_epi32(_mm_add_epi32(n2[i][2], n2[i][3])); n1[i][3] = _mm_abs_epi32(_mm_sub_epi32(n2[i][2], n2[i][3])); n1[i][4] = _mm_abs_epi32(_mm_add_epi32(n2[i][4], n2[i][5])); n1[i][5] = _mm_abs_epi32(_mm_sub_epi32(n2[i][4], n2[i][5])); n1[i][6] = _mm_abs_epi32(_mm_add_epi32(n2[i][6], n2[i][7])); n1[i][7] = _mm_abs_epi32(_mm_sub_epi32(n2[i][6], n2[i][7])); #if JVET_R0164_MEAN_SCALED_SATD if (l + i == 0) { absDc = _mm_cvtsi128_si32(n1[i][0]); } #endif } for (int i = 0; i < 8; i++) { n2[0][i] = 
_mm_add_epi32(n1[0][i], n1[1][i]); } n2[0][0] = _mm_add_epi32(n2[0][0], n2[0][1]); n2[0][2] = _mm_add_epi32(n2[0][2], n2[0][3]); n2[0][4] = _mm_add_epi32(n2[0][4], n2[0][5]); n2[0][6] = _mm_add_epi32(n2[0][6], n2[0][7]); n2[0][0] = _mm_add_epi32(n2[0][0], n2[0][2]); n2[0][4] = _mm_add_epi32(n2[0][4], n2[0][6]); sum = _mm_add_epi32(sum, _mm_add_epi32(n2[0][0], n2[0][4])); } sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(16.0 * 8) * 2); return sad; } #ifdef USE_AVX2 static Distortion xCalcHAD4x4_HBD_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i r0 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i*)&piOrg[0])); __m256i r1 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piOrg[strideOrg])); __m256i r2 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piOrg[2 * strideOrg])); __m256i r3 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piOrg[3 * strideOrg])); __m256i r4 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i*)&piCur[0])); __m256i r5 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piCur[strideCur])); __m256i r6 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piCur[2 * strideCur])); __m256i r7 = _mm256_castsi128_si256(_mm_lddqu_si128((const __m128i *) &piCur[3 * strideCur])); r0 = _mm256_sub_epi32(r0, r4); r1 = _mm256_sub_epi32(r1, r5); r2 = _mm256_sub_epi32(r2, r6); r3 = _mm256_sub_epi32(r3, r7); // first stage r4 = r0; r5 = r1; r0 = _mm256_add_epi32(r0, r3); r1 = _mm256_add_epi32(r1, r2); r4 = _mm256_sub_epi32(r4, r3); r5 = _mm256_sub_epi32(r5, r2); r2 = r0; r3 = r4; r0 = _mm256_add_epi32(r0, r1); r2 = _mm256_sub_epi32(r2, r1); r3 = _mm256_sub_epi32(r3, r5); r5 = _mm256_add_epi32(r5, r4); // shuffle - flip matrix for vertical transform r0 = _mm256_permute4x64_epi64(r0, 0x50); r2 = _mm256_permute4x64_epi64(r2, 0x50); r3 = _mm256_permute4x64_epi64(r3, 0x50); r5 = _mm256_permute4x64_epi64(r5, 0x50); r0 = _mm256_unpacklo_epi32(r0, r5); r2 = _mm256_unpacklo_epi32(r2, r3); r1 = r0; r0 = _mm256_unpacklo_epi64(r0, r2); r1 = _mm256_unpackhi_epi64(r1, r2); r2 = _mm256_permute4x64_epi64(r0, 0xee); r3 = _mm256_permute4x64_epi64(r1, 0xee); // second stage r4 = r0; r5 = r1; r0 = _mm256_add_epi32(r0, r3); r1 = _mm256_add_epi32(r1, r2); r4 = _mm256_sub_epi32(r4, r3); r5 = _mm256_sub_epi32(r5, r2); r2 = r0; r3 = r4; r0 = _mm256_add_epi32(r0, r1); r2 = _mm256_sub_epi32(r2, r1); r3 = _mm256_sub_epi32(r3, r5); r5 = _mm256_add_epi32(r5, r4); // abs __m256i Sum = _mm256_abs_epi32(r0); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(Sum)); #endif Sum = _mm256_add_epi32(Sum, _mm256_abs_epi32(r2)); Sum = _mm256_add_epi32(Sum, _mm256_abs_epi32(r3)); Sum = _mm256_add_epi32(Sum, _mm256_abs_epi32(r5)); Sum = _mm256_hadd_epi32(Sum, Sum); Sum = _mm256_hadd_epi32(Sum, Sum); Distortion sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(Sum)); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = ((sad + 1) >> 1); return sad; } static Distortion xCalcHAD8x8_HBD_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i m1[8], m2[8]; for (int k = 0; k < 8; k++) { m2[k] = _mm256_sub_epi32(_mm256_lddqu_si256((__m256i *) piOrg), _mm256_lddqu_si256((__m256i *) piCur)); piCur += strideCur; piOrg += strideOrg; } // vertical m1[0] = 
_mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m1[0] = _mm256_add_epi32(m2[0], m2[1]); m1[1] = _mm256_sub_epi32(m2[0], m2[1]); m1[2] = _mm256_add_epi32(m2[2], m2[3]); m1[3] = _mm256_sub_epi32(m2[2], m2[3]); m1[4] = _mm256_add_epi32(m2[4], m2[5]); m1[5] = _mm256_sub_epi32(m2[4], m2[5]); m1[6] = _mm256_add_epi32(m2[6], m2[7]); m1[7] = _mm256_sub_epi32(m2[6], m2[7]); // transpose m2[0] = _mm256_unpacklo_epi32(m1[0], m1[1]); m2[1] = _mm256_unpacklo_epi32(m1[2], m1[3]); m2[2] = _mm256_unpacklo_epi32(m1[4], m1[5]); m2[3] = _mm256_unpacklo_epi32(m1[6], m1[7]); m2[4] = _mm256_unpackhi_epi32(m1[0], m1[1]); m2[5] = _mm256_unpackhi_epi32(m1[2], m1[3]); m2[6] = _mm256_unpackhi_epi32(m1[4], m1[5]); m2[7] = _mm256_unpackhi_epi32(m1[6], m1[7]); m1[0] = _mm256_unpacklo_epi64(m2[0], m2[1]); m1[1] = _mm256_unpacklo_epi64(m2[2], m2[3]); m1[2] = _mm256_unpacklo_epi64(m2[4], m2[5]); m1[3] = _mm256_unpacklo_epi64(m2[6], m2[7]); m1[4] = _mm256_unpackhi_epi64(m2[0], m2[1]); m1[5] = _mm256_unpackhi_epi64(m2[2], m2[3]); m1[6] = _mm256_unpackhi_epi64(m2[4], m2[5]); m1[7] = _mm256_unpackhi_epi64(m2[6], m2[7]); m2[0] = _mm256_permute2x128_si256(m1[0], m1[1], 0x20); m2[4] = _mm256_permute2x128_si256(m1[0], m1[1], 0x31); m2[2] = _mm256_permute2x128_si256(m1[2], m1[3], 0x20); m2[6] = _mm256_permute2x128_si256(m1[2], m1[3], 0x31); m2[1] = _mm256_permute2x128_si256(m1[4], m1[5], 0x20); m2[5] = _mm256_permute2x128_si256(m1[4], m1[5], 0x31); m2[3] = _mm256_permute2x128_si256(m1[6], m1[7], 0x20); m2[7] = _mm256_permute2x128_si256(m1[6], m1[7], 0x31); // horizontal m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m1[0] = _mm256_abs_epi32(_mm256_add_epi32(m2[0], m2[1])); m1[1] = _mm256_abs_epi32(_mm256_sub_epi32(m2[0], m2[1])); m1[2] = _mm256_abs_epi32(_mm256_add_epi32(m2[2], m2[3])); m1[3] = _mm256_abs_epi32(_mm256_sub_epi32(m2[2], m2[3])); m1[4] = _mm256_abs_epi32(_mm256_add_epi32(m2[4], m2[5])); m1[5] = _mm256_abs_epi32(_mm256_sub_epi32(m2[4], m2[5])); m1[6] = _mm256_abs_epi32(_mm256_add_epi32(m2[6], m2[7])); m1[7] = _mm256_abs_epi32(_mm256_sub_epi32(m2[6], m2[7])); m2[0] = _mm256_add_epi32(m1[0], m1[1]); m2[2] = _mm256_add_epi32(m1[2], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[5]); m2[6] = _mm256_add_epi32(m1[6], m1[7]); m2[0] = _mm256_add_epi32(m2[0], m2[2]); m2[4] = _mm256_add_epi32(m2[4], m2[6]); __m256i sum = _mm256_add_epi32(m2[0], m2[4]); sum = 
_mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); sad += _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute4x64_epi64(sum, 0xee))); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(m1[0])); sad -= absDc; sad += absDc >> 2; #endif sad = ((sad + 2) >> 2); return sad; } static Distortion xCalcHAD4x8_HBD_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i m1[8], m2[8], n1[4], n2[4]; for (int k = 0; k < 8; k++) { m2[k] = _mm256_sub_epi32(_mm256_castsi128_si256(_mm_lddqu_si128((__m128i*)piOrg)), _mm256_castsi128_si256(_mm_lddqu_si128((__m128i*)piCur))); piCur += strideCur; piOrg += strideOrg; } // vertical m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m1[0] = _mm256_permute4x64_epi64(_mm256_add_epi32(m2[0], m2[1]), 0x50); m1[1] = _mm256_permute4x64_epi64(_mm256_sub_epi32(m2[0], m2[1]), 0x50); m1[2] = _mm256_permute4x64_epi64(_mm256_add_epi32(m2[2], m2[3]), 0x50); m1[3] = _mm256_permute4x64_epi64(_mm256_sub_epi32(m2[2], m2[3]), 0x50); m1[4] = _mm256_permute4x64_epi64(_mm256_add_epi32(m2[4], m2[5]), 0x50); m1[5] = _mm256_permute4x64_epi64(_mm256_sub_epi32(m2[4], m2[5]), 0x50); m1[6] = _mm256_permute4x64_epi64(_mm256_add_epi32(m2[6], m2[7]), 0x50); m1[7] = _mm256_permute4x64_epi64(_mm256_sub_epi32(m2[6], m2[7]), 0x50); // transpose m2[0] = _mm256_unpacklo_epi32(m1[0], m1[1]); m2[1] = _mm256_unpacklo_epi32(m1[2], m1[3]); m2[2] = _mm256_unpacklo_epi32(m1[4], m1[5]); m2[3] = _mm256_unpacklo_epi32(m1[6], m1[7]); m1[0] = _mm256_unpacklo_epi64(m2[0], m2[1]); m1[1] = _mm256_unpackhi_epi64(m2[0], m2[1]); m1[2] = _mm256_unpacklo_epi64(m2[2], m2[3]); m1[3] = _mm256_unpackhi_epi64(m2[2], m2[3]); n1[0] = _mm256_inserti128_si256(m1[0], _mm256_castsi256_si128(m1[2]), 1); n1[1] = _mm256_inserti128_si256(m1[1], _mm256_castsi256_si128(m1[3]), 1); n1[2] = _mm256_inserti128_si256(m1[2], _mm256_castsi256_si128(_mm256_permute4x64_epi64(m1[0], 0xee)), 0); n1[3] = _mm256_inserti128_si256(m1[3], _mm256_castsi256_si128(_mm256_permute4x64_epi64(m1[1], 0xee)), 0); n2[0] = _mm256_add_epi32(n1[0], n1[2]); n2[1] = _mm256_add_epi32(n1[1], n1[3]); n2[2] = _mm256_sub_epi32(n1[0], n1[2]); n2[3] = _mm256_sub_epi32(n1[1], n1[3]); n1[0] = _mm256_abs_epi32(_mm256_add_epi32(n2[0], n2[1])); n1[1] = _mm256_abs_epi32(_mm256_sub_epi32(n2[0], n2[1])); n1[2] = _mm256_abs_epi32(_mm256_add_epi32(n2[2], n2[3])); n1[3] = _mm256_abs_epi32(_mm256_sub_epi32(n2[2], n2[3])); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(n1[0])); #endif m1[0] = _mm256_add_epi32(n1[0], n1[1]); m1[2] = _mm256_add_epi32(n1[2], n1[3]); __m256i sum = _mm256_add_epi32(m1[0], m1[2]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); sad += 
_mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute4x64_epi64(sum, 0xee))); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(4.0 * 8) * 2); return sad; } static Distortion xCalcHAD8x4_HBD_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i m1[8], m2[8]; for (int k = 0; k < 4; k++) { m1[k] = _mm256_sub_epi32(_mm256_lddqu_si256((__m256i*) piOrg), _mm256_lddqu_si256((__m256i*) piCur)); piCur += strideCur; piOrg += strideOrg; } // vertical m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m1[0] = _mm256_add_epi32(m2[0], m2[1]); m1[1] = _mm256_sub_epi32(m2[0], m2[1]); m1[2] = _mm256_add_epi32(m2[2], m2[3]); m1[3] = _mm256_sub_epi32(m2[2], m2[3]); // transpose m2[0] = _mm256_unpacklo_epi32(m1[0], m1[1]); m2[1] = _mm256_unpacklo_epi32(m1[2], m1[3]); m2[2] = _mm256_unpackhi_epi32(m1[0], m1[1]); m2[3] = _mm256_unpackhi_epi32(m1[2], m1[3]); m1[0] = _mm256_unpacklo_epi64(m2[0], m2[1]); m1[1] = _mm256_unpackhi_epi64(m2[0], m2[1]); m1[2] = _mm256_unpacklo_epi64(m2[2], m2[3]); m1[3] = _mm256_unpackhi_epi64(m2[2], m2[3]); m2[0] = m1[0]; m2[1] = m1[1]; m2[2] = m1[2]; m2[3] = m1[3]; m2[4] = _mm256_permute4x64_epi64(m1[0], 0xee); m2[5] = _mm256_permute4x64_epi64(m1[1], 0xee); m2[6] = _mm256_permute4x64_epi64(m1[2], 0xee); m2[7] = _mm256_permute4x64_epi64(m1[3], 0xee); // horizontal m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m1[0] = _mm256_abs_epi32(_mm256_add_epi32(m2[0], m2[1])); m1[1] = _mm256_abs_epi32(_mm256_sub_epi32(m2[0], m2[1])); m1[2] = _mm256_abs_epi32(_mm256_add_epi32(m2[2], m2[3])); m1[3] = _mm256_abs_epi32(_mm256_sub_epi32(m2[2], m2[3])); m1[4] = _mm256_abs_epi32(_mm256_add_epi32(m2[4], m2[5])); m1[5] = _mm256_abs_epi32(_mm256_sub_epi32(m2[4], m2[5])); m1[6] = _mm256_abs_epi32(_mm256_add_epi32(m2[6], m2[7])); m1[7] = _mm256_abs_epi32(_mm256_sub_epi32(m2[6], m2[7])); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(m1[0])); #endif m1[0] = _mm256_add_epi32(m1[0], m1[1]); m1[1] = _mm256_add_epi32(m1[2], m1[3]); m1[2] = _mm256_add_epi32(m1[4], m1[5]); m1[3] = _mm256_add_epi32(m1[6], m1[7]); m1[0] = _mm256_add_epi32(m1[0], m1[1]); m1[1] = _mm256_add_epi32(m1[2], m1[3]); __m256i sum = _mm256_add_epi32(m1[0], m1[1]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); Distortion sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (Distortion)(sad / sqrt(4.0 * 8) * 2); return sad; } static Distortion xCalcHAD16x8_HBD_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i m1[16], m2[16]; for (int k = 0; k < 8; k++) { m1[k] = _mm256_sub_epi32(_mm256_lddqu_si256((__m256i*) piOrg), _mm256_lddqu_si256((__m256i*) 
piCur)); m1[k + 8] = _mm256_sub_epi32(_mm256_lddqu_si256((__m256i*)(piOrg + 8)), _mm256_lddqu_si256((__m256i*)(piCur + 8))); piCur += strideCur; piOrg += strideOrg; } // vertical, first 8x8 m2[0] = _mm256_add_epi32(m1[0], m1[4]); m2[1] = _mm256_add_epi32(m1[1], m1[5]); m2[2] = _mm256_add_epi32(m1[2], m1[6]); m2[3] = _mm256_add_epi32(m1[3], m1[7]); m2[4] = _mm256_sub_epi32(m1[0], m1[4]); m2[5] = _mm256_sub_epi32(m1[1], m1[5]); m2[6] = _mm256_sub_epi32(m1[2], m1[6]); m2[7] = _mm256_sub_epi32(m1[3], m1[7]); m1[0] = _mm256_add_epi32(m2[0], m2[2]); m1[1] = _mm256_add_epi32(m2[1], m2[3]); m1[2] = _mm256_sub_epi32(m2[0], m2[2]); m1[3] = _mm256_sub_epi32(m2[1], m2[3]); m1[4] = _mm256_add_epi32(m2[4], m2[6]); m1[5] = _mm256_add_epi32(m2[5], m2[7]); m1[6] = _mm256_sub_epi32(m2[4], m2[6]); m1[7] = _mm256_sub_epi32(m2[5], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[1]); m2[1] = _mm256_sub_epi32(m1[0], m1[1]); m2[2] = _mm256_add_epi32(m1[2], m1[3]); m2[3] = _mm256_sub_epi32(m1[2], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[5]); m2[5] = _mm256_sub_epi32(m1[4], m1[5]); m2[6] = _mm256_add_epi32(m1[6], m1[7]); m2[7] = _mm256_sub_epi32(m1[6], m1[7]); // vertical, second 8x8 m2[8 + 0] = _mm256_add_epi32(m1[8 + 0], m1[8 + 4]); m2[8 + 1] = _mm256_add_epi32(m1[8 + 1], m1[8 + 5]); m2[8 + 2] = _mm256_add_epi32(m1[8 + 2], m1[8 + 6]); m2[8 + 3] = _mm256_add_epi32(m1[8 + 3], m1[8 + 7]); m2[8 + 4] = _mm256_sub_epi32(m1[8 + 0], m1[8 + 4]); m2[8 + 5] = _mm256_sub_epi32(m1[8 + 1], m1[8 + 5]); m2[8 + 6] = _mm256_sub_epi32(m1[8 + 2], m1[8 + 6]); m2[8 + 7] = _mm256_sub_epi32(m1[8 + 3], m1[8 + 7]); m1[8 + 0] = _mm256_add_epi32(m2[8 + 0], m2[8 + 2]); m1[8 + 1] = _mm256_add_epi32(m2[8 + 1], m2[8 + 3]); m1[8 + 2] = _mm256_sub_epi32(m2[8 + 0], m2[8 + 2]); m1[8 + 3] = _mm256_sub_epi32(m2[8 + 1], m2[8 + 3]); m1[8 + 4] = _mm256_add_epi32(m2[8 + 4], m2[8 + 6]); m1[8 + 5] = _mm256_add_epi32(m2[8 + 5], m2[8 + 7]); m1[8 + 6] = _mm256_sub_epi32(m2[8 + 4], m2[8 + 6]); m1[8 + 7] = _mm256_sub_epi32(m2[8 + 5], m2[8 + 7]); m2[8 + 0] = _mm256_add_epi32(m1[8 + 0], m1[8 + 1]); m2[8 + 1] = _mm256_sub_epi32(m1[8 + 0], m1[8 + 1]); m2[8 + 2] = _mm256_add_epi32(m1[8 + 2], m1[8 + 3]); m2[8 + 3] = _mm256_sub_epi32(m1[8 + 2], m1[8 + 3]); m2[8 + 4] = _mm256_add_epi32(m1[8 + 4], m1[8 + 5]); m2[8 + 5] = _mm256_sub_epi32(m1[8 + 4], m1[8 + 5]); m2[8 + 6] = _mm256_add_epi32(m1[8 + 6], m1[8 + 7]); m2[8 + 7] = _mm256_sub_epi32(m1[8 + 6], m1[8 + 7]); // transpose constexpr int perm_unpacklo_epi128 = (0 << 0) + (2 << 4); constexpr int perm_unpackhi_epi128 = (1 << 0) + (3 << 4); m1[0] = _mm256_unpacklo_epi32(m2[0], m2[1]); m1[1] = _mm256_unpacklo_epi32(m2[2], m2[3]); m1[2] = _mm256_unpacklo_epi32(m2[4], m2[5]); m1[3] = _mm256_unpacklo_epi32(m2[6], m2[7]); m1[4] = _mm256_unpackhi_epi32(m2[0], m2[1]); m1[5] = _mm256_unpackhi_epi32(m2[2], m2[3]); m1[6] = _mm256_unpackhi_epi32(m2[4], m2[5]); m1[7] = _mm256_unpackhi_epi32(m2[6], m2[7]); m2[0] = _mm256_unpacklo_epi64(m1[0], m1[1]); m2[1] = _mm256_unpackhi_epi64(m1[0], m1[1]); m2[2] = _mm256_unpacklo_epi64(m1[2], m1[3]); m2[3] = _mm256_unpackhi_epi64(m1[2], m1[3]); m2[4] = _mm256_unpacklo_epi64(m1[4], m1[5]); m2[5] = _mm256_unpackhi_epi64(m1[4], m1[5]); m2[6] = _mm256_unpacklo_epi64(m1[6], m1[7]); m2[7] = _mm256_unpackhi_epi64(m1[6], m1[7]); m1[0] = _mm256_permute2x128_si256(m2[0], m2[2], perm_unpacklo_epi128); m1[1] = _mm256_permute2x128_si256(m2[0], m2[2], perm_unpackhi_epi128); m1[2] = _mm256_permute2x128_si256(m2[1], m2[3], perm_unpacklo_epi128); m1[3] = _mm256_permute2x128_si256(m2[1], m2[3], perm_unpackhi_epi128); 
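  // The 8x8 transpose of 32-bit residuals used here is built in three steps:
  // _mm256_unpacklo/unpackhi_epi32 interleave single elements within each 128-bit lane,
  // _mm256_unpacklo/unpackhi_epi64 interleave element pairs, and _mm256_permute2x128_si256
  // with perm_unpacklo_epi128 / perm_unpackhi_epi128 gathers the low / high 128-bit halves
  // of its two sources to complete the cross-lane exchange.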
m1[4] = _mm256_permute2x128_si256(m2[4], m2[6], perm_unpacklo_epi128); m1[5] = _mm256_permute2x128_si256(m2[4], m2[6], perm_unpackhi_epi128); m1[6] = _mm256_permute2x128_si256(m2[5], m2[7], perm_unpacklo_epi128); m1[7] = _mm256_permute2x128_si256(m2[5], m2[7], perm_unpackhi_epi128); m1[8 + 0] = _mm256_unpacklo_epi32(m2[8 + 0], m2[8 + 1]); m1[8 + 1] = _mm256_unpacklo_epi32(m2[8 + 2], m2[8 + 3]); m1[8 + 2] = _mm256_unpacklo_epi32(m2[8 + 4], m2[8 + 5]); m1[8 + 3] = _mm256_unpacklo_epi32(m2[8 + 6], m2[8 + 7]); m1[8 + 4] = _mm256_unpackhi_epi32(m2[8 + 0], m2[8 + 1]); m1[8 + 5] = _mm256_unpackhi_epi32(m2[8 + 2], m2[8 + 3]); m1[8 + 6] = _mm256_unpackhi_epi32(m2[8 + 4], m2[8 + 5]); m1[8 + 7] = _mm256_unpackhi_epi32(m2[8 + 6], m2[8 + 7]); m2[8 + 0] = _mm256_unpacklo_epi64(m1[8 + 0], m1[8 + 1]); m2[8 + 1] = _mm256_unpackhi_epi64(m1[8 + 0], m1[8 + 1]); m2[8 + 2] = _mm256_unpacklo_epi64(m1[8 + 2], m1[8 + 3]); m2[8 + 3] = _mm256_unpackhi_epi64(m1[8 + 2], m1[8 + 3]); m2[8 + 4] = _mm256_unpacklo_epi64(m1[8 + 4], m1[8 + 5]); m2[8 + 5] = _mm256_unpackhi_epi64(m1[8 + 4], m1[8 + 5]); m2[8 + 6] = _mm256_unpacklo_epi64(m1[8 + 6], m1[8 + 7]); m2[8 + 7] = _mm256_unpackhi_epi64(m1[8 + 6], m1[8 + 7]); m1[8 + 0] = _mm256_permute2x128_si256(m2[8 + 0], m2[8 + 2], perm_unpacklo_epi128); m1[8 + 1] = _mm256_permute2x128_si256(m2[8 + 0], m2[8 + 2], perm_unpackhi_epi128); m1[8 + 2] = _mm256_permute2x128_si256(m2[8 + 1], m2[8 + 3], perm_unpacklo_epi128); m1[8 + 3] = _mm256_permute2x128_si256(m2[8 + 1], m2[8 + 3], perm_unpackhi_epi128); m1[8 + 4] = _mm256_permute2x128_si256(m2[8 + 4], m2[8 + 6], perm_unpacklo_epi128); m1[8 + 5] = _mm256_permute2x128_si256(m2[8 + 4], m2[8 + 6], perm_unpackhi_epi128); m1[8 + 6] = _mm256_permute2x128_si256(m2[8 + 5], m2[8 + 7], perm_unpacklo_epi128); m1[8 + 7] = _mm256_permute2x128_si256(m2[8 + 5], m2[8 + 7], perm_unpackhi_epi128); // horizontal m2[0] = _mm256_add_epi32(m1[0], m1[8]); m2[1] = _mm256_add_epi32(m1[1], m1[9]); m2[2] = _mm256_add_epi32(m1[2], m1[10]); m2[3] = _mm256_add_epi32(m1[3], m1[11]); m2[4] = _mm256_add_epi32(m1[4], m1[12]); m2[5] = _mm256_add_epi32(m1[5], m1[13]); m2[6] = _mm256_add_epi32(m1[6], m1[14]); m2[7] = _mm256_add_epi32(m1[7], m1[15]); m2[8] = _mm256_sub_epi32(m1[0], m1[8]); m2[9] = _mm256_sub_epi32(m1[1], m1[9]); m2[10] = _mm256_sub_epi32(m1[2], m1[10]); m2[11] = _mm256_sub_epi32(m1[3], m1[11]); m2[12] = _mm256_sub_epi32(m1[4], m1[12]); m2[13] = _mm256_sub_epi32(m1[5], m1[13]); m2[14] = _mm256_sub_epi32(m1[6], m1[14]); m2[15] = _mm256_sub_epi32(m1[7], m1[15]); m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m1[8] = _mm256_add_epi32(m2[8], m2[12]); m1[9] = _mm256_add_epi32(m2[9], m2[13]); m1[10] = _mm256_add_epi32(m2[10], m2[14]); m1[11] = _mm256_add_epi32(m2[11], m2[15]); m1[12] = _mm256_sub_epi32(m2[8], m2[12]); m1[13] = _mm256_sub_epi32(m2[9], m2[13]); m1[14] = _mm256_sub_epi32(m2[10], m2[14]); m1[15] = _mm256_sub_epi32(m2[11], m2[15]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m2[8] = _mm256_add_epi32(m1[8], m1[10]); 
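  // Each add/sub pair of this horizontal pass is one radix-2 butterfly of the Hadamard
  // transform applied along the 16-sample rows: stage strides 8, 4, 2 and 1 over the
  // register index. Illustrative scalar equivalent (a sketch for reference only, not part
  // of the SIMD path; equivalent up to the output ordering, which does not matter for
  // SATD; "v" is assumed to hold one row of N residuals):
  //
  //   for (int stride = N >> 1; stride >= 1; stride >>= 1)
  //   {
  //     for (int i = 0; i < N; i += 2 * stride)
  //     {
  //       for (int j = i; j < i + stride; j++)
  //       {
  //         const int a   = v[j];
  //         const int b   = v[j + stride];
  //         v[j]          = a + b;   // butterfly "sum" output
  //         v[j + stride] = a - b;   // butterfly "difference" output
  //       }
  //     }
  //   }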
m2[9] = _mm256_add_epi32(m1[9], m1[11]); m2[10] = _mm256_sub_epi32(m1[8], m1[10]); m2[11] = _mm256_sub_epi32(m1[9], m1[11]); m2[12] = _mm256_add_epi32(m1[12], m1[14]); m2[13] = _mm256_add_epi32(m1[13], m1[15]); m2[14] = _mm256_sub_epi32(m1[12], m1[14]); m2[15] = _mm256_sub_epi32(m1[13], m1[15]); m1[0] = _mm256_abs_epi32(_mm256_add_epi32(m2[0], m2[1])); m1[1] = _mm256_abs_epi32(_mm256_sub_epi32(m2[0], m2[1])); m1[2] = _mm256_abs_epi32(_mm256_add_epi32(m2[2], m2[3])); m1[3] = _mm256_abs_epi32(_mm256_sub_epi32(m2[2], m2[3])); m1[4] = _mm256_abs_epi32(_mm256_add_epi32(m2[4], m2[5])); m1[5] = _mm256_abs_epi32(_mm256_sub_epi32(m2[4], m2[5])); m1[6] = _mm256_abs_epi32(_mm256_add_epi32(m2[6], m2[7])); m1[7] = _mm256_abs_epi32(_mm256_sub_epi32(m2[6], m2[7])); m1[8] = _mm256_abs_epi32(_mm256_add_epi32(m2[8], m2[9])); m1[9] = _mm256_abs_epi32(_mm256_sub_epi32(m2[8], m2[9])); m1[10] = _mm256_abs_epi32(_mm256_add_epi32(m2[10], m2[11])); m1[11] = _mm256_abs_epi32(_mm256_sub_epi32(m2[10], m2[11])); m1[12] = _mm256_abs_epi32(_mm256_add_epi32(m2[12], m2[13])); m1[13] = _mm256_abs_epi32(_mm256_sub_epi32(m2[12], m2[13])); m1[14] = _mm256_abs_epi32(_mm256_add_epi32(m2[14], m2[15])); m1[15] = _mm256_abs_epi32(_mm256_sub_epi32(m2[14], m2[15])); #if JVET_R0164_MEAN_SCALED_SATD Distortion absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(m1[0])); #endif // sum up m1[0] = _mm256_add_epi32(m1[0], m1[1]); m1[2] = _mm256_add_epi32(m1[2], m1[3]); m1[4] = _mm256_add_epi32(m1[4], m1[5]); m1[6] = _mm256_add_epi32(m1[6], m1[7]); m1[8] = _mm256_add_epi32(m1[8], m1[9]); m1[10] = _mm256_add_epi32(m1[10], m1[11]); m1[12] = _mm256_add_epi32(m1[12], m1[13]); m1[14] = _mm256_add_epi32(m1[14], m1[15]); m1[0] = _mm256_add_epi32(m1[0], m1[2]); m1[4] = _mm256_add_epi32(m1[4], m1[6]); m1[8] = _mm256_add_epi32(m1[8], m1[10]); m1[12] = _mm256_add_epi32(m1[12], m1[14]); m1[0] = _mm256_add_epi32(m1[0], m1[4]); m1[8] = _mm256_add_epi32(m1[8], m1[12]); __m256i sum = _mm256_add_epi32(m1[0], m1[8]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_add_epi32(sum, _mm256_permute2x128_si256(sum, sum, 0x11)); Distortion sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = (uint32_t)(sad / sqrt(16.0 * 8) * 2); return (sad); } static Distortion xCalcHAD8x16_HBD_AVX2(const Pel *piOrg, const Pel *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur) { __m256i m1[16], m2[16]; for (int k = 0; k < 16; k++) { m1[k] = _mm256_sub_epi32(_mm256_lddqu_si256((__m256i*)piOrg), _mm256_lddqu_si256((__m256i*)piCur)); piCur += strideCur; piOrg += strideOrg; } // vertical m2[0] = _mm256_add_epi32(m1[0], m1[8]); m2[1] = _mm256_add_epi32(m1[1], m1[9]); m2[2] = _mm256_add_epi32(m1[2], m1[10]); m2[3] = _mm256_add_epi32(m1[3], m1[11]); m2[4] = _mm256_add_epi32(m1[4], m1[12]); m2[5] = _mm256_add_epi32(m1[5], m1[13]); m2[6] = _mm256_add_epi32(m1[6], m1[14]); m2[7] = _mm256_add_epi32(m1[7], m1[15]); m2[8] = _mm256_sub_epi32(m1[0], m1[8]); m2[9] = _mm256_sub_epi32(m1[1], m1[9]); m2[10] = _mm256_sub_epi32(m1[2], m1[10]); m2[11] = _mm256_sub_epi32(m1[3], m1[11]); m2[12] = _mm256_sub_epi32(m1[4], m1[12]); m2[13] = _mm256_sub_epi32(m1[5], m1[13]); m2[14] = _mm256_sub_epi32(m1[6], m1[14]); m2[15] = _mm256_sub_epi32(m1[7], m1[15]); m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], 
m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m1[8] = _mm256_add_epi32(m2[8], m2[12]); m1[9] = _mm256_add_epi32(m2[9], m2[13]); m1[10] = _mm256_add_epi32(m2[10], m2[14]); m1[11] = _mm256_add_epi32(m2[11], m2[15]); m1[12] = _mm256_sub_epi32(m2[8], m2[12]); m1[13] = _mm256_sub_epi32(m2[9], m2[13]); m1[14] = _mm256_sub_epi32(m2[10], m2[14]); m1[15] = _mm256_sub_epi32(m2[11], m2[15]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m2[8] = _mm256_add_epi32(m1[8], m1[10]); m2[9] = _mm256_add_epi32(m1[9], m1[11]); m2[10] = _mm256_sub_epi32(m1[8], m1[10]); m2[11] = _mm256_sub_epi32(m1[9], m1[11]); m2[12] = _mm256_add_epi32(m1[12], m1[14]); m2[13] = _mm256_add_epi32(m1[13], m1[15]); m2[14] = _mm256_sub_epi32(m1[12], m1[14]); m2[15] = _mm256_sub_epi32(m1[13], m1[15]); m1[0] = _mm256_add_epi32(m2[0], m2[1]); m1[1] = _mm256_sub_epi32(m2[0], m2[1]); m1[2] = _mm256_add_epi32(m2[2], m2[3]); m1[3] = _mm256_sub_epi32(m2[2], m2[3]); m1[4] = _mm256_add_epi32(m2[4], m2[5]); m1[5] = _mm256_sub_epi32(m2[4], m2[5]); m1[6] = _mm256_add_epi32(m2[6], m2[7]); m1[7] = _mm256_sub_epi32(m2[6], m2[7]); m1[8] = _mm256_add_epi32(m2[8], m2[9]); m1[9] = _mm256_sub_epi32(m2[8], m2[9]); m1[10] = _mm256_add_epi32(m2[10], m2[11]); m1[11] = _mm256_sub_epi32(m2[10], m2[11]); m1[12] = _mm256_add_epi32(m2[12], m2[13]); m1[13] = _mm256_sub_epi32(m2[12], m2[13]); m1[14] = _mm256_add_epi32(m2[14], m2[15]); m1[15] = _mm256_sub_epi32(m2[14], m2[15]); // transpose constexpr int perm_unpacklo_epi128 = (0 << 0) + (2 << 4); constexpr int perm_unpackhi_epi128 = (1 << 0) + (3 << 4); // 1. 8x8 m2[0] = _mm256_unpacklo_epi32(m1[0], m1[1]); m2[1] = _mm256_unpacklo_epi32(m1[2], m1[3]); m2[2] = _mm256_unpacklo_epi32(m1[4], m1[5]); m2[3] = _mm256_unpacklo_epi32(m1[6], m1[7]); m2[4] = _mm256_unpackhi_epi32(m1[0], m1[1]); m2[5] = _mm256_unpackhi_epi32(m1[2], m1[3]); m2[6] = _mm256_unpackhi_epi32(m1[4], m1[5]); m2[7] = _mm256_unpackhi_epi32(m1[6], m1[7]); m1[0] = _mm256_unpacklo_epi64(m2[0], m2[1]); m1[1] = _mm256_unpackhi_epi64(m2[0], m2[1]); m1[2] = _mm256_unpacklo_epi64(m2[2], m2[3]); m1[3] = _mm256_unpackhi_epi64(m2[2], m2[3]); m1[4] = _mm256_unpacklo_epi64(m2[4], m2[5]); m1[5] = _mm256_unpackhi_epi64(m2[4], m2[5]); m1[6] = _mm256_unpacklo_epi64(m2[6], m2[7]); m1[7] = _mm256_unpackhi_epi64(m2[6], m2[7]); m2[0] = _mm256_permute2x128_si256(m1[0], m1[2], perm_unpacklo_epi128); m2[1] = _mm256_permute2x128_si256(m1[0], m1[2], perm_unpackhi_epi128); m2[2] = _mm256_permute2x128_si256(m1[1], m1[3], perm_unpacklo_epi128); m2[3] = _mm256_permute2x128_si256(m1[1], m1[3], perm_unpackhi_epi128); m2[4] = _mm256_permute2x128_si256(m1[4], m1[6], perm_unpacklo_epi128); m2[5] = _mm256_permute2x128_si256(m1[4], m1[6], perm_unpackhi_epi128); m2[6] = _mm256_permute2x128_si256(m1[5], m1[7], perm_unpacklo_epi128); m2[7] = _mm256_permute2x128_si256(m1[5], m1[7], perm_unpackhi_epi128); // 2. 
8x8 m2[0 + 8] = _mm256_unpacklo_epi32(m1[0 + 8], m1[1 + 8]); m2[1 + 8] = _mm256_unpacklo_epi32(m1[2 + 8], m1[3 + 8]); m2[2 + 8] = _mm256_unpacklo_epi32(m1[4 + 8], m1[5 + 8]); m2[3 + 8] = _mm256_unpacklo_epi32(m1[6 + 8], m1[7 + 8]); m2[4 + 8] = _mm256_unpackhi_epi32(m1[0 + 8], m1[1 + 8]); m2[5 + 8] = _mm256_unpackhi_epi32(m1[2 + 8], m1[3 + 8]); m2[6 + 8] = _mm256_unpackhi_epi32(m1[4 + 8], m1[5 + 8]); m2[7 + 8] = _mm256_unpackhi_epi32(m1[6 + 8], m1[7 + 8]); m1[0 + 8] = _mm256_unpacklo_epi64(m2[0 + 8], m2[1 + 8]); m1[1 + 8] = _mm256_unpackhi_epi64(m2[0 + 8], m2[1 + 8]); m1[2 + 8] = _mm256_unpacklo_epi64(m2[2 + 8], m2[3 + 8]); m1[3 + 8] = _mm256_unpackhi_epi64(m2[2 + 8], m2[3 + 8]); m1[4 + 8] = _mm256_unpacklo_epi64(m2[4 + 8], m2[5 + 8]); m1[5 + 8] = _mm256_unpackhi_epi64(m2[4 + 8], m2[5 + 8]); m1[6 + 8] = _mm256_unpacklo_epi64(m2[6 + 8], m2[7 + 8]); m1[7 + 8] = _mm256_unpackhi_epi64(m2[6 + 8], m2[7 + 8]); m2[0 + 8] = _mm256_permute2x128_si256(m1[0 + 8], m1[2 + 8], perm_unpacklo_epi128); m2[1 + 8] = _mm256_permute2x128_si256(m1[0 + 8], m1[2 + 8], perm_unpackhi_epi128); m2[2 + 8] = _mm256_permute2x128_si256(m1[1 + 8], m1[3 + 8], perm_unpacklo_epi128); m2[3 + 8] = _mm256_permute2x128_si256(m1[1 + 8], m1[3 + 8], perm_unpackhi_epi128); m2[4 + 8] = _mm256_permute2x128_si256(m1[4 + 8], m1[6 + 8], perm_unpacklo_epi128); m2[5 + 8] = _mm256_permute2x128_si256(m1[4 + 8], m1[6 + 8], perm_unpackhi_epi128); m2[6 + 8] = _mm256_permute2x128_si256(m1[5 + 8], m1[7 + 8], perm_unpacklo_epi128); m2[7 + 8] = _mm256_permute2x128_si256(m1[5 + 8], m1[7 + 8], perm_unpackhi_epi128); // horizontal m1[0] = _mm256_add_epi32(m2[0], m2[4]); m1[1] = _mm256_add_epi32(m2[1], m2[5]); m1[2] = _mm256_add_epi32(m2[2], m2[6]); m1[3] = _mm256_add_epi32(m2[3], m2[7]); m1[4] = _mm256_sub_epi32(m2[0], m2[4]); m1[5] = _mm256_sub_epi32(m2[1], m2[5]); m1[6] = _mm256_sub_epi32(m2[2], m2[6]); m1[7] = _mm256_sub_epi32(m2[3], m2[7]); m2[0] = _mm256_add_epi32(m1[0], m1[2]); m2[1] = _mm256_add_epi32(m1[1], m1[3]); m2[2] = _mm256_sub_epi32(m1[0], m1[2]); m2[3] = _mm256_sub_epi32(m1[1], m1[3]); m2[4] = _mm256_add_epi32(m1[4], m1[6]); m2[5] = _mm256_add_epi32(m1[5], m1[7]); m2[6] = _mm256_sub_epi32(m1[4], m1[6]); m2[7] = _mm256_sub_epi32(m1[5], m1[7]); m1[0] = _mm256_abs_epi32(_mm256_add_epi32(m2[0], m2[1])); m1[1] = _mm256_abs_epi32(_mm256_sub_epi32(m2[0], m2[1])); m1[2] = _mm256_abs_epi32(_mm256_add_epi32(m2[2], m2[3])); m1[3] = _mm256_abs_epi32(_mm256_sub_epi32(m2[2], m2[3])); m1[4] = _mm256_abs_epi32(_mm256_add_epi32(m2[4], m2[5])); m1[5] = _mm256_abs_epi32(_mm256_sub_epi32(m2[4], m2[5])); m1[6] = _mm256_abs_epi32(_mm256_add_epi32(m2[6], m2[7])); m1[7] = _mm256_abs_epi32(_mm256_sub_epi32(m2[6], m2[7])); #if JVET_R0164_MEAN_SCALED_SATD int absDc = _mm_cvtsi128_si32(_mm256_castsi256_si128(m1[0])); #endif m1[0 + 8] = _mm256_add_epi32(m2[0 + 8], m2[4 + 8]); m1[1 + 8] = _mm256_add_epi32(m2[1 + 8], m2[5 + 8]); m1[2 + 8] = _mm256_add_epi32(m2[2 + 8], m2[6 + 8]); m1[3 + 8] = _mm256_add_epi32(m2[3 + 8], m2[7 + 8]); m1[4 + 8] = _mm256_sub_epi32(m2[0 + 8], m2[4 + 8]); m1[5 + 8] = _mm256_sub_epi32(m2[1 + 8], m2[5 + 8]); m1[6 + 8] = _mm256_sub_epi32(m2[2 + 8], m2[6 + 8]); m1[7 + 8] = _mm256_sub_epi32(m2[3 + 8], m2[7 + 8]); m2[0 + 8] = _mm256_add_epi32(m1[0 + 8], m1[2 + 8]); m2[1 + 8] = _mm256_add_epi32(m1[1 + 8], m1[3 + 8]); m2[2 + 8] = _mm256_sub_epi32(m1[0 + 8], m1[2 + 8]); m2[3 + 8] = _mm256_sub_epi32(m1[1 + 8], m1[3 + 8]); m2[4 + 8] = _mm256_add_epi32(m1[4 + 8], m1[6 + 8]); m2[5 + 8] = _mm256_add_epi32(m1[5 + 8], m1[7 + 8]); m2[6 + 8] = 
_mm256_sub_epi32(m1[4 + 8], m1[6 + 8]);
  m2[7 + 8] = _mm256_sub_epi32(m1[5 + 8], m1[7 + 8]);

  m1[0 + 8] = _mm256_abs_epi32(_mm256_add_epi32(m2[0 + 8], m2[1 + 8]));
  m1[1 + 8] = _mm256_abs_epi32(_mm256_sub_epi32(m2[0 + 8], m2[1 + 8]));
  m1[2 + 8] = _mm256_abs_epi32(_mm256_add_epi32(m2[2 + 8], m2[3 + 8]));
  m1[3 + 8] = _mm256_abs_epi32(_mm256_sub_epi32(m2[2 + 8], m2[3 + 8]));
  m1[4 + 8] = _mm256_abs_epi32(_mm256_add_epi32(m2[4 + 8], m2[5 + 8]));
  m1[5 + 8] = _mm256_abs_epi32(_mm256_sub_epi32(m2[4 + 8], m2[5 + 8]));
  m1[6 + 8] = _mm256_abs_epi32(_mm256_add_epi32(m2[6 + 8], m2[7 + 8]));
  m1[7 + 8] = _mm256_abs_epi32(_mm256_sub_epi32(m2[6 + 8], m2[7 + 8]));

  // sum up
  m1[0] = _mm256_add_epi32(m1[0], m1[1]);
  m1[1] = _mm256_add_epi32(m1[2], m1[3]);
  m1[2] = _mm256_add_epi32(m1[4], m1[5]);
  m1[3] = _mm256_add_epi32(m1[6], m1[7]);
  m1[4] = _mm256_add_epi32(m1[8], m1[9]);
  m1[5] = _mm256_add_epi32(m1[10], m1[11]);
  m1[6] = _mm256_add_epi32(m1[12], m1[13]);
  m1[7] = _mm256_add_epi32(m1[14], m1[15]);

  // sum up
  m1[0] = _mm256_add_epi32(m1[0], m1[1]);
  m1[1] = _mm256_add_epi32(m1[2], m1[3]);
  m1[2] = _mm256_add_epi32(m1[4], m1[5]);
  m1[3] = _mm256_add_epi32(m1[6], m1[7]);
  m1[0] = _mm256_add_epi32(m1[0], m1[1]);
  m1[1] = _mm256_add_epi32(m1[2], m1[3]);

  __m256i sum = _mm256_add_epi32(m1[0], m1[1]);
  sum = _mm256_hadd_epi32(sum, sum);
  sum = _mm256_hadd_epi32(sum, sum);
  sum = _mm256_add_epi32(sum, _mm256_permute2x128_si256(sum, sum, 0x11));

  Distortion sad2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum));
#if JVET_R0164_MEAN_SCALED_SATD
  sad2 -= absDc;
  sad2 += absDc >> 2;
#endif
  Distortion sad = (uint32_t)(sad2 / sqrt(16.0 * 8) * 2);

  return (sad);
}
#endif
#else

static constexpr uint64_t INV_SQRT_2 = 0xb504f334U;   // 2^32 / sqrt(2.0)

static uint32_t xCalcHAD4x4_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg,
                                const ptrdiff_t strideCur)
{
  __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadl_epi64( ( const __m128i* )&piOrg[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piOrg[0] ), _mm_setzero_si128() ) );
  __m128i r1 = (sizeof(Torg) > 1) ? (_mm_loadl_epi64((const __m128i *) &piOrg[strideOrg]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piOrg[strideOrg]), _mm_setzero_si128()));
  __m128i r2 = (sizeof(Torg) > 1) ? (_mm_loadl_epi64((const __m128i *) &piOrg[2 * strideOrg]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piOrg[2 * strideOrg]), _mm_setzero_si128()));
  __m128i r3 = (sizeof(Torg) > 1) ? (_mm_loadl_epi64((const __m128i *) &piOrg[3 * strideOrg]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piOrg[3 * strideOrg]), _mm_setzero_si128()));
  __m128i r4 = ( sizeof( Tcur ) > 1 ) ? ( _mm_loadl_epi64( ( const __m128i* )&piCur[0] ) ) : ( _mm_unpacklo_epi8( _mm_cvtsi32_si128( *(const int*)&piCur[0] ), _mm_setzero_si128() ) );
  __m128i r5 = (sizeof(Tcur) > 1) ? (_mm_loadl_epi64((const __m128i *) &piCur[strideCur]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piCur[strideCur]), _mm_setzero_si128()));
  __m128i r6 = (sizeof(Tcur) > 1) ? (_mm_loadl_epi64((const __m128i *) &piCur[2 * strideCur]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piCur[2 * strideCur]), _mm_setzero_si128()));
  __m128i r7 = (sizeof(Tcur) > 1) ? (_mm_loadl_epi64((const __m128i *) &piCur[3 * strideCur]))
                                  : (_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *) &piCur[3 * strideCur]), _mm_setzero_si128()));

  r0 = _mm_sub_epi16( r0, r4 );
  r1 = _mm_sub_epi16( r1, r5 );
  r2 = _mm_sub_epi16( r2, r6 );
  r3 = _mm_sub_epi16( r3, r7 );

  // first stage
  r4 = r0;
  r5 = r1;
  r0 = _mm_add_epi16( r0, r3 );
  r1 = _mm_add_epi16( r1, r2 );
  r4 = _mm_sub_epi16( r4, r3 );
  r5 = _mm_sub_epi16( r5, r2 );
  r2 = r0;
  r3 = r4;
  r0 = _mm_add_epi16( r0, r1 );
  r2 = _mm_sub_epi16( r2, r1 );
  r3 = _mm_sub_epi16( r3, r5 );
  r5 = _mm_add_epi16( r5, r4 );

  // shuffle - flip matrix for vertical transform
  r0 = _mm_unpacklo_epi16( r0, r5 );
  r2 = _mm_unpacklo_epi16( r2, r3 );
  r3 = r0;
  r0 = _mm_unpacklo_epi32( r0, r2 );
  r3 = _mm_unpackhi_epi32( r3, r2 );
  r1 = r0;
  r2 = r3;
  r1 = _mm_srli_si128( r1, 8 );
  r3 = _mm_srli_si128( r3, 8 );

  // second stage
  r4 = r0;
  r5 = r1;
  r0 = _mm_add_epi16( r0, r3 );
  r1 = _mm_add_epi16( r1, r2 );
  r4 = _mm_sub_epi16( r4, r3 );
  r5 = _mm_sub_epi16( r5, r2 );
  r2 = r0;
  r3 = r4;
  r0 = _mm_add_epi16( r0, r1 );
  r2 = _mm_sub_epi16( r2, r1 );
  r3 = _mm_sub_epi16( r3, r5 );
  r5 = _mm_add_epi16( r5, r4 );

  // abs
  __m128i Sum = _mm_abs_epi16( r0 );
#if JVET_R0164_MEAN_SCALED_SATD
  uint32_t absDc = _mm_cvtsi128_si32( Sum ) & 0x0000ffff;
#endif
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r2 ) );
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r3 ) );
  Sum = _mm_add_epi16( Sum, _mm_abs_epi16( r5 ) );

  __m128i iZero = _mm_set1_epi16( 0 );
  Sum = _mm_unpacklo_epi16( Sum, iZero );
  Sum = _mm_hadd_epi32( Sum, Sum );
  Sum = _mm_hadd_epi32( Sum, Sum );

  uint32_t sad = _mm_cvtsi128_si32( Sum );
#if JVET_R0164_MEAN_SCALED_SATD
  sad -= absDc;
  sad += absDc >> 2;
#endif
  sad = ( ( sad + 1 ) >> 1 );

  return sad;
}

//working up to 12-bit
static uint32_t xCalcHAD8x8_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg,
                                const ptrdiff_t strideCur, const int iBitDepth)
{
  __m128i m1[8][2], m2[8][2];

  for( int k = 0; k < 8; k++ )
  {
    __m128i r0 = ( sizeof( Torg ) > 1 ) ? ( _mm_loadu_si128( ( __m128i* )piOrg ) ) : ( _mm_unpacklo_epi8( _mm_loadl_epi64( ( const __m128i* )piOrg ), _mm_setzero_si128() ) );
    __m128i r1 = ( sizeof( Tcur ) > 1 ) ?
( _mm_lddqu_si128( ( __m128i* )piCur ) ) : ( _mm_unpacklo_epi8( _mm_loadl_epi64( ( const __m128i* )piCur ), _mm_setzero_si128() ) ); // th _mm_loadu_si128( (__m128i*)piCur ) m2[k][0] = _mm_sub_epi16( r0, r1 ); m2[k][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[k][0], 8 ) ); m2[k][0] = _mm_cvtepi16_epi32( m2[k][0] ); piCur += strideCur; piOrg += strideOrg; } for( int i = 0; i < 2; i++ ) { //horizontal m1[0][i] = _mm_add_epi32( m2[0][i], m2[4][i] ); m1[1][i] = _mm_add_epi32( m2[1][i], m2[5][i] ); m1[2][i] = _mm_add_epi32( m2[2][i], m2[6][i] ); m1[3][i] = _mm_add_epi32( m2[3][i], m2[7][i] ); m1[4][i] = _mm_sub_epi32( m2[0][i], m2[4][i] ); m1[5][i] = _mm_sub_epi32( m2[1][i], m2[5][i] ); m1[6][i] = _mm_sub_epi32( m2[2][i], m2[6][i] ); m1[7][i] = _mm_sub_epi32( m2[3][i], m2[7][i] ); m2[0][i] = _mm_add_epi32( m1[0][i], m1[2][i] ); m2[1][i] = _mm_add_epi32( m1[1][i], m1[3][i] ); m2[2][i] = _mm_sub_epi32( m1[0][i], m1[2][i] ); m2[3][i] = _mm_sub_epi32( m1[1][i], m1[3][i] ); m2[4][i] = _mm_add_epi32( m1[4][i], m1[6][i] ); m2[5][i] = _mm_add_epi32( m1[5][i], m1[7][i] ); m2[6][i] = _mm_sub_epi32( m1[4][i], m1[6][i] ); m2[7][i] = _mm_sub_epi32( m1[5][i], m1[7][i] ); m1[0][i] = _mm_add_epi32( m2[0][i], m2[1][i] ); m1[1][i] = _mm_sub_epi32( m2[0][i], m2[1][i] ); m1[2][i] = _mm_add_epi32( m2[2][i], m2[3][i] ); m1[3][i] = _mm_sub_epi32( m2[2][i], m2[3][i] ); m1[4][i] = _mm_add_epi32( m2[4][i], m2[5][i] ); m1[5][i] = _mm_sub_epi32( m2[4][i], m2[5][i] ); m1[6][i] = _mm_add_epi32( m2[6][i], m2[7][i] ); m1[7][i] = _mm_sub_epi32( m2[6][i], m2[7][i] ); m2[0][i] = _mm_unpacklo_epi32( m1[0][i], m1[1][i] ); m2[1][i] = _mm_unpacklo_epi32( m1[2][i], m1[3][i] ); m2[2][i] = _mm_unpackhi_epi32( m1[0][i], m1[1][i] ); m2[3][i] = _mm_unpackhi_epi32( m1[2][i], m1[3][i] ); m2[4][i] = _mm_unpacklo_epi32( m1[4][i], m1[5][i] ); m2[5][i] = _mm_unpacklo_epi32( m1[6][i], m1[7][i] ); m2[6][i] = _mm_unpackhi_epi32( m1[4][i], m1[5][i] ); m2[7][i] = _mm_unpackhi_epi32( m1[6][i], m1[7][i] ); m1[0][i] = _mm_unpacklo_epi64( m2[0][i], m2[1][i] ); m1[1][i] = _mm_unpackhi_epi64( m2[0][i], m2[1][i] ); m1[2][i] = _mm_unpacklo_epi64( m2[2][i], m2[3][i] ); m1[3][i] = _mm_unpackhi_epi64( m2[2][i], m2[3][i] ); m1[4][i] = _mm_unpacklo_epi64( m2[4][i], m2[5][i] ); m1[5][i] = _mm_unpackhi_epi64( m2[4][i], m2[5][i] ); m1[6][i] = _mm_unpacklo_epi64( m2[6][i], m2[7][i] ); m1[7][i] = _mm_unpackhi_epi64( m2[6][i], m2[7][i] ); } __m128i n1[8][2]; __m128i n2[8][2]; for( int i = 0; i < 8; i++ ) { int ii = i % 4; int ij = i >> 2; n2[i][0] = m1[ii ][ij]; n2[i][1] = m1[ii + 4][ij]; } for( int i = 0; i < 2; i++ ) { n1[0][i] = _mm_add_epi32( n2[0][i], n2[4][i] ); n1[1][i] = _mm_add_epi32( n2[1][i], n2[5][i] ); n1[2][i] = _mm_add_epi32( n2[2][i], n2[6][i] ); n1[3][i] = _mm_add_epi32( n2[3][i], n2[7][i] ); n1[4][i] = _mm_sub_epi32( n2[0][i], n2[4][i] ); n1[5][i] = _mm_sub_epi32( n2[1][i], n2[5][i] ); n1[6][i] = _mm_sub_epi32( n2[2][i], n2[6][i] ); n1[7][i] = _mm_sub_epi32( n2[3][i], n2[7][i] ); n2[0][i] = _mm_add_epi32( n1[0][i], n1[2][i] ); n2[1][i] = _mm_add_epi32( n1[1][i], n1[3][i] ); n2[2][i] = _mm_sub_epi32( n1[0][i], n1[2][i] ); n2[3][i] = _mm_sub_epi32( n1[1][i], n1[3][i] ); n2[4][i] = _mm_add_epi32( n1[4][i], n1[6][i] ); n2[5][i] = _mm_add_epi32( n1[5][i], n1[7][i] ); n2[6][i] = _mm_sub_epi32( n1[4][i], n1[6][i] ); n2[7][i] = _mm_sub_epi32( n1[5][i], n1[7][i] ); n1[0][i] = _mm_abs_epi32( _mm_add_epi32( n2[0][i], n2[1][i] ) ); n1[1][i] = _mm_abs_epi32( _mm_sub_epi32( n2[0][i], n2[1][i] ) ); n1[2][i] = _mm_abs_epi32( _mm_add_epi32( n2[2][i], n2[3][i] ) ); 
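  // SATD of the 8x8 block = sum of the absolute Hadamard coefficients collected in n1[][]
  // here and accumulated after this loop. Under JVET_R0164_MEAN_SCALED_SATD the DC
  // coefficient (lane 0 of n1[0][0]) is re-weighted to one quarter, i.e. effectively
  // sad = sum|coeff| - |DC| + (|DC| >> 2), before the final ( sad + 2 ) >> 2 normalisation.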
n1[3][i] = _mm_abs_epi32( _mm_sub_epi32( n2[2][i], n2[3][i] ) ); n1[4][i] = _mm_abs_epi32( _mm_add_epi32( n2[4][i], n2[5][i] ) ); n1[5][i] = _mm_abs_epi32( _mm_sub_epi32( n2[4][i], n2[5][i] ) ); n1[6][i] = _mm_abs_epi32( _mm_add_epi32( n2[6][i], n2[7][i] ) ); n1[7][i] = _mm_abs_epi32( _mm_sub_epi32( n2[6][i], n2[7][i] ) ); } for( int i = 0; i < 8; i++ ) { m1[i][0] = _mm_add_epi32( n1[i][0], n1[i][1] ); } m1[0][0] = _mm_add_epi32( m1[0][0], m1[1][0] ); m1[2][0] = _mm_add_epi32( m1[2][0], m1[3][0] ); m1[4][0] = _mm_add_epi32( m1[4][0], m1[5][0] ); m1[6][0] = _mm_add_epi32( m1[6][0], m1[7][0] ); m1[0][0] = _mm_add_epi32( m1[0][0], m1[2][0] ); m1[4][0] = _mm_add_epi32( m1[4][0], m1[6][0] ); __m128i sum = _mm_add_epi32(m1[0][0], m1[4][0]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); uint32_t sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = _mm_cvtsi128_si32( n1[0][0] ); sad -= absDc; sad += absDc >> 2; #endif sad = ( ( sad + 2 ) >> 2 ); return sad; } //working up to 12-bit static uint32_t xCalcHAD16x8_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { __m128i m1[16][2][2], m2[16][2][2]; __m128i sum = _mm_setzero_si128(); for( int l = 0; l < 2; l++ ) { const Torg *piOrgPtr = piOrg + l*8; const Tcur *piCurPtr = piCur + l*8; for( int k = 0; k < 8; k++ ) { __m128i r0 = _mm_loadu_si128( (__m128i*) piOrgPtr ); __m128i r1 = _mm_lddqu_si128( (__m128i*) piCurPtr ); m2[k][l][0] = _mm_sub_epi16( r0, r1 ); m2[k][l][1] = _mm_cvtepi16_epi32( _mm_srli_si128( m2[k][l][0], 8 ) ); m2[k][l][0] = _mm_cvtepi16_epi32( m2[k][l][0] ); piCurPtr += strideCur; piOrgPtr += strideOrg; } for( int i = 0; i < 2; i++ ) { //vertical m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[4][l][i] ); m1[1][l][i] = _mm_add_epi32( m2[1][l][i], m2[5][l][i] ); m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[6][l][i] ); m1[3][l][i] = _mm_add_epi32( m2[3][l][i], m2[7][l][i] ); m1[4][l][i] = _mm_sub_epi32( m2[0][l][i], m2[4][l][i] ); m1[5][l][i] = _mm_sub_epi32( m2[1][l][i], m2[5][l][i] ); m1[6][l][i] = _mm_sub_epi32( m2[2][l][i], m2[6][l][i] ); m1[7][l][i] = _mm_sub_epi32( m2[3][l][i], m2[7][l][i] ); m2[0][l][i] = _mm_add_epi32( m1[0][l][i], m1[2][l][i] ); m2[1][l][i] = _mm_add_epi32( m1[1][l][i], m1[3][l][i] ); m2[2][l][i] = _mm_sub_epi32( m1[0][l][i], m1[2][l][i] ); m2[3][l][i] = _mm_sub_epi32( m1[1][l][i], m1[3][l][i] ); m2[4][l][i] = _mm_add_epi32( m1[4][l][i], m1[6][l][i] ); m2[5][l][i] = _mm_add_epi32( m1[5][l][i], m1[7][l][i] ); m2[6][l][i] = _mm_sub_epi32( m1[4][l][i], m1[6][l][i] ); m2[7][l][i] = _mm_sub_epi32( m1[5][l][i], m1[7][l][i] ); m1[0][l][i] = _mm_add_epi32( m2[0][l][i], m2[1][l][i] ); m1[1][l][i] = _mm_sub_epi32( m2[0][l][i], m2[1][l][i] ); m1[2][l][i] = _mm_add_epi32( m2[2][l][i], m2[3][l][i] ); m1[3][l][i] = _mm_sub_epi32( m2[2][l][i], m2[3][l][i] ); m1[4][l][i] = _mm_add_epi32( m2[4][l][i], m2[5][l][i] ); m1[5][l][i] = _mm_sub_epi32( m2[4][l][i], m2[5][l][i] ); m1[6][l][i] = _mm_add_epi32( m2[6][l][i], m2[7][l][i] ); m1[7][l][i] = _mm_sub_epi32( m2[6][l][i], m2[7][l][i] ); } } // 4 x 8x4 blocks // 0 1 // 2 3 #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = 0; #endif // transpose and do horizontal in two steps for( int l = 0; l < 2; l++ ) { int off = l * 4; __m128i n1[16]; __m128i n2[16]; m2[0][0][0] = _mm_unpacklo_epi32( m1[0 + off][0][0], m1[1 + off][0][0] ); m2[1][0][0] = _mm_unpacklo_epi32( m1[2 + off][0][0], m1[3 + off][0][0] ); m2[2][0][0] = _mm_unpackhi_epi32( m1[0 + off][0][0], m1[1 + off][0][0] ); 
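  // As the "4 x 8x4 blocks" note above indicates, the horizontal pass is done in two steps:
  // for l = 0 this loop handles the vertically transformed rows 0-3 (sub-blocks 0/1), for
  // l = 1 rows 4-7 (sub-blocks 2/3). Each 8x4 half is transposed into 4x8 form and the
  // 16-point horizontal butterflies are then applied on the n1[]/n2[] registers.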
m2[3][0][0] = _mm_unpackhi_epi32( m1[2 + off][0][0], m1[3 + off][0][0] ); m2[0][0][1] = _mm_unpacklo_epi32( m1[0 + off][0][1], m1[1 + off][0][1] ); m2[1][0][1] = _mm_unpacklo_epi32( m1[2 + off][0][1], m1[3 + off][0][1] ); m2[2][0][1] = _mm_unpackhi_epi32( m1[0 + off][0][1], m1[1 + off][0][1] ); m2[3][0][1] = _mm_unpackhi_epi32( m1[2 + off][0][1], m1[3 + off][0][1] ); n1[0] = _mm_unpacklo_epi64( m2[0][0][0], m2[1][0][0] ); n1[1] = _mm_unpackhi_epi64( m2[0][0][0], m2[1][0][0] ); n1[2] = _mm_unpacklo_epi64( m2[2][0][0], m2[3][0][0] ); n1[3] = _mm_unpackhi_epi64( m2[2][0][0], m2[3][0][0] ); n1[4] = _mm_unpacklo_epi64( m2[0][0][1], m2[1][0][1] ); n1[5] = _mm_unpackhi_epi64( m2[0][0][1], m2[1][0][1] ); n1[6] = _mm_unpacklo_epi64( m2[2][0][1], m2[3][0][1] ); n1[7] = _mm_unpackhi_epi64( m2[2][0][1], m2[3][0][1] ); // transpose 8x4 -> 4x8, block 1(3) m2[8+0][0][0] = _mm_unpacklo_epi32( m1[0 + off][1][0], m1[1 + off][1][0] ); m2[8+1][0][0] = _mm_unpacklo_epi32( m1[2 + off][1][0], m1[3 + off][1][0] ); m2[8+2][0][0] = _mm_unpackhi_epi32( m1[0 + off][1][0], m1[1 + off][1][0] ); m2[8+3][0][0] = _mm_unpackhi_epi32( m1[2 + off][1][0], m1[3 + off][1][0] ); m2[8+0][0][1] = _mm_unpacklo_epi32( m1[0 + off][1][1], m1[1 + off][1][1] ); m2[8+1][0][1] = _mm_unpacklo_epi32( m1[2 + off][1][1], m1[3 + off][1][1] ); m2[8+2][0][1] = _mm_unpackhi_epi32( m1[0 + off][1][1], m1[1 + off][1][1] ); m2[8+3][0][1] = _mm_unpackhi_epi32( m1[2 + off][1][1], m1[3 + off][1][1] ); n1[8+0] = _mm_unpacklo_epi64( m2[8+0][0][0], m2[8+1][0][0] ); n1[8+1] = _mm_unpackhi_epi64( m2[8+0][0][0], m2[8+1][0][0] ); n1[8+2] = _mm_unpacklo_epi64( m2[8+2][0][0], m2[8+3][0][0] ); n1[8+3] = _mm_unpackhi_epi64( m2[8+2][0][0], m2[8+3][0][0] ); n1[8+4] = _mm_unpacklo_epi64( m2[8+0][0][1], m2[8+1][0][1] ); n1[8+5] = _mm_unpackhi_epi64( m2[8+0][0][1], m2[8+1][0][1] ); n1[8+6] = _mm_unpacklo_epi64( m2[8+2][0][1], m2[8+3][0][1] ); n1[8+7] = _mm_unpackhi_epi64( m2[8+2][0][1], m2[8+3][0][1] ); n2[0] = _mm_add_epi32( n1[0], n1[8] ); n2[1] = _mm_add_epi32( n1[1], n1[9] ); n2[2] = _mm_add_epi32( n1[2], n1[10] ); n2[3] = _mm_add_epi32( n1[3], n1[11] ); n2[4] = _mm_add_epi32( n1[4], n1[12] ); n2[5] = _mm_add_epi32( n1[5], n1[13] ); n2[6] = _mm_add_epi32( n1[6], n1[14] ); n2[7] = _mm_add_epi32( n1[7], n1[15] ); n2[8] = _mm_sub_epi32( n1[0], n1[8] ); n2[9] = _mm_sub_epi32( n1[1], n1[9] ); n2[10] = _mm_sub_epi32( n1[2], n1[10] ); n2[11] = _mm_sub_epi32( n1[3], n1[11] ); n2[12] = _mm_sub_epi32( n1[4], n1[12] ); n2[13] = _mm_sub_epi32( n1[5], n1[13] ); n2[14] = _mm_sub_epi32( n1[6], n1[14] ); n2[15] = _mm_sub_epi32( n1[7], n1[15] ); n1[0] = _mm_add_epi32( n2[0], n2[4] ); n1[1] = _mm_add_epi32( n2[1], n2[5] ); n1[2] = _mm_add_epi32( n2[2], n2[6] ); n1[3] = _mm_add_epi32( n2[3], n2[7] ); n1[4] = _mm_sub_epi32( n2[0], n2[4] ); n1[5] = _mm_sub_epi32( n2[1], n2[5] ); n1[6] = _mm_sub_epi32( n2[2], n2[6] ); n1[7] = _mm_sub_epi32( n2[3], n2[7] ); n1[8] = _mm_add_epi32( n2[8], n2[12] ); n1[9] = _mm_add_epi32( n2[9], n2[13] ); n1[10] = _mm_add_epi32( n2[10], n2[14] ); n1[11] = _mm_add_epi32( n2[11], n2[15] ); n1[12] = _mm_sub_epi32( n2[8], n2[12] ); n1[13] = _mm_sub_epi32( n2[9], n2[13] ); n1[14] = _mm_sub_epi32( n2[10], n2[14] ); n1[15] = _mm_sub_epi32( n2[11], n2[15] ); n2[0] = _mm_add_epi32( n1[0], n1[2] ); n2[1] = _mm_add_epi32( n1[1], n1[3] ); n2[2] = _mm_sub_epi32( n1[0], n1[2] ); n2[3] = _mm_sub_epi32( n1[1], n1[3] ); n2[4] = _mm_add_epi32( n1[4], n1[6] ); n2[5] = _mm_add_epi32( n1[5], n1[7] ); n2[6] = _mm_sub_epi32( n1[4], n1[6] ); n2[7] = _mm_sub_epi32( n1[5], n1[7] ); 
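  // Note on the final normalisation of this function: the 16x8 SATD is scaled by
  // 2 / sqrt(16 * 8) = 1 / (4 * sqrt(2)). "sad * INV_SQRT_2 >> 32" approximates the
  // division by sqrt(2) in fixed point (INV_SQRT_2 = 0xb504f334 = round(2^32 / sqrt(2))),
  // and the subsequent "sad >>= 2" provides the division by 4, matching the floating-point
  // "sad / sqrt(16.0 * 8) * 2" form used by the high-bit-depth variants.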
n2[8] = _mm_add_epi32( n1[8], n1[10] ); n2[9] = _mm_add_epi32( n1[9], n1[11] ); n2[10] = _mm_sub_epi32( n1[8], n1[10] ); n2[11] = _mm_sub_epi32( n1[9], n1[11] ); n2[12] = _mm_add_epi32( n1[12], n1[14] ); n2[13] = _mm_add_epi32( n1[13], n1[15] ); n2[14] = _mm_sub_epi32( n1[12], n1[14] ); n2[15] = _mm_sub_epi32( n1[13], n1[15] ); n1[0] = _mm_abs_epi32( _mm_add_epi32( n2[0], n2[1] ) ); n1[1] = _mm_abs_epi32( _mm_sub_epi32( n2[0], n2[1] ) ); n1[2] = _mm_abs_epi32( _mm_add_epi32( n2[2], n2[3] ) ); n1[3] = _mm_abs_epi32( _mm_sub_epi32( n2[2], n2[3] ) ); n1[4] = _mm_abs_epi32( _mm_add_epi32( n2[4], n2[5] ) ); n1[5] = _mm_abs_epi32( _mm_sub_epi32( n2[4], n2[5] ) ); n1[6] = _mm_abs_epi32( _mm_add_epi32( n2[6], n2[7] ) ); n1[7] = _mm_abs_epi32( _mm_sub_epi32( n2[6], n2[7] ) ); n1[8] = _mm_abs_epi32( _mm_add_epi32( n2[8], n2[9] ) ); n1[9] = _mm_abs_epi32( _mm_sub_epi32( n2[8], n2[9] ) ); n1[10] = _mm_abs_epi32( _mm_add_epi32( n2[10], n2[11] ) ); n1[11] = _mm_abs_epi32( _mm_sub_epi32( n2[10], n2[11] ) ); n1[12] = _mm_abs_epi32( _mm_add_epi32( n2[12], n2[13] ) ); n1[13] = _mm_abs_epi32( _mm_sub_epi32( n2[12], n2[13] ) ); n1[14] = _mm_abs_epi32( _mm_add_epi32( n2[14], n2[15] ) ); n1[15] = _mm_abs_epi32( _mm_sub_epi32( n2[14], n2[15] ) ); #if JVET_R0164_MEAN_SCALED_SATD if (l == 0) absDc = _mm_cvtsi128_si32( n1[0] ); #endif // sum up n1[0] = _mm_add_epi32( n1[0], n1[1] ); n1[2] = _mm_add_epi32( n1[2], n1[3] ); n1[4] = _mm_add_epi32( n1[4], n1[5] ); n1[6] = _mm_add_epi32( n1[6], n1[7] ); n1[8] = _mm_add_epi32( n1[8], n1[9] ); n1[10] = _mm_add_epi32( n1[10], n1[11] ); n1[12] = _mm_add_epi32( n1[12], n1[13] ); n1[14] = _mm_add_epi32( n1[14], n1[15] ); n1[0] = _mm_add_epi32( n1[0], n1[2] ); n1[4] = _mm_add_epi32( n1[4], n1[6] ); n1[8] = _mm_add_epi32( n1[8], n1[10] ); n1[12] = _mm_add_epi32( n1[12], n1[14] ); n1[0] = _mm_add_epi32( n1[0], n1[4] ); n1[8] = _mm_add_epi32( n1[8], n1[12] ); n1[0] = _mm_add_epi32( n1[0], n1[8] ); sum = _mm_add_epi32(sum, n1[0]); } sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); uint32_t sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = sad * INV_SQRT_2 >> 32; sad >>= 2; return sad; } //working up to 12-bit static uint32_t xCalcHAD8x16_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { __m128i m1[2][16], m2[2][16]; __m128i sum = _mm_setzero_si128(); for( int k = 0; k < 16; k++ ) { __m128i r0 =_mm_loadu_si128( (__m128i*)piOrg ); __m128i r1 =_mm_lddqu_si128( (__m128i*)piCur ); m1[0][k] = _mm_sub_epi16( r0, r1 ); m1[1][k] = _mm_cvtepi16_epi32( _mm_srli_si128( m1[0][k], 8 ) ); m1[0][k] = _mm_cvtepi16_epi32( m1[0][k] ); piCur += strideCur; piOrg += strideOrg; } for( int i = 0; i < 2; i++ ) { // vertical m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 8] ); m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 9] ); m2[i][ 2] = _mm_add_epi32( m1[i][ 2], m1[i][10] ); m2[i][ 3] = _mm_add_epi32( m1[i][ 3], m1[i][11] ); m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][12] ); m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][13] ); m2[i][ 6] = _mm_add_epi32( m1[i][ 6], m1[i][14] ); m2[i][ 7] = _mm_add_epi32( m1[i][ 7], m1[i][15] ); m2[i][ 8] = _mm_sub_epi32( m1[i][ 0], m1[i][ 8] ); m2[i][ 9] = _mm_sub_epi32( m1[i][ 1], m1[i][ 9] ); m2[i][10] = _mm_sub_epi32( m1[i][ 2], m1[i][10] ); m2[i][11] = _mm_sub_epi32( m1[i][ 3], m1[i][11] ); m2[i][12] = _mm_sub_epi32( m1[i][ 4], m1[i][12] ); m2[i][13] = _mm_sub_epi32( m1[i][ 5], m1[i][13] ); m2[i][14] = _mm_sub_epi32( m1[i][ 6], 
m1[i][14] ); m2[i][15] = _mm_sub_epi32( m1[i][ 7], m1[i][15] ); m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 4] ); m1[i][ 1] = _mm_add_epi32( m2[i][ 1], m2[i][ 5] ); m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 6] ); m1[i][ 3] = _mm_add_epi32( m2[i][ 3], m2[i][ 7] ); m1[i][ 4] = _mm_sub_epi32( m2[i][ 0], m2[i][ 4] ); m1[i][ 5] = _mm_sub_epi32( m2[i][ 1], m2[i][ 5] ); m1[i][ 6] = _mm_sub_epi32( m2[i][ 2], m2[i][ 6] ); m1[i][ 7] = _mm_sub_epi32( m2[i][ 3], m2[i][ 7] ); m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][12] ); m1[i][ 9] = _mm_add_epi32( m2[i][ 9], m2[i][13] ); m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][14] ); m1[i][11] = _mm_add_epi32( m2[i][11], m2[i][15] ); m1[i][12] = _mm_sub_epi32( m2[i][ 8], m2[i][12] ); m1[i][13] = _mm_sub_epi32( m2[i][ 9], m2[i][13] ); m1[i][14] = _mm_sub_epi32( m2[i][10], m2[i][14] ); m1[i][15] = _mm_sub_epi32( m2[i][11], m2[i][15] ); m2[i][ 0] = _mm_add_epi32( m1[i][ 0], m1[i][ 2] ); m2[i][ 1] = _mm_add_epi32( m1[i][ 1], m1[i][ 3] ); m2[i][ 2] = _mm_sub_epi32( m1[i][ 0], m1[i][ 2] ); m2[i][ 3] = _mm_sub_epi32( m1[i][ 1], m1[i][ 3] ); m2[i][ 4] = _mm_add_epi32( m1[i][ 4], m1[i][ 6] ); m2[i][ 5] = _mm_add_epi32( m1[i][ 5], m1[i][ 7] ); m2[i][ 6] = _mm_sub_epi32( m1[i][ 4], m1[i][ 6] ); m2[i][ 7] = _mm_sub_epi32( m1[i][ 5], m1[i][ 7] ); m2[i][ 8] = _mm_add_epi32( m1[i][ 8], m1[i][10] ); m2[i][ 9] = _mm_add_epi32( m1[i][ 9], m1[i][11] ); m2[i][10] = _mm_sub_epi32( m1[i][ 8], m1[i][10] ); m2[i][11] = _mm_sub_epi32( m1[i][ 9], m1[i][11] ); m2[i][12] = _mm_add_epi32( m1[i][12], m1[i][14] ); m2[i][13] = _mm_add_epi32( m1[i][13], m1[i][15] ); m2[i][14] = _mm_sub_epi32( m1[i][12], m1[i][14] ); m2[i][15] = _mm_sub_epi32( m1[i][13], m1[i][15] ); m1[i][ 0] = _mm_add_epi32( m2[i][ 0], m2[i][ 1] ); m1[i][ 1] = _mm_sub_epi32( m2[i][ 0], m2[i][ 1] ); m1[i][ 2] = _mm_add_epi32( m2[i][ 2], m2[i][ 3] ); m1[i][ 3] = _mm_sub_epi32( m2[i][ 2], m2[i][ 3] ); m1[i][ 4] = _mm_add_epi32( m2[i][ 4], m2[i][ 5] ); m1[i][ 5] = _mm_sub_epi32( m2[i][ 4], m2[i][ 5] ); m1[i][ 6] = _mm_add_epi32( m2[i][ 6], m2[i][ 7] ); m1[i][ 7] = _mm_sub_epi32( m2[i][ 6], m2[i][ 7] ); m1[i][ 8] = _mm_add_epi32( m2[i][ 8], m2[i][ 9] ); m1[i][ 9] = _mm_sub_epi32( m2[i][ 8], m2[i][ 9] ); m1[i][10] = _mm_add_epi32( m2[i][10], m2[i][11] ); m1[i][11] = _mm_sub_epi32( m2[i][10], m2[i][11] ); m1[i][12] = _mm_add_epi32( m2[i][12], m2[i][13] ); m1[i][13] = _mm_sub_epi32( m2[i][12], m2[i][13] ); m1[i][14] = _mm_add_epi32( m2[i][14], m2[i][15] ); m1[i][15] = _mm_sub_epi32( m2[i][14], m2[i][15] ); } // process horizontal in two steps ( 2 x 8x8 blocks ) for( int l = 0; l < 4; l++ ) { int off = l * 4; for( int i = 0; i < 2; i++ ) { // transpose 4x4 m2[i][0 + off] = _mm_unpacklo_epi32( m1[i][0 + off], m1[i][1 + off] ); m2[i][1 + off] = _mm_unpackhi_epi32( m1[i][0 + off], m1[i][1 + off] ); m2[i][2 + off] = _mm_unpacklo_epi32( m1[i][2 + off], m1[i][3 + off] ); m2[i][3 + off] = _mm_unpackhi_epi32( m1[i][2 + off], m1[i][3 + off] ); m1[i][0 + off] = _mm_unpacklo_epi64( m2[i][0 + off], m2[i][2 + off] ); m1[i][1 + off] = _mm_unpackhi_epi64( m2[i][0 + off], m2[i][2 + off] ); m1[i][2 + off] = _mm_unpacklo_epi64( m2[i][1 + off], m2[i][3 + off] ); m1[i][3 + off] = _mm_unpackhi_epi64( m2[i][1 + off], m2[i][3 + off] ); } } #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = 0; #endif for( int l = 0; l < 2; l++ ) { int off = l * 8; __m128i n1[2][8]; __m128i n2[2][8]; for( int i = 0; i < 8; i++ ) { int ii = i % 4; int ij = i >> 2; n2[0][i] = m1[ij][off + ii ]; n2[1][i] = m1[ij][off + ii + 4]; } for( int i = 0; i < 2; i++ ) { n1[i][0] = 
_mm_add_epi32( n2[i][0], n2[i][4] ); n1[i][1] = _mm_add_epi32( n2[i][1], n2[i][5] ); n1[i][2] = _mm_add_epi32( n2[i][2], n2[i][6] ); n1[i][3] = _mm_add_epi32( n2[i][3], n2[i][7] ); n1[i][4] = _mm_sub_epi32( n2[i][0], n2[i][4] ); n1[i][5] = _mm_sub_epi32( n2[i][1], n2[i][5] ); n1[i][6] = _mm_sub_epi32( n2[i][2], n2[i][6] ); n1[i][7] = _mm_sub_epi32( n2[i][3], n2[i][7] ); n2[i][0] = _mm_add_epi32( n1[i][0], n1[i][2] ); n2[i][1] = _mm_add_epi32( n1[i][1], n1[i][3] ); n2[i][2] = _mm_sub_epi32( n1[i][0], n1[i][2] ); n2[i][3] = _mm_sub_epi32( n1[i][1], n1[i][3] ); n2[i][4] = _mm_add_epi32( n1[i][4], n1[i][6] ); n2[i][5] = _mm_add_epi32( n1[i][5], n1[i][7] ); n2[i][6] = _mm_sub_epi32( n1[i][4], n1[i][6] ); n2[i][7] = _mm_sub_epi32( n1[i][5], n1[i][7] ); n1[i][0] = _mm_abs_epi32( _mm_add_epi32( n2[i][0], n2[i][1] ) ); n1[i][1] = _mm_abs_epi32( _mm_sub_epi32( n2[i][0], n2[i][1] ) ); n1[i][2] = _mm_abs_epi32( _mm_add_epi32( n2[i][2], n2[i][3] ) ); n1[i][3] = _mm_abs_epi32( _mm_sub_epi32( n2[i][2], n2[i][3] ) ); n1[i][4] = _mm_abs_epi32( _mm_add_epi32( n2[i][4], n2[i][5] ) ); n1[i][5] = _mm_abs_epi32( _mm_sub_epi32( n2[i][4], n2[i][5] ) ); n1[i][6] = _mm_abs_epi32( _mm_add_epi32( n2[i][6], n2[i][7] ) ); n1[i][7] = _mm_abs_epi32( _mm_sub_epi32( n2[i][6], n2[i][7] ) ); #if JVET_R0164_MEAN_SCALED_SATD if ( l + i == 0 ) { absDc = _mm_cvtsi128_si32( n1[i][0] ); } #endif } for( int i = 0; i < 8; i++ ) { n2[0][i] = _mm_add_epi32( n1[0][i], n1[1][i] ); } n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][1] ); n2[0][2] = _mm_add_epi32( n2[0][2], n2[0][3] ); n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][5] ); n2[0][6] = _mm_add_epi32( n2[0][6], n2[0][7] ); n2[0][0] = _mm_add_epi32( n2[0][0], n2[0][2] ); n2[0][4] = _mm_add_epi32( n2[0][4], n2[0][6] ); sum = _mm_add_epi32(sum, _mm_add_epi32(n2[0][0], n2[0][4])); } sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); uint32_t sad = _mm_cvtsi128_si32(sum); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = sad * INV_SQRT_2 >> 32; sad >>= 2; return sad; } template static uint32_t xCalcHAD8x4_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { __m128i m1[8], m2[8]; __m128i vzero = _mm_setzero_si128(); for( int k = 0; k < 4; k++ ) { __m128i r0 = (sizeof( Torg ) > 1) ? (_mm_loadu_si128 ( (__m128i*)piOrg )) : (_mm_unpacklo_epi8( _mm_loadl_epi64( (const __m128i*)piOrg ), _mm_setzero_si128() )); __m128i r1 = (sizeof( Tcur ) > 1) ? 
(_mm_lddqu_si128( (__m128i*)piCur )) : (_mm_unpacklo_epi8( _mm_loadl_epi64( (const __m128i*)piCur ), _mm_setzero_si128() )); // th _mm_loadu_si128( (__m128i*)piCur ) m1[k] = _mm_sub_epi16( r0, r1 ); piCur += strideCur; piOrg += strideOrg; } //vertical m2[0] = _mm_add_epi16( m1[0], m1[2] ); m2[1] = _mm_add_epi16( m1[1], m1[3] ); m2[2] = _mm_sub_epi16( m1[0], m1[2] ); m2[3] = _mm_sub_epi16( m1[1], m1[3] ); m1[0] = _mm_add_epi16( m2[0], m2[1] ); m1[1] = _mm_sub_epi16( m2[0], m2[1] ); m1[2] = _mm_add_epi16( m2[2], m2[3] ); m1[3] = _mm_sub_epi16( m2[2], m2[3] ); // transpose, partially { m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] ); m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] ); m2[2] = _mm_unpackhi_epi16( m1[0], m1[1] ); m2[3] = _mm_unpackhi_epi16( m1[2], m1[3] ); m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] ); m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] ); m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] ); m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] ); } // horizontal if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ ) { // finish transpose m2[0] = _mm_unpacklo_epi64( m1[0], vzero ); m2[1] = _mm_unpackhi_epi64( m1[0], vzero ); m2[2] = _mm_unpacklo_epi64( m1[1], vzero ); m2[3] = _mm_unpackhi_epi64( m1[1], vzero ); m2[4] = _mm_unpacklo_epi64( m1[2], vzero ); m2[5] = _mm_unpackhi_epi64( m1[2], vzero ); m2[6] = _mm_unpacklo_epi64( m1[3], vzero ); m2[7] = _mm_unpackhi_epi64( m1[3], vzero ); for( int i = 0; i < 8; i++ ) { m2[i] = _mm_cvtepi16_epi32( m2[i] ); } m1[0] = _mm_add_epi32( m2[0], m2[4] ); m1[1] = _mm_add_epi32( m2[1], m2[5] ); m1[2] = _mm_add_epi32( m2[2], m2[6] ); m1[3] = _mm_add_epi32( m2[3], m2[7] ); m1[4] = _mm_sub_epi32( m2[0], m2[4] ); m1[5] = _mm_sub_epi32( m2[1], m2[5] ); m1[6] = _mm_sub_epi32( m2[2], m2[6] ); m1[7] = _mm_sub_epi32( m2[3], m2[7] ); m2[0] = _mm_add_epi32( m1[0], m1[2] ); m2[1] = _mm_add_epi32( m1[1], m1[3] ); m2[2] = _mm_sub_epi32( m1[0], m1[2] ); m2[3] = _mm_sub_epi32( m1[1], m1[3] ); m2[4] = _mm_add_epi32( m1[4], m1[6] ); m2[5] = _mm_add_epi32( m1[5], m1[7] ); m2[6] = _mm_sub_epi32( m1[4], m1[6] ); m2[7] = _mm_sub_epi32( m1[5], m1[7] ); m1[0] = _mm_abs_epi32( _mm_add_epi32( m2[0], m2[1] ) ); m1[1] = _mm_abs_epi32( _mm_sub_epi32( m2[0], m2[1] ) ); m1[2] = _mm_abs_epi32( _mm_add_epi32( m2[2], m2[3] ) ); m1[3] = _mm_abs_epi32( _mm_sub_epi32( m2[2], m2[3] ) ); m1[4] = _mm_abs_epi32( _mm_add_epi32( m2[4], m2[5] ) ); m1[5] = _mm_abs_epi32( _mm_sub_epi32( m2[4], m2[5] ) ); m1[6] = _mm_abs_epi32( _mm_add_epi32( m2[6], m2[7] ) ); m1[7] = _mm_abs_epi32( _mm_sub_epi32( m2[6], m2[7] ) ); } else { m2[0] = _mm_add_epi16( m1[0], m1[2] ); m2[1] = _mm_add_epi16( m1[1], m1[3] ); m2[2] = _mm_sub_epi16( m1[0], m1[2] ); m2[3] = _mm_sub_epi16( m1[1], m1[3] ); m1[0] = _mm_add_epi16( m2[0], m2[1] ); m1[1] = _mm_sub_epi16( m2[0], m2[1] ); m1[2] = _mm_add_epi16( m2[2], m2[3] ); m1[3] = _mm_sub_epi16( m2[2], m2[3] ); // finish transpose m2[0] = _mm_unpacklo_epi64( m1[0], vzero ); m2[1] = _mm_unpackhi_epi64( m1[0], vzero ); m2[2] = _mm_unpacklo_epi64( m1[1], vzero ); m2[3] = _mm_unpackhi_epi64( m1[1], vzero ); m2[4] = _mm_unpacklo_epi64( m1[2], vzero ); m2[5] = _mm_unpackhi_epi64( m1[2], vzero ); m2[6] = _mm_unpacklo_epi64( m1[3], vzero ); m2[7] = _mm_unpackhi_epi64( m1[3], vzero ); m1[0] = _mm_abs_epi16( _mm_add_epi16( m2[0], m2[1] ) ); m1[1] = _mm_abs_epi16( _mm_sub_epi16( m2[0], m2[1] ) ); m1[2] = _mm_abs_epi16( _mm_add_epi16( m2[2], m2[3] ) ); m1[3] = _mm_abs_epi16( _mm_sub_epi16( m2[2], m2[3] ) ); m1[4] = _mm_abs_epi16( _mm_add_epi16( m2[4], m2[5] ) ); m1[5] = _mm_abs_epi16( _mm_sub_epi16( 
m2[4], m2[5] ) ); m1[6] = _mm_abs_epi16( _mm_add_epi16( m2[6], m2[7] ) ); m1[7] = _mm_abs_epi16( _mm_sub_epi16( m2[6], m2[7] ) ); for( int i = 0; i < 8; i++ ) { m1[i] = _mm_unpacklo_epi16( m1[i], vzero ); } } #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = _mm_cvtsi128_si32( m1[0] ); #endif m1[0] = _mm_add_epi32( m1[0], m1[1] ); m1[1] = _mm_add_epi32( m1[2], m1[3] ); m1[2] = _mm_add_epi32( m1[4], m1[5] ); m1[3] = _mm_add_epi32( m1[6], m1[7] ); m1[0] = _mm_add_epi32( m1[0], m1[1] ); m1[1] = _mm_add_epi32( m1[2], m1[3] ); __m128i sum = _mm_add_epi32(m1[0], m1[1]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); uint32_t sad = _mm_cvtsi128_si32(sum); //sad = ((sad + 2) >> 2); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = sad * INV_SQRT_2 >> 32; sad >>= 1; return sad; } static uint32_t xCalcHAD4x8_SSE(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { __m128i m1[8], m2[8]; for( int k = 0; k < 8; k++ ) { __m128i r0 = (sizeof( Torg ) > 1) ? (_mm_loadl_epi64( (__m128i*)piOrg )) : (_mm_cvtsi32_si128( *(const int*)piOrg )); __m128i r1 = (sizeof( Tcur ) > 1) ? (_mm_loadl_epi64( (__m128i*)piCur )) : (_mm_cvtsi32_si128( *(const int*)piCur )); m2[k] = _mm_sub_epi16( r0, r1 ); piCur += strideCur; piOrg += strideOrg; } // vertical m1[0] = _mm_add_epi16( m2[0], m2[4] ); m1[1] = _mm_add_epi16( m2[1], m2[5] ); m1[2] = _mm_add_epi16( m2[2], m2[6] ); m1[3] = _mm_add_epi16( m2[3], m2[7] ); m1[4] = _mm_sub_epi16( m2[0], m2[4] ); m1[5] = _mm_sub_epi16( m2[1], m2[5] ); m1[6] = _mm_sub_epi16( m2[2], m2[6] ); m1[7] = _mm_sub_epi16( m2[3], m2[7] ); m2[0] = _mm_add_epi16( m1[0], m1[2] ); m2[1] = _mm_add_epi16( m1[1], m1[3] ); m2[2] = _mm_sub_epi16( m1[0], m1[2] ); m2[3] = _mm_sub_epi16( m1[1], m1[3] ); m2[4] = _mm_add_epi16( m1[4], m1[6] ); m2[5] = _mm_add_epi16( m1[5], m1[7] ); m2[6] = _mm_sub_epi16( m1[4], m1[6] ); m2[7] = _mm_sub_epi16( m1[5], m1[7] ); m1[0] = _mm_add_epi16( m2[0], m2[1] ); m1[1] = _mm_sub_epi16( m2[0], m2[1] ); m1[2] = _mm_add_epi16( m2[2], m2[3] ); m1[3] = _mm_sub_epi16( m2[2], m2[3] ); m1[4] = _mm_add_epi16( m2[4], m2[5] ); m1[5] = _mm_sub_epi16( m2[4], m2[5] ); m1[6] = _mm_add_epi16( m2[6], m2[7] ); m1[7] = _mm_sub_epi16( m2[6], m2[7] ); // horizontal // transpose { m2[0] = _mm_unpacklo_epi16( m1[0], m1[1] ); m2[1] = _mm_unpacklo_epi16( m1[2], m1[3] ); m2[2] = _mm_unpacklo_epi16( m1[4], m1[5] ); m2[3] = _mm_unpacklo_epi16( m1[6], m1[7] ); m1[0] = _mm_unpacklo_epi32( m2[0], m2[1] ); m1[1] = _mm_unpackhi_epi32( m2[0], m2[1] ); m1[2] = _mm_unpacklo_epi32( m2[2], m2[3] ); m1[3] = _mm_unpackhi_epi32( m2[2], m2[3] ); m2[0] = _mm_unpacklo_epi64( m1[0], m1[2] ); m2[1] = _mm_unpackhi_epi64( m1[0], m1[2] ); m2[2] = _mm_unpacklo_epi64( m1[1], m1[3] ); m2[3] = _mm_unpackhi_epi64( m1[1], m1[3] ); } #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = 0; #endif if( iBitDepth >= 10 /*sizeof( Torg ) > 1 || sizeof( Tcur ) > 1*/ ) { __m128i n1[4][2]; __m128i n2[4][2]; for( int i = 0; i < 4; i++ ) { n1[i][0] = _mm_cvtepi16_epi32( m2[i] ); n1[i][1] = _mm_cvtepi16_epi32( _mm_shuffle_epi32( m2[i], 0xEE ) ); } for( int i = 0; i < 2; i++ ) { n2[0][i] = _mm_add_epi32( n1[0][i], n1[2][i] ); n2[1][i] = _mm_add_epi32( n1[1][i], n1[3][i] ); n2[2][i] = _mm_sub_epi32( n1[0][i], n1[2][i] ); n2[3][i] = _mm_sub_epi32( n1[1][i], n1[3][i] ); n1[0][i] = _mm_abs_epi32( _mm_add_epi32( n2[0][i], n2[1][i] ) ); n1[1][i] = _mm_abs_epi32( _mm_sub_epi32( n2[0][i], n2[1][i] ) ); n1[2][i] = _mm_abs_epi32( _mm_add_epi32( n2[2][i], 
n2[3][i] ) ); n1[3][i] = _mm_abs_epi32( _mm_sub_epi32( n2[2][i], n2[3][i] ) ); } for( int i = 0; i < 4; i++ ) { m1[i] = _mm_add_epi32( n1[i][0], n1[i][1] ); } #if JVET_R0164_MEAN_SCALED_SATD absDc = _mm_cvtsi128_si32( n1[0][0] ); #endif } else { m1[0] = _mm_add_epi16( m2[0], m2[2] ); m1[1] = _mm_add_epi16( m2[1], m2[3] ); m1[2] = _mm_sub_epi16( m2[0], m2[2] ); m1[3] = _mm_sub_epi16( m2[1], m2[3] ); m2[0] = _mm_abs_epi16( _mm_add_epi16( m1[0], m1[1] ) ); m2[1] = _mm_abs_epi16( _mm_sub_epi16( m1[0], m1[1] ) ); m2[2] = _mm_abs_epi16( _mm_add_epi16( m1[2], m1[3] ) ); m2[3] = _mm_abs_epi16( _mm_sub_epi16( m1[2], m1[3] ) ); __m128i ma1, ma2; __m128i vzero = _mm_setzero_si128(); for( int i = 0; i < 4; i++ ) { ma1 = _mm_unpacklo_epi16( m2[i], vzero ); ma2 = _mm_unpackhi_epi16( m2[i], vzero ); m1[i] = _mm_add_epi32( ma1, ma2 ); } #if JVET_R0164_MEAN_SCALED_SATD absDc = _mm_cvtsi128_si32( m2[0] ) & 0x0000ffff; #endif } m1[0] = _mm_add_epi32( m1[0], m1[1] ); m1[2] = _mm_add_epi32( m1[2], m1[3] ); __m128i sum = _mm_add_epi32(m1[0], m1[2]); sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); uint32_t sad = _mm_cvtsi128_si32(sum); //sad = ((sad + 2) >> 2); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = sad * INV_SQRT_2 >> 32; sad >>= 1; return sad; } static uint32_t xCalcHAD16x16_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { uint32_t sad = 0; #ifdef USE_AVX2 const int iLoops = 2; __m256i m1[2][8], m2[2][8]; for( int l = 0; l < iLoops; l++ ) { { for( int k = 0; k < 8; k++ ) { __m256i r0 = _mm256_lddqu_si256( ( __m256i* ) piOrg ); __m256i r1 = _mm256_lddqu_si256( ( __m256i* ) piCur ); m2[0][k] = _mm256_sub_epi16( r0, r1 ); m2[1][k] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m2[0][k], 1 ) ); m2[0][k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128( m2[0][k] ) ); piCur += strideCur; piOrg += strideOrg; } } constexpr int perm_unpacklo_epi128 = ( 0 << 0 ) + ( 2 << 4 ); constexpr int perm_unpackhi_epi128 = ( 1 << 0 ) + ( 3 << 4 ); for( int i = 0; i < 2; i++ ) { m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][4] ); m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][5] ); m1[i][2] = _mm256_add_epi32( m2[i][2], m2[i][6] ); m1[i][3] = _mm256_add_epi32( m2[i][3], m2[i][7] ); m1[i][4] = _mm256_sub_epi32( m2[i][0], m2[i][4] ); m1[i][5] = _mm256_sub_epi32( m2[i][1], m2[i][5] ); m1[i][6] = _mm256_sub_epi32( m2[i][2], m2[i][6] ); m1[i][7] = _mm256_sub_epi32( m2[i][3], m2[i][7] ); m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][2] ); m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][3] ); m2[i][2] = _mm256_sub_epi32( m1[i][0], m1[i][2] ); m2[i][3] = _mm256_sub_epi32( m1[i][1], m1[i][3] ); m2[i][4] = _mm256_add_epi32( m1[i][4], m1[i][6] ); m2[i][5] = _mm256_add_epi32( m1[i][5], m1[i][7] ); m2[i][6] = _mm256_sub_epi32( m1[i][4], m1[i][6] ); m2[i][7] = _mm256_sub_epi32( m1[i][5], m1[i][7] ); m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][1] ); m1[i][1] = _mm256_sub_epi32( m2[i][0], m2[i][1] ); m1[i][2] = _mm256_add_epi32( m2[i][2], m2[i][3] ); m1[i][3] = _mm256_sub_epi32( m2[i][2], m2[i][3] ); m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][5] ); m1[i][5] = _mm256_sub_epi32( m2[i][4], m2[i][5] ); m1[i][6] = _mm256_add_epi32( m2[i][6], m2[i][7] ); m1[i][7] = _mm256_sub_epi32( m2[i][6], m2[i][7] ); // transpose // 8x8 m2[i][0] = _mm256_unpacklo_epi32( m1[i][0], m1[i][1] ); m2[i][1] = _mm256_unpacklo_epi32( m1[i][2], m1[i][3] ); m2[i][2] = _mm256_unpacklo_epi32( m1[i][4], m1[i][5] ); m2[i][3] = _mm256_unpacklo_epi32( 
m1[i][6], m1[i][7] ); m2[i][4] = _mm256_unpackhi_epi32( m1[i][0], m1[i][1] ); m2[i][5] = _mm256_unpackhi_epi32( m1[i][2], m1[i][3] ); m2[i][6] = _mm256_unpackhi_epi32( m1[i][4], m1[i][5] ); m2[i][7] = _mm256_unpackhi_epi32( m1[i][6], m1[i][7] ); m1[i][0] = _mm256_unpacklo_epi64( m2[i][0], m2[i][1] ); m1[i][1] = _mm256_unpackhi_epi64( m2[i][0], m2[i][1] ); m1[i][2] = _mm256_unpacklo_epi64( m2[i][2], m2[i][3] ); m1[i][3] = _mm256_unpackhi_epi64( m2[i][2], m2[i][3] ); m1[i][4] = _mm256_unpacklo_epi64( m2[i][4], m2[i][5] ); m1[i][5] = _mm256_unpackhi_epi64( m2[i][4], m2[i][5] ); m1[i][6] = _mm256_unpacklo_epi64( m2[i][6], m2[i][7] ); m1[i][7] = _mm256_unpackhi_epi64( m2[i][6], m2[i][7] ); m2[i][0] = _mm256_permute2x128_si256( m1[i][0], m1[i][2], perm_unpacklo_epi128 ); m2[i][1] = _mm256_permute2x128_si256( m1[i][0], m1[i][2], perm_unpackhi_epi128 ); m2[i][2] = _mm256_permute2x128_si256( m1[i][1], m1[i][3], perm_unpacklo_epi128 ); m2[i][3] = _mm256_permute2x128_si256( m1[i][1], m1[i][3], perm_unpackhi_epi128 ); m2[i][4] = _mm256_permute2x128_si256( m1[i][4], m1[i][6], perm_unpacklo_epi128 ); m2[i][5] = _mm256_permute2x128_si256( m1[i][4], m1[i][6], perm_unpackhi_epi128 ); m2[i][6] = _mm256_permute2x128_si256( m1[i][5], m1[i][7], perm_unpacklo_epi128 ); m2[i][7] = _mm256_permute2x128_si256( m1[i][5], m1[i][7], perm_unpackhi_epi128 ); } m1[0][0] = _mm256_permute2x128_si256( m2[0][0], m2[1][0], perm_unpacklo_epi128 ); m1[0][1] = _mm256_permute2x128_si256( m2[0][1], m2[1][1], perm_unpacklo_epi128 ); m1[0][2] = _mm256_permute2x128_si256( m2[0][2], m2[1][2], perm_unpacklo_epi128 ); m1[0][3] = _mm256_permute2x128_si256( m2[0][3], m2[1][3], perm_unpacklo_epi128 ); m1[0][4] = _mm256_permute2x128_si256( m2[0][4], m2[1][4], perm_unpacklo_epi128 ); m1[0][5] = _mm256_permute2x128_si256( m2[0][5], m2[1][5], perm_unpacklo_epi128 ); m1[0][6] = _mm256_permute2x128_si256( m2[0][6], m2[1][6], perm_unpacklo_epi128 ); m1[0][7] = _mm256_permute2x128_si256( m2[0][7], m2[1][7], perm_unpacklo_epi128 ); m1[1][0] = _mm256_permute2x128_si256( m2[0][0], m2[1][0], perm_unpackhi_epi128 ); m1[1][1] = _mm256_permute2x128_si256( m2[0][1], m2[1][1], perm_unpackhi_epi128 ); m1[1][2] = _mm256_permute2x128_si256( m2[0][2], m2[1][2], perm_unpackhi_epi128 ); m1[1][3] = _mm256_permute2x128_si256( m2[0][3], m2[1][3], perm_unpackhi_epi128 ); m1[1][4] = _mm256_permute2x128_si256( m2[0][4], m2[1][4], perm_unpackhi_epi128 ); m1[1][5] = _mm256_permute2x128_si256( m2[0][5], m2[1][5], perm_unpackhi_epi128 ); m1[1][6] = _mm256_permute2x128_si256( m2[0][6], m2[1][6], perm_unpackhi_epi128 ); m1[1][7] = _mm256_permute2x128_si256( m2[0][7], m2[1][7], perm_unpackhi_epi128 ); for( int i = 0; i < 2; i++ ) { m2[i][0] = _mm256_add_epi32( m1[i][0], m1[i][4] ); m2[i][1] = _mm256_add_epi32( m1[i][1], m1[i][5] ); m2[i][2] = _mm256_add_epi32( m1[i][2], m1[i][6] ); m2[i][3] = _mm256_add_epi32( m1[i][3], m1[i][7] ); m2[i][4] = _mm256_sub_epi32( m1[i][0], m1[i][4] ); m2[i][5] = _mm256_sub_epi32( m1[i][1], m1[i][5] ); m2[i][6] = _mm256_sub_epi32( m1[i][2], m1[i][6] ); m2[i][7] = _mm256_sub_epi32( m1[i][3], m1[i][7] ); m1[i][0] = _mm256_add_epi32( m2[i][0], m2[i][2] ); m1[i][1] = _mm256_add_epi32( m2[i][1], m2[i][3] ); m1[i][2] = _mm256_sub_epi32( m2[i][0], m2[i][2] ); m1[i][3] = _mm256_sub_epi32( m2[i][1], m2[i][3] ); m1[i][4] = _mm256_add_epi32( m2[i][4], m2[i][6] ); m1[i][5] = _mm256_add_epi32( m2[i][5], m2[i][7] ); m1[i][6] = _mm256_sub_epi32( m2[i][4], m2[i][6] ); m1[i][7] = _mm256_sub_epi32( m2[i][5], m2[i][7] ); m2[i][0] = _mm256_abs_epi32( 
_mm256_add_epi32( m1[i][0], m1[i][1] ) ); m2[i][1] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][0], m1[i][1] ) ); m2[i][2] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][2], m1[i][3] ) ); m2[i][3] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][2], m1[i][3] ) ); m2[i][4] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][4], m1[i][5] ) ); m2[i][5] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][4], m1[i][5] ) ); m2[i][6] = _mm256_abs_epi32( _mm256_add_epi32( m1[i][6], m1[i][7] ) ); m2[i][7] = _mm256_abs_epi32( _mm256_sub_epi32( m1[i][6], m1[i][7] ) ); } #if JVET_R0164_MEAN_SCALED_SATD uint32_t absDc0 = _mm_cvtsi128_si32( _mm256_castsi256_si128( m2[0][0] ) ); uint32_t absDc1 = _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( m2[0][0], m2[0][0], 0x11 ) ) ); #endif for( int i = 0; i < 8; i++ ) { m1[0][i] = _mm256_add_epi32( m2[0][i], m2[1][i] ); } m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][1] ); m1[0][2] = _mm256_add_epi32( m1[0][2], m1[0][3] ); m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][5] ); m1[0][6] = _mm256_add_epi32( m1[0][6], m1[0][7] ); m1[0][0] = _mm256_add_epi32( m1[0][0], m1[0][2] ); m1[0][4] = _mm256_add_epi32( m1[0][4], m1[0][6] ); __m256i sum = _mm256_add_epi32(m1[0][0], m1[0][4]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); uint32_t tmp; tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); #if JVET_R0164_MEAN_SCALED_SATD tmp -= absDc0; tmp += absDc0 >> 2; #endif tmp = ( ( tmp + 2 ) >> 2 ); sad += tmp; tmp = _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(sum, sum, 0x11))); #if JVET_R0164_MEAN_SCALED_SATD tmp -= absDc1; tmp += absDc1 >> 2; #endif tmp = ( ( tmp + 2 ) >> 2 ); sad += tmp; } #endif return ( sad ); } static uint32_t xCalcHAD16x8_AVX2(const Torg *piOrg, const Tcur *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { uint32_t sad = 0; #ifdef USE_AVX2 __m256i m1[16], m2[16]; { { for( int k = 0; k < 8; k++ ) { __m256i r0 = _mm256_lddqu_si256( (__m256i*)piOrg ); __m256i r1 = _mm256_lddqu_si256( (__m256i*)piCur ); m1[k] = _mm256_sub_epi16( r0, r1 ); m1[k+8] = _mm256_cvtepi16_epi32( _mm256_extracti128_si256( m1[k], 1 ) ); m1[k] = _mm256_cvtepi16_epi32( _mm256_castsi256_si128 ( m1[k] ) ); piCur += strideCur; piOrg += strideOrg; } } // vertical, first 8x8 m2[0] = _mm256_add_epi32( m1[0], m1[4] ); m2[1] = _mm256_add_epi32( m1[1], m1[5] ); m2[2] = _mm256_add_epi32( m1[2], m1[6] ); m2[3] = _mm256_add_epi32( m1[3], m1[7] ); m2[4] = _mm256_sub_epi32( m1[0], m1[4] ); m2[5] = _mm256_sub_epi32( m1[1], m1[5] ); m2[6] = _mm256_sub_epi32( m1[2], m1[6] ); m2[7] = _mm256_sub_epi32( m1[3], m1[7] ); m1[0] = _mm256_add_epi32( m2[0], m2[2] ); m1[1] = _mm256_add_epi32( m2[1], m2[3] ); m1[2] = _mm256_sub_epi32( m2[0], m2[2] ); m1[3] = _mm256_sub_epi32( m2[1], m2[3] ); m1[4] = _mm256_add_epi32( m2[4], m2[6] ); m1[5] = _mm256_add_epi32( m2[5], m2[7] ); m1[6] = _mm256_sub_epi32( m2[4], m2[6] ); m1[7] = _mm256_sub_epi32( m2[5], m2[7] ); m2[0] = _mm256_add_epi32( m1[0], m1[1] ); m2[1] = _mm256_sub_epi32( m1[0], m1[1] ); m2[2] = _mm256_add_epi32( m1[2], m1[3] ); m2[3] = _mm256_sub_epi32( m1[2], m1[3] ); m2[4] = _mm256_add_epi32( m1[4], m1[5] ); m2[5] = _mm256_sub_epi32( m1[4], m1[5] ); m2[6] = _mm256_add_epi32( m1[6], m1[7] ); m2[7] = _mm256_sub_epi32( m1[6], m1[7] ); // vertical, second 8x8 m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+4] ); m2[8+1] = _mm256_add_epi32( m1[8+1], m1[8+5] ); m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+6] ); m2[8+3] = _mm256_add_epi32( m1[8+3], m1[8+7] ); m2[8+4] = _mm256_sub_epi32( m1[8+0], 
m1[8+4] ); m2[8+5] = _mm256_sub_epi32( m1[8+1], m1[8+5] ); m2[8+6] = _mm256_sub_epi32( m1[8+2], m1[8+6] ); m2[8+7] = _mm256_sub_epi32( m1[8+3], m1[8+7] ); m1[8+0] = _mm256_add_epi32( m2[8+0], m2[8+2] ); m1[8+1] = _mm256_add_epi32( m2[8+1], m2[8+3] ); m1[8+2] = _mm256_sub_epi32( m2[8+0], m2[8+2] ); m1[8+3] = _mm256_sub_epi32( m2[8+1], m2[8+3] ); m1[8+4] = _mm256_add_epi32( m2[8+4], m2[8+6] ); m1[8+5] = _mm256_add_epi32( m2[8+5], m2[8+7] ); m1[8+6] = _mm256_sub_epi32( m2[8+4], m2[8+6] ); m1[8+7] = _mm256_sub_epi32( m2[8+5], m2[8+7] ); m2[8+0] = _mm256_add_epi32( m1[8+0], m1[8+1] ); m2[8+1] = _mm256_sub_epi32( m1[8+0], m1[8+1] ); m2[8+2] = _mm256_add_epi32( m1[8+2], m1[8+3] ); m2[8+3] = _mm256_sub_epi32( m1[8+2], m1[8+3] ); m2[8+4] = _mm256_add_epi32( m1[8+4], m1[8+5] ); m2[8+5] = _mm256_sub_epi32( m1[8+4], m1[8+5] ); m2[8+6] = _mm256_add_epi32( m1[8+6], m1[8+7] ); m2[8+7] = _mm256_sub_epi32( m1[8+6], m1[8+7] ); // transpose constexpr int perm_unpacklo_epi128 = ( 0 << 0 ) + ( 2 << 4 ); constexpr int perm_unpackhi_epi128 = ( 1 << 0 ) + ( 3 << 4 ); m1[0] = _mm256_unpacklo_epi32( m2[0], m2[1] ); m1[1] = _mm256_unpacklo_epi32( m2[2], m2[3] ); m1[2] = _mm256_unpacklo_epi32( m2[4], m2[5] ); m1[3] = _mm256_unpacklo_epi32( m2[6], m2[7] ); m1[4] = _mm256_unpackhi_epi32( m2[0], m2[1] ); m1[5] = _mm256_unpackhi_epi32( m2[2], m2[3] ); m1[6] = _mm256_unpackhi_epi32( m2[4], m2[5] ); m1[7] = _mm256_unpackhi_epi32( m2[6], m2[7] ); m2[0] = _mm256_unpacklo_epi64( m1[0], m1[1] ); m2[1] = _mm256_unpackhi_epi64( m1[0], m1[1] ); m2[2] = _mm256_unpacklo_epi64( m1[2], m1[3] ); m2[3] = _mm256_unpackhi_epi64( m1[2], m1[3] ); m2[4] = _mm256_unpacklo_epi64( m1[4], m1[5] ); m2[5] = _mm256_unpackhi_epi64( m1[4], m1[5] ); m2[6] = _mm256_unpacklo_epi64( m1[6], m1[7] ); m2[7] = _mm256_unpackhi_epi64( m1[6], m1[7] ); m1[0] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpacklo_epi128 ); m1[1] = _mm256_permute2x128_si256( m2[0], m2[2], perm_unpackhi_epi128 ); m1[2] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpacklo_epi128 ); m1[3] = _mm256_permute2x128_si256( m2[1], m2[3], perm_unpackhi_epi128 ); m1[4] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpacklo_epi128 ); m1[5] = _mm256_permute2x128_si256( m2[4], m2[6], perm_unpackhi_epi128 ); m1[6] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpacklo_epi128 ); m1[7] = _mm256_permute2x128_si256( m2[5], m2[7], perm_unpackhi_epi128 ); m1[8+0] = _mm256_unpacklo_epi32( m2[8+0], m2[8+1] ); m1[8+1] = _mm256_unpacklo_epi32( m2[8+2], m2[8+3] ); m1[8+2] = _mm256_unpacklo_epi32( m2[8+4], m2[8+5] ); m1[8+3] = _mm256_unpacklo_epi32( m2[8+6], m2[8+7] ); m1[8+4] = _mm256_unpackhi_epi32( m2[8+0], m2[8+1] ); m1[8+5] = _mm256_unpackhi_epi32( m2[8+2], m2[8+3] ); m1[8+6] = _mm256_unpackhi_epi32( m2[8+4], m2[8+5] ); m1[8+7] = _mm256_unpackhi_epi32( m2[8+6], m2[8+7] ); m2[8+0] = _mm256_unpacklo_epi64( m1[8+0], m1[8+1] ); m2[8+1] = _mm256_unpackhi_epi64( m1[8+0], m1[8+1] ); m2[8+2] = _mm256_unpacklo_epi64( m1[8+2], m1[8+3] ); m2[8+3] = _mm256_unpackhi_epi64( m1[8+2], m1[8+3] ); m2[8+4] = _mm256_unpacklo_epi64( m1[8+4], m1[8+5] ); m2[8+5] = _mm256_unpackhi_epi64( m1[8+4], m1[8+5] ); m2[8+6] = _mm256_unpacklo_epi64( m1[8+6], m1[8+7] ); m2[8+7] = _mm256_unpackhi_epi64( m1[8+6], m1[8+7] ); m1[8+0] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpacklo_epi128 ); m1[8+1] = _mm256_permute2x128_si256( m2[8+0], m2[8+2], perm_unpackhi_epi128 ); m1[8+2] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], perm_unpacklo_epi128 ); m1[8+3] = _mm256_permute2x128_si256( m2[8+1], m2[8+3], 
perm_unpackhi_epi128 ); m1[8+4] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpacklo_epi128 ); m1[8+5] = _mm256_permute2x128_si256( m2[8+4], m2[8+6], perm_unpackhi_epi128 ); m1[8+6] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpacklo_epi128 ); m1[8+7] = _mm256_permute2x128_si256( m2[8+5], m2[8+7], perm_unpackhi_epi128 ); // horizontal { m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] ); m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] ); m2[ 2] = _mm256_add_epi32( m1[2], m1[10] ); m2[ 3] = _mm256_add_epi32( m1[3], m1[11] ); m2[ 4] = _mm256_add_epi32( m1[4], m1[12] ); m2[ 5] = _mm256_add_epi32( m1[5], m1[13] ); m2[ 6] = _mm256_add_epi32( m1[6], m1[14] ); m2[ 7] = _mm256_add_epi32( m1[7], m1[15] ); m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] ); m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] ); m2[10] = _mm256_sub_epi32( m1[2], m1[10] ); m2[11] = _mm256_sub_epi32( m1[3], m1[11] ); m2[12] = _mm256_sub_epi32( m1[4], m1[12] ); m2[13] = _mm256_sub_epi32( m1[5], m1[13] ); m2[14] = _mm256_sub_epi32( m1[6], m1[14] ); m2[15] = _mm256_sub_epi32( m1[7], m1[15] ); m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] ); m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] ); m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] ); m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] ); m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] ); m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] ); m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] ); m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] ); m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] ); m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] ); m1[10] = _mm256_add_epi32( m2[10], m2[14] ); m1[11] = _mm256_add_epi32( m2[11], m2[15] ); m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] ); m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] ); m1[14] = _mm256_sub_epi32( m2[10], m2[14] ); m1[15] = _mm256_sub_epi32( m2[11], m2[15] ); m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] ); m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] ); m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] ); m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] ); m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] ); m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] ); m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] ); m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] ); m2[11] = _mm256_sub_epi32( m1[ 9], m1[11] ); m2[12] = _mm256_add_epi32( m1[12], m1[14] ); m2[13] = _mm256_add_epi32( m1[13], m1[15] ); m2[14] = _mm256_sub_epi32( m1[12], m1[14] ); m2[15] = _mm256_sub_epi32( m1[13], m1[15] ); m1[ 0] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 0], m2[ 1] ) ); m1[ 1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 0], m2[ 1] ) ); m1[ 2] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 2], m2[ 3] ) ); m1[ 3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 2], m2[ 3] ) ); m1[ 4] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 4], m2[ 5] ) ); m1[ 5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 4], m2[ 5] ) ); m1[ 6] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 6], m2[ 7] ) ); m1[ 7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 6], m2[ 7] ) ); m1[ 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[ 8], m2[ 9] ) ); m1[ 9] = _mm256_abs_epi32( _mm256_sub_epi32( m2[ 8], m2[ 9] ) ); m1[10] = _mm256_abs_epi32( _mm256_add_epi32( m2[10], m2[11] ) ); m1[11] = _mm256_abs_epi32( _mm256_sub_epi32( m2[10], m2[11] ) ); m1[12] = _mm256_abs_epi32( _mm256_add_epi32( m2[12], m2[13] ) ); m1[13] = _mm256_abs_epi32( _mm256_sub_epi32( m2[12], m2[13] ) ); m1[14] = _mm256_abs_epi32( _mm256_add_epi32( m2[14], m2[15] ) ); m1[15] = _mm256_abs_epi32( _mm256_sub_epi32( m2[14], m2[15] ) ); } #if 
JVET_R0164_MEAN_SCALED_SATD uint32_t absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) ); #endif // sum up m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); m1[ 2] = _mm256_add_epi32( m1[ 2], m1[ 3] ); m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 5] ); m1[ 6] = _mm256_add_epi32( m1[ 6], m1[ 7] ); m1[ 8] = _mm256_add_epi32( m1[ 8], m1[ 9] ); m1[10] = _mm256_add_epi32( m1[10], m1[11] ); m1[12] = _mm256_add_epi32( m1[12], m1[13] ); m1[14] = _mm256_add_epi32( m1[14], m1[15] ); m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); m1[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); m1[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); m1[12] = _mm256_add_epi32( m1[12], m1[14] ); m1[0] = _mm256_add_epi32( m1[0], m1[ 4] ); m1[8] = _mm256_add_epi32( m1[8], m1[12] ); __m256i sum = _mm256_add_epi32(m1[0], m1[8]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_add_epi32(sum, _mm256_permute2x128_si256(sum, sum, 0x11)); sad = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); #if JVET_R0164_MEAN_SCALED_SATD sad -= absDc; sad += absDc >> 2; #endif sad = sad * INV_SQRT_2 >> 32; sad >>= 2; } #endif //USE_AVX2 return (sad); } static uint32_t xCalcHAD8x16_AVX2(const Pel *piOrg, const Pel *piCur, const ptrdiff_t strideOrg, const ptrdiff_t strideCur, const int iBitDepth) { uint32_t sad = 0; #ifdef USE_AVX2 __m256i m1[16], m2[16]; { { for( int k = 0; k < 16; k++ ) { __m256i r0 = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*)piOrg ) ); __m256i r1 = _mm256_cvtepi16_epi32( _mm_lddqu_si128( (__m128i*)piCur ) ); m1[k] = _mm256_sub_epi32( r0, r1 ); piCur += strideCur; piOrg += strideOrg; } } // vertical m2[ 0] = _mm256_add_epi32( m1[0], m1[ 8] ); m2[ 1] = _mm256_add_epi32( m1[1], m1[ 9] ); m2[ 2] = _mm256_add_epi32( m1[2], m1[10] ); m2[ 3] = _mm256_add_epi32( m1[3], m1[11] ); m2[ 4] = _mm256_add_epi32( m1[4], m1[12] ); m2[ 5] = _mm256_add_epi32( m1[5], m1[13] ); m2[ 6] = _mm256_add_epi32( m1[6], m1[14] ); m2[ 7] = _mm256_add_epi32( m1[7], m1[15] ); m2[ 8] = _mm256_sub_epi32( m1[0], m1[ 8] ); m2[ 9] = _mm256_sub_epi32( m1[1], m1[ 9] ); m2[10] = _mm256_sub_epi32( m1[2], m1[10] ); m2[11] = _mm256_sub_epi32( m1[3], m1[11] ); m2[12] = _mm256_sub_epi32( m1[4], m1[12] ); m2[13] = _mm256_sub_epi32( m1[5], m1[13] ); m2[14] = _mm256_sub_epi32( m1[6], m1[14] ); m2[15] = _mm256_sub_epi32( m1[7], m1[15] ); m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 4] ); m1[ 1] = _mm256_add_epi32( m2[ 1], m2[ 5] ); m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 6] ); m1[ 3] = _mm256_add_epi32( m2[ 3], m2[ 7] ); m1[ 4] = _mm256_sub_epi32( m2[ 0], m2[ 4] ); m1[ 5] = _mm256_sub_epi32( m2[ 1], m2[ 5] ); m1[ 6] = _mm256_sub_epi32( m2[ 2], m2[ 6] ); m1[ 7] = _mm256_sub_epi32( m2[ 3], m2[ 7] ); m1[ 8] = _mm256_add_epi32( m2[ 8], m2[12] ); m1[ 9] = _mm256_add_epi32( m2[ 9], m2[13] ); m1[10] = _mm256_add_epi32( m2[10], m2[14] ); m1[11] = _mm256_add_epi32( m2[11], m2[15] ); m1[12] = _mm256_sub_epi32( m2[ 8], m2[12] ); m1[13] = _mm256_sub_epi32( m2[ 9], m2[13] ); m1[14] = _mm256_sub_epi32( m2[10], m2[14] ); m1[15] = _mm256_sub_epi32( m2[11], m2[15] ); m2[ 0] = _mm256_add_epi32( m1[ 0], m1[ 2] ); m2[ 1] = _mm256_add_epi32( m1[ 1], m1[ 3] ); m2[ 2] = _mm256_sub_epi32( m1[ 0], m1[ 2] ); m2[ 3] = _mm256_sub_epi32( m1[ 1], m1[ 3] ); m2[ 4] = _mm256_add_epi32( m1[ 4], m1[ 6] ); m2[ 5] = _mm256_add_epi32( m1[ 5], m1[ 7] ); m2[ 6] = _mm256_sub_epi32( m1[ 4], m1[ 6] ); m2[ 7] = _mm256_sub_epi32( m1[ 5], m1[ 7] ); m2[ 8] = _mm256_add_epi32( m1[ 8], m1[10] ); m2[ 9] = _mm256_add_epi32( m1[ 9], m1[11] ); m2[10] = _mm256_sub_epi32( m1[ 8], m1[10] ); m2[11] = 
_mm256_sub_epi32( m1[ 9], m1[11] ); m2[12] = _mm256_add_epi32( m1[12], m1[14] ); m2[13] = _mm256_add_epi32( m1[13], m1[15] ); m2[14] = _mm256_sub_epi32( m1[12], m1[14] ); m2[15] = _mm256_sub_epi32( m1[13], m1[15] ); m1[ 0] = _mm256_add_epi32( m2[ 0], m2[ 1] ); m1[ 1] = _mm256_sub_epi32( m2[ 0], m2[ 1] ); m1[ 2] = _mm256_add_epi32( m2[ 2], m2[ 3] ); m1[ 3] = _mm256_sub_epi32( m2[ 2], m2[ 3] ); m1[ 4] = _mm256_add_epi32( m2[ 4], m2[ 5] ); m1[ 5] = _mm256_sub_epi32( m2[ 4], m2[ 5] ); m1[ 6] = _mm256_add_epi32( m2[ 6], m2[ 7] ); m1[ 7] = _mm256_sub_epi32( m2[ 6], m2[ 7] ); m1[ 8] = _mm256_add_epi32( m2[ 8], m2[ 9] ); m1[ 9] = _mm256_sub_epi32( m2[ 8], m2[ 9] ); m1[10] = _mm256_add_epi32( m2[10], m2[11] ); m1[11] = _mm256_sub_epi32( m2[10], m2[11] ); m1[12] = _mm256_add_epi32( m2[12], m2[13] ); m1[13] = _mm256_sub_epi32( m2[12], m2[13] ); m1[14] = _mm256_add_epi32( m2[14], m2[15] ); m1[15] = _mm256_sub_epi32( m2[14], m2[15] ); // transpose constexpr int perm_unpacklo_epi128 = ( 0 << 0 ) + ( 2 << 4 ); constexpr int perm_unpackhi_epi128 = ( 1 << 0 ) + ( 3 << 4 ); // 1. 8x8 m2[0] = _mm256_unpacklo_epi32( m1[0], m1[1] ); m2[1] = _mm256_unpacklo_epi32( m1[2], m1[3] ); m2[2] = _mm256_unpacklo_epi32( m1[4], m1[5] ); m2[3] = _mm256_unpacklo_epi32( m1[6], m1[7] ); m2[4] = _mm256_unpackhi_epi32( m1[0], m1[1] ); m2[5] = _mm256_unpackhi_epi32( m1[2], m1[3] ); m2[6] = _mm256_unpackhi_epi32( m1[4], m1[5] ); m2[7] = _mm256_unpackhi_epi32( m1[6], m1[7] ); m1[0] = _mm256_unpacklo_epi64( m2[0], m2[1] ); m1[1] = _mm256_unpackhi_epi64( m2[0], m2[1] ); m1[2] = _mm256_unpacklo_epi64( m2[2], m2[3] ); m1[3] = _mm256_unpackhi_epi64( m2[2], m2[3] ); m1[4] = _mm256_unpacklo_epi64( m2[4], m2[5] ); m1[5] = _mm256_unpackhi_epi64( m2[4], m2[5] ); m1[6] = _mm256_unpacklo_epi64( m2[6], m2[7] ); m1[7] = _mm256_unpackhi_epi64( m2[6], m2[7] ); m2[0] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpacklo_epi128 ); m2[1] = _mm256_permute2x128_si256( m1[0], m1[2], perm_unpackhi_epi128 ); m2[2] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpacklo_epi128 ); m2[3] = _mm256_permute2x128_si256( m1[1], m1[3], perm_unpackhi_epi128 ); m2[4] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpacklo_epi128 ); m2[5] = _mm256_permute2x128_si256( m1[4], m1[6], perm_unpackhi_epi128 ); m2[6] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpacklo_epi128 ); m2[7] = _mm256_permute2x128_si256( m1[5], m1[7], perm_unpackhi_epi128 ); // 2. 
8x8 m2[0+8] = _mm256_unpacklo_epi32( m1[0+8], m1[1+8] ); m2[1+8] = _mm256_unpacklo_epi32( m1[2+8], m1[3+8] ); m2[2+8] = _mm256_unpacklo_epi32( m1[4+8], m1[5+8] ); m2[3+8] = _mm256_unpacklo_epi32( m1[6+8], m1[7+8] ); m2[4+8] = _mm256_unpackhi_epi32( m1[0+8], m1[1+8] ); m2[5+8] = _mm256_unpackhi_epi32( m1[2+8], m1[3+8] ); m2[6+8] = _mm256_unpackhi_epi32( m1[4+8], m1[5+8] ); m2[7+8] = _mm256_unpackhi_epi32( m1[6+8], m1[7+8] ); m1[0+8] = _mm256_unpacklo_epi64( m2[0+8], m2[1+8] ); m1[1+8] = _mm256_unpackhi_epi64( m2[0+8], m2[1+8] ); m1[2+8] = _mm256_unpacklo_epi64( m2[2+8], m2[3+8] ); m1[3+8] = _mm256_unpackhi_epi64( m2[2+8], m2[3+8] ); m1[4+8] = _mm256_unpacklo_epi64( m2[4+8], m2[5+8] ); m1[5+8] = _mm256_unpackhi_epi64( m2[4+8], m2[5+8] ); m1[6+8] = _mm256_unpacklo_epi64( m2[6+8], m2[7+8] ); m1[7+8] = _mm256_unpackhi_epi64( m2[6+8], m2[7+8] ); m2[0+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpacklo_epi128 ); m2[1+8] = _mm256_permute2x128_si256( m1[0+8], m1[2+8], perm_unpackhi_epi128 ); m2[2+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpacklo_epi128 ); m2[3+8] = _mm256_permute2x128_si256( m1[1+8], m1[3+8], perm_unpackhi_epi128 ); m2[4+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpacklo_epi128 ); m2[5+8] = _mm256_permute2x128_si256( m1[4+8], m1[6+8], perm_unpackhi_epi128 ); m2[6+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpacklo_epi128 ); m2[7+8] = _mm256_permute2x128_si256( m1[5+8], m1[7+8], perm_unpackhi_epi128 ); // horizontal m1[0] = _mm256_add_epi32( m2[0], m2[4] ); m1[1] = _mm256_add_epi32( m2[1], m2[5] ); m1[2] = _mm256_add_epi32( m2[2], m2[6] ); m1[3] = _mm256_add_epi32( m2[3], m2[7] ); m1[4] = _mm256_sub_epi32( m2[0], m2[4] ); m1[5] = _mm256_sub_epi32( m2[1], m2[5] ); m1[6] = _mm256_sub_epi32( m2[2], m2[6] ); m1[7] = _mm256_sub_epi32( m2[3], m2[7] ); m2[0] = _mm256_add_epi32( m1[0], m1[2] ); m2[1] = _mm256_add_epi32( m1[1], m1[3] ); m2[2] = _mm256_sub_epi32( m1[0], m1[2] ); m2[3] = _mm256_sub_epi32( m1[1], m1[3] ); m2[4] = _mm256_add_epi32( m1[4], m1[6] ); m2[5] = _mm256_add_epi32( m1[5], m1[7] ); m2[6] = _mm256_sub_epi32( m1[4], m1[6] ); m2[7] = _mm256_sub_epi32( m1[5], m1[7] ); m1[0] = _mm256_abs_epi32( _mm256_add_epi32( m2[0], m2[1] ) ); m1[1] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0], m2[1] ) ); m1[2] = _mm256_abs_epi32( _mm256_add_epi32( m2[2], m2[3] ) ); m1[3] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2], m2[3] ) ); m1[4] = _mm256_abs_epi32( _mm256_add_epi32( m2[4], m2[5] ) ); m1[5] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4], m2[5] ) ); m1[6] = _mm256_abs_epi32( _mm256_add_epi32( m2[6], m2[7] ) ); m1[7] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6], m2[7] ) ); #if JVET_R0164_MEAN_SCALED_SATD int absDc = _mm_cvtsi128_si32( _mm256_castsi256_si128( m1[0] ) ); #endif m1[0 + 8] = _mm256_add_epi32( m2[0 + 8], m2[4 + 8] ); m1[1 + 8] = _mm256_add_epi32( m2[1 + 8], m2[5 + 8] ); m1[2 + 8] = _mm256_add_epi32( m2[2 + 8], m2[6 + 8] ); m1[3 + 8] = _mm256_add_epi32( m2[3 + 8], m2[7 + 8] ); m1[4 + 8] = _mm256_sub_epi32( m2[0 + 8], m2[4 + 8] ); m1[5 + 8] = _mm256_sub_epi32( m2[1 + 8], m2[5 + 8] ); m1[6 + 8] = _mm256_sub_epi32( m2[2 + 8], m2[6 + 8] ); m1[7 + 8] = _mm256_sub_epi32( m2[3 + 8], m2[7 + 8] ); m2[0 + 8] = _mm256_add_epi32( m1[0 + 8], m1[2 + 8] ); m2[1 + 8] = _mm256_add_epi32( m1[1 + 8], m1[3 + 8] ); m2[2 + 8] = _mm256_sub_epi32( m1[0 + 8], m1[2 + 8] ); m2[3 + 8] = _mm256_sub_epi32( m1[1 + 8], m1[3 + 8] ); m2[4 + 8] = _mm256_add_epi32( m1[4 + 8], m1[6 + 8] ); m2[5 + 8] = _mm256_add_epi32( m1[5 + 8], m1[7 + 8] ); m2[6 + 8] = 
_mm256_sub_epi32( m1[4 + 8], m1[6 + 8] ); m2[7 + 8] = _mm256_sub_epi32( m1[5 + 8], m1[7 + 8] ); m1[0 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[0 + 8], m2[1 + 8] ) ); m1[1 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[0 + 8], m2[1 + 8] ) ); m1[2 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[2 + 8], m2[3 + 8] ) ); m1[3 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[2 + 8], m2[3 + 8] ) ); m1[4 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[4 + 8], m2[5 + 8] ) ); m1[5 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[4 + 8], m2[5 + 8] ) ); m1[6 + 8] = _mm256_abs_epi32( _mm256_add_epi32( m2[6 + 8], m2[7 + 8] ) ); m1[7 + 8] = _mm256_abs_epi32( _mm256_sub_epi32( m2[6 + 8], m2[7 + 8] ) ); // sum up m1[0] = _mm256_add_epi32( m1[0], m1[1] ); m1[1] = _mm256_add_epi32( m1[2], m1[3] ); m1[2] = _mm256_add_epi32( m1[4], m1[5] ); m1[3] = _mm256_add_epi32( m1[6], m1[7] ); m1[4] = _mm256_add_epi32( m1[8], m1[9] ); m1[5] = _mm256_add_epi32( m1[10], m1[11] ); m1[6] = _mm256_add_epi32( m1[12], m1[13] ); m1[7] = _mm256_add_epi32( m1[14], m1[15] ); // sum up m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] ); m1[ 2] = _mm256_add_epi32( m1[ 4], m1[ 5] ); m1[ 3] = _mm256_add_epi32( m1[ 6], m1[ 7] ); m1[ 0] = _mm256_add_epi32( m1[ 0], m1[ 1] ); m1[ 1] = _mm256_add_epi32( m1[ 2], m1[ 3] ); __m256i sum = _mm256_add_epi32(m1[0], m1[1]); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_hadd_epi32(sum, sum); sum = _mm256_add_epi32(sum, _mm256_permute2x128_si256(sum, sum, 0x11)); uint32_t sad2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum)); #if JVET_R0164_MEAN_SCALED_SATD sad2 -= absDc; sad2 += absDc >> 2; #endif sad = sad2 * INV_SQRT_2 >> 32; sad >>= 2; } #endif //USE_AVX2 return (sad); } #endif template< X86_VEXT vext > Distortion RdCost::xGetSADwMask_SIMD( const DistParam &rcDtParam ) { if (rcDtParam.org.width < 4 || rcDtParam.bitDepth > 10 || rcDtParam.applyWeight) return RdCost::xGetSADwMask( rcDtParam ); const short* src1 = (const short*)rcDtParam.org.buf; const short* src2 = (const short*)rcDtParam.cur.buf; const short* weightMask = (const short*)rcDtParam.mask; int rows = rcDtParam.org.height; int cols = rcDtParam.org.width; int subShift = rcDtParam.subShift; int subStep = ( 1 << subShift); const ptrdiff_t strideSrc1 = rcDtParam.org.stride * subStep; const ptrdiff_t strideSrc2 = rcDtParam.cur.stride * subStep; const ptrdiff_t strideMask = rcDtParam.maskStride * subStep; Distortion sum = 0; if( vext >= AVX2 && (cols & 15 ) == 0 ) { #ifdef USE_AVX2 // Do for width that multiple of 16 __m256i vzero = _mm256_setzero_si256(); __m256i vsum32 = vzero; for( int y = 0; y < rows; y+= subStep) { for( int x = 0; x < cols; x+=16 ) { __m256i vsrc1 = _mm256_lddqu_si256( ( __m256i* )( &src1[x] ) ); __m256i vsrc2 = _mm256_lddqu_si256( ( __m256i* )( &src2[x] ) ); __m256i vmask; if (rcDtParam.stepX == -1) { vmask = _mm256_lddqu_si256((__m256i*)((&weightMask[x]) - (x << 1) - (16 - 1))); const __m256i shuffle_mask = _mm256_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); vmask = _mm256_shuffle_epi8(vmask, shuffle_mask); vmask = _mm256_permute4x64_epi64(vmask, _MM_SHUFFLE(1, 0, 3, 2)); } else { vmask = _mm256_lddqu_si256((__m256i*)(&weightMask[x])); } vsum32 = _mm256_add_epi32( vsum32, _mm256_madd_epi16( vmask, _mm256_abs_epi16( _mm256_sub_epi16( vsrc1, vsrc2 ) ) ) ); } src1 += strideSrc1; src2 += strideSrc2; weightMask += strideMask; } vsum32 = _mm256_hadd_epi32( vsum32, vzero ); vsum32 = _mm256_hadd_epi32( vsum32, vzero ); sum = 
_mm_cvtsi128_si32( _mm256_castsi256_si128( vsum32 ) ) + _mm_cvtsi128_si32( _mm256_castsi256_si128( _mm256_permute2x128_si256( vsum32, vsum32, 0x11 ) ) );
#endif
  }
  else
  {
    // Do with step of 8
    __m128i vzero  = _mm_setzero_si128();
    __m128i vsum32 = vzero;
    for( int y = 0; y < rows; y+= subStep)
    {
      for( int x = 0; x < cols; x+=8 )
      {
        __m128i vsrc1 = _mm_loadu_si128( ( const __m128i* )( &src1[x] ) );
        __m128i vsrc2 = _mm_lddqu_si128( ( const __m128i* )( &src2[x] ) );
        __m128i vmask;
        if (rcDtParam.stepX == -1)
        {
          vmask = _mm_lddqu_si128((__m128i*)((&weightMask[x]) - (x << 1) - (8 - 1)));
          const __m128i shuffle_mask = _mm_set_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
          vmask = _mm_shuffle_epi8(vmask, shuffle_mask);
        }
        else
        {
          vmask = _mm_lddqu_si128((const __m128i*)(&weightMask[x]));
        }
        vsum32 = _mm_add_epi32( vsum32, _mm_madd_epi16( vmask, _mm_abs_epi16( _mm_sub_epi16( vsrc1, vsrc2 ) ) ) );
      }
      src1 += strideSrc1;
      src2 += strideSrc2;
      weightMask += strideMask;
    }
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    vsum32 = _mm_hadd_epi32( vsum32, vzero );
    sum = _mm_cvtsi128_si32( vsum32 );
  }
  sum <<= subShift;
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}

#if RExt__HIGH_BIT_DEPTH_SUPPORT
template< X86_VEXT vext >
Distortion RdCost::xGetHADs_HBD_SIMD(const DistParam &rcDtParam)
{
  if (rcDtParam.applyWeight)
  {
    return RdCostWeightPrediction::xGetHADsw(rcDtParam);
  }
  const Pel* piOrg = rcDtParam.org.buf;
  const Pel* piCur = rcDtParam.cur.buf;
  const int rows = rcDtParam.org.height;
  const int cols = rcDtParam.org.width;
  const ptrdiff_t strideCur = rcDtParam.cur.stride;
  const ptrdiff_t strideOrg = rcDtParam.org.stride;
  const int step = rcDtParam.step;
  CHECK(step != 1, "the function only supports step equal to 1");
  int x = 0, y = 0;
  Distortion sum = 0;
  // Dispatch to the block-size specific high bit-depth Hadamard kernels.
  if (cols > rows && (rows & 7) == 0 && (cols & 15) == 0)
  {
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 16)
      {
#ifdef USE_AVX2
        if (vext >= AVX2)
        {
          sum += xCalcHAD16x8_HBD_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
        else
#endif
          sum += xCalcHAD16x8_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur);
      }
      piOrg += strideOrg * 8;
      piCur += strideCur * 8;
    }
  }
  else if (cols < rows && (cols & 7) == 0 && (rows & 15) == 0)
  {
    for (y = 0; y < rows; y += 16)
    {
      for (x = 0; x < cols; x += 8)
      {
#ifdef USE_AVX2
        if (vext >= AVX2)
        {
          sum += xCalcHAD8x16_HBD_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
        else
#endif
        {
          sum += xCalcHAD8x16_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
      }
      piOrg += strideOrg * 16;
      piCur += strideCur * 16;
    }
  }
  else if (cols > rows && (rows & 3) == 0 && (cols & 7) == 0)
  {
    for (y = 0; y < rows; y += 4)
    {
      for (x = 0; x < cols; x += 8)
      {
#ifdef USE_AVX2
        if (vext >= AVX2)
        {
          sum += xCalcHAD8x4_HBD_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
        else
#endif
        {
          sum += xCalcHAD8x4_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
      }
      piOrg += strideOrg * 4;
      piCur += strideCur * 4;
    }
  }
  else if (cols < rows && (cols & 3) == 0 && (rows & 7) == 0)
  {
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 4)
      {
#ifdef USE_AVX2
        if (vext >= AVX2)
        {
          sum += xCalcHAD4x8_HBD_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
        else
#endif
        {
          sum += xCalcHAD4x8_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur);
        }
      }
      piOrg += strideOrg * 8;
      piCur += strideCur * 8;
    }
  }
  else if ((rows % 8 == 0) && (cols % 8 == 0))
  {
    ptrdiff_t offsetOrg = strideOrg << 3;
    ptrdiff_t offsetCur = strideCur << 3;
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 8)
      {
#ifdef USE_AVX2
        if (vext >= AVX2)
        {
          sum += xCalcHAD8x8_HBD_AVX2(&piOrg[x], &piCur[x],
strideOrg, strideCur); } else #endif { sum += xCalcHAD8x8_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur); } } piOrg += offsetOrg; piCur += offsetCur; } } else if ((rows % 4 == 0) && (cols % 4 == 0)) { ptrdiff_t offsetOrg = strideOrg << 2; ptrdiff_t offsetCur = strideCur << 2; for (y = 0; y < rows; y += 4) { for (x = 0; x < cols; x += 4) { #ifdef USE_AVX2 if (vext >= AVX2) { sum += xCalcHAD4x4_HBD_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur); } else #endif { sum += xCalcHAD4x4_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur); } } piOrg += offsetOrg; piCur += offsetCur; } } else if ((rows % 2 == 0) && (cols % 2 == 0)) { ptrdiff_t offsetOrg = strideOrg << 1; ptrdiff_t offsetCur = strideCur << 1; for (y = 0; y < rows; y += 2) { for (x = 0; x < cols; x += 2) { sum += xCalcHAD2x2_HBD_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur); } piOrg += offsetOrg; piCur += offsetCur; } } else { THROW("Invalid size"); } return (sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth)); } template< X86_VEXT vext > Distortion RdCost::xGetSAD_HBD_SIMD(const DistParam &rcDtParam) { if (rcDtParam.applyWeight) { return RdCost::xGetSAD(rcDtParam); } const Pel* pSrc1 = (const Pel*)rcDtParam.org.buf; const Pel* pSrc2 = (const Pel*)rcDtParam.cur.buf; int rows = rcDtParam.org.height; int cols = rcDtParam.org.width; int subShift = rcDtParam.subShift; int subStep = (1 << subShift); const ptrdiff_t strideSrc1 = rcDtParam.org.stride * subStep; const ptrdiff_t strideSrc2 = rcDtParam.cur.stride * subStep; if ((cols < 4) && (rows < (subStep << 1))) { return RdCost::xGetSAD(rcDtParam); } uint32_t sum = 0; #ifdef USE_AVX2 if ((vext >= AVX2) && ((cols & 7) == 0)) { __m256i vzero = _mm256_setzero_si256(); __m256i vsum32 = vzero; __m256i vsrc1, vsrc2, vsum; for (int y = 0; y < rows; y += subStep) { for (int x = 0; x < cols; x += 8) { vsrc1 = _mm256_lddqu_si256((__m256i *) (&pSrc1[x])); vsrc2 = _mm256_lddqu_si256((__m256i *) (&pSrc2[x])); vsum = _mm256_abs_epi32(_mm256_sub_epi32(vsrc1, vsrc2)); vsum32 = _mm256_add_epi32(vsum32, vsum); } pSrc1 += strideSrc1; pSrc2 += strideSrc2; } vsum32 = _mm256_hadd_epi32(vsum32, vzero); vsum32 = _mm256_hadd_epi32(vsum32, vzero); sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)) + _mm_cvtsi128_si32(_mm256_castsi256_si128(_mm256_permute2x128_si256(vsum32, vsum32, 0x11))); } else #endif { if ((cols & 3) == 0) { __m128i vzero = _mm_setzero_si128(); __m128i vsum32 = vzero; __m128i vsrc1, vsrc2, vsum; for (int y = 0; y < rows; y += subStep) { for (int x = 0; x < cols; x += 4) { vsrc1 = _mm_lddqu_si128((const __m128i *) (&pSrc1[x])); vsrc2 = _mm_lddqu_si128((const __m128i *) (&pSrc2[x])); vsum = _mm_abs_epi32(_mm_sub_epi32(vsrc1, vsrc2)); vsum32 = _mm_add_epi32(vsum32, vsum); } pSrc1 += strideSrc1; pSrc2 += strideSrc2; } vsum32 = _mm_hadd_epi32(vsum32, vzero); vsum32 = _mm_hadd_epi32(vsum32, vzero); sum = _mm_cvtsi128_si32(vsum32); } else { __m128i vzero = _mm_setzero_si128(); __m128i vsum32 = vzero; __m128i vsrc10, vsrc20, vsrc11, vsrc21, vsum0, vsum1, vsum; ptrdiff_t i2StrideSrc1 = (strideSrc1 << 1); ptrdiff_t i2StrideSrc2 = (strideSrc2 << 1); for (int y = 0; y < rows; y += (subStep << 1)) { for (int x = 0; x < cols; x += 2) { vsrc10 = _mm_loadl_epi64((const __m128i *) (&pSrc1[x])); vsrc20 = _mm_loadl_epi64((const __m128i *) (&pSrc2[x])); vsum0 = _mm_abs_epi32(_mm_sub_epi32(vsrc10, vsrc20)); vsrc11 = _mm_loadl_epi64((const __m128i *) (&pSrc1[x + strideSrc1])); vsrc21 = _mm_loadl_epi64((const __m128i *) (&pSrc2[x + strideSrc2])); vsum1 = _mm_abs_epi32(_mm_sub_epi32(vsrc11, 
vsrc21)); vsum = _mm_unpacklo_epi32(vsum0, vsum1); vsum32 = _mm_add_epi32(vsum32, vsum); } pSrc1 += i2StrideSrc1; pSrc2 += i2StrideSrc2; } vsum32 = _mm_hadd_epi32(vsum32, vzero); vsum32 = _mm_hadd_epi32(vsum32, vzero); sum = _mm_cvtsi128_si32(vsum32); } } sum <<= subShift; return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } template< X86_VEXT vext > Distortion RdCost::xGetSADwMask_HBD_SIMD(const DistParam &rcDtParam) { CHECK((rcDtParam.org.width & 7), "the function only support width multiple of 8"); CHECK(rcDtParam.applyWeight, "the function does not support weighted distortion"); const Pel* src1 = rcDtParam.org.buf; const Pel* src2 = rcDtParam.cur.buf; const Pel* weightMask = rcDtParam.mask; int rows = rcDtParam.org.height; int cols = rcDtParam.org.width; int subShift = rcDtParam.subShift; int subStep = (1 << subShift); const ptrdiff_t strideSrc1 = rcDtParam.org.stride * subStep; const ptrdiff_t strideSrc2 = rcDtParam.cur.stride * subStep; const ptrdiff_t strideMask = rcDtParam.maskStride * subStep; Distortion sum = 0; #ifdef USE_AVX2 if (vext >= AVX2) { __m256i vzero = _mm256_setzero_si256(); __m256i vsum32 = vzero; for (int y = 0; y < rows; y += subStep) { for (int x = 0; x < cols; x += 8) { __m256i vsrc1 = _mm256_lddqu_si256((const __m256i*)(&src1[x])); __m256i vsrc2 = _mm256_lddqu_si256((const __m256i*)(&src2[x])); __m256i vmask, vsum; if (rcDtParam.stepX == -1) { vmask = _mm256_lddqu_si256((__m256i*)((&weightMask[x]) - (x << 1) - (8 - 1))); vmask = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(vmask, 0x1b), 0x4e); } else { vmask = _mm256_lddqu_si256((const __m256i*)(&weightMask[x])); } vsum = _mm256_mullo_epi32(vmask, _mm256_abs_epi32(_mm256_sub_epi32(vsrc1, vsrc2))); vsum32 = _mm256_add_epi32(vsum32, vsum); } src1 += strideSrc1; src2 += strideSrc2; weightMask += strideMask; } vsum32 = _mm256_add_epi32(vsum32, _mm256_permute4x64_epi64(vsum32, 0x4e)); vsum32 = _mm256_add_epi32(vsum32, _mm256_permute4x64_epi64(vsum32, 0xb1)); vsum32 = _mm256_add_epi32(vsum32, _mm256_shuffle_epi32(vsum32, 0x1b)); sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(vsum32)); } else #endif { __m128i vzero = _mm_setzero_si128(); __m128i vsum32 = vzero; for (int y = 0; y < rows; y += subStep) { for (int x = 0; x < cols; x += 8) { __m128i vsrc11 = _mm_lddqu_si128((const __m128i*)(&src1[x])); __m128i vsrc12 = _mm_lddqu_si128((const __m128i*)(&src1[x + 4])); __m128i vsrc21 = _mm_lddqu_si128((const __m128i*)(&src2[x])); __m128i vsrc22 = _mm_lddqu_si128((const __m128i*)(&src2[x + 4])); __m128i vmask1, vmask2, vsum1, vsum2; if (rcDtParam.stepX == -1) { vmask1 = _mm_lddqu_si128((__m128i*)((&weightMask[x]) - (x << 1) - (8 - 1) + 4)); vmask1 = _mm_shuffle_epi32(vmask1, 0x1b); vmask2 = _mm_lddqu_si128((__m128i*)((&weightMask[x]) - (x << 1) - (8 - 1))); vmask2 = _mm_shuffle_epi32(vmask2, 0x1b); } else { vmask1 = _mm_lddqu_si128((const __m128i*)(&weightMask[x])); vmask2 = _mm_lddqu_si128((const __m128i*)(&weightMask[x + 4])); } vsum1 = _mm_mullo_epi32(vmask1, _mm_abs_epi32(_mm_sub_epi32(vsrc11, vsrc21))); vsum2 = _mm_mullo_epi32(vmask2, _mm_abs_epi32(_mm_sub_epi32(vsrc12, vsrc22))); vsum32 = _mm_add_epi32(vsum32, vsum1); vsum32 = _mm_add_epi32(vsum32, vsum2); } src1 += strideSrc1; src2 += strideSrc2; weightMask += strideMask; } vsum32 = _mm_hadd_epi32(vsum32, vzero); vsum32 = _mm_hadd_epi32(vsum32, vzero); sum = _mm_cvtsi128_si32(vsum32); } sum <<= subShift; return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth); } template< X86_VEXT vext > Distortion RdCost::xGetSSE_HBD_SIMD(const DistParam& 
pcDtParam)
{
#ifndef FULL_NBIT
#error the function only supports full bit-depth
#endif
  CHECK(pcDtParam.applyWeight, "the function does not support weighted SSE");
  const Pel* piOrg = pcDtParam.org.buf;
  const Pel* piCur = pcDtParam.cur.buf;
  int rows = pcDtParam.org.height;
  int cols = pcDtParam.org.width;
  ptrdiff_t strideCur = pcDtParam.cur.stride;
  ptrdiff_t strideOrg = pcDtParam.org.stride;
  Distortion sum = 0;
#ifdef USE_AVX2
  if ((vext >= AVX2) && ((cols & 7) == 0))
  {
    __m256i vsum = _mm256_setzero_si256();
    for (int y = 0; y < rows; y++)
    {
      for (int x = 0; x < cols; x += 8)
      {
        __m256i vorg = _mm256_lddqu_si256((const __m256i *) (&piOrg[x]));
        __m256i vcur = _mm256_lddqu_si256((const __m256i *) (&piCur[x]));
        __m256i vtemp = _mm256_sub_epi32(vorg, vcur);
        vsum = _mm256_add_epi64(vsum, _mm256_mul_epi32(vtemp, vtemp));
        vorg = _mm256_srli_si256(vorg, 4);
        vcur = _mm256_srli_si256(vcur, 4);
        vtemp = _mm256_sub_epi32(vorg, vcur);
        vsum = _mm256_add_epi64(vsum, _mm256_mul_epi32(vtemp, vtemp));
      }
      piOrg += strideOrg;
      piCur += strideCur;
    }
    sum += _mm256_extract_epi64(vsum, 0) + _mm256_extract_epi64(vsum, 1) + _mm256_extract_epi64(vsum, 2) + _mm256_extract_epi64(vsum, 3);
  }
  else
#endif
  {
    if ((cols & 3) == 0)
    {
      __m128i vsum = _mm_setzero_si128();
      for (int y = 0; y < rows; y++)
      {
        for (int x = 0; x < cols; x += 4)
        {
          __m128i vorg = _mm_lddqu_si128((const __m128i *) (&piOrg[x]));
          __m128i vcur = _mm_lddqu_si128((const __m128i *) (&piCur[x]));
          __m128i vtemp = _mm_sub_epi32(vorg, vcur);
          vsum = _mm_add_epi64(vsum, _mm_mul_epi32(vtemp, vtemp));
          vorg = _mm_srli_si128(vorg, 4);
          vcur = _mm_srli_si128(vcur, 4);
          vtemp = _mm_sub_epi32(vorg, vcur);
          vsum = _mm_add_epi64(vsum, _mm_mul_epi32(vtemp, vtemp));
        }
        piOrg += strideOrg;
        piCur += strideCur;
      }
      sum += _mm_extract_epi64(vsum, 0) + _mm_extract_epi64(vsum, 1);
    }
    else if ((cols & 1) == 0)
    {
      __m128i vsum = _mm_setzero_si128();
      for (int y = 0; y < rows; y++)
      {
        for (int x = 0; x < cols; x += 2)
        {
          __m128i vorg = _mm_loadl_epi64((const __m128i *) (&piOrg[x]));
          __m128i vcur = _mm_loadl_epi64((const __m128i *) (&piCur[x]));
          vorg = _mm_shuffle_epi32(vorg, 0xd8);
          vcur = _mm_shuffle_epi32(vcur, 0xd8);
          __m128i vtemp = _mm_sub_epi32(vorg, vcur);
          vsum = _mm_add_epi64(vsum, _mm_mul_epi32(vtemp, vtemp));
        }
        piOrg += strideOrg;
        piCur += strideCur;
      }
      sum += _mm_extract_epi64(vsum, 0) + _mm_extract_epi64(vsum, 1);
    }
    else
    {
      Intermediate_Int temp;
      for (int y = 0; y < rows; y++)
      {
        for (int x = 0; x < cols; x++)
        {
          temp = piOrg[x] - piCur[x];
          sum += Distortion(temp * temp);
        }
        piOrg += strideOrg;
        piCur += strideCur;
      }
    }
  }
  return sum;
}
#else
template< X86_VEXT vext >
Distortion RdCost::xGetHADs_SIMD( const DistParam &rcDtParam )
{
  if( rcDtParam.bitDepth > 10 || rcDtParam.applyWeight )
  {
    return RdCost::xGetHADs( rcDtParam );
  }
  const Pel* piOrg = rcDtParam.org.buf;
  const Pel* piCur = rcDtParam.cur.buf;
  const int rows = rcDtParam.org.height;
  const int cols = rcDtParam.org.width;
  const ptrdiff_t strideCur = rcDtParam.cur.stride;
  const ptrdiff_t strideOrg = rcDtParam.org.stride;
  const int iBitDepth = rcDtParam.bitDepth;
  int x, y;
  Distortion sum = 0;
  // Dispatch to the block-size specific Hadamard kernels.
  if (cols > rows && (cols & 15) == 0 && (rows & 7) == 0)
  {
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 16)
      {
        if( vext >= AVX2 )
        {
          sum += xCalcHAD16x8_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
        }
        else
        {
          sum += xCalcHAD16x8_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
        }
      }
      piOrg += strideOrg * 8;
      piCur += strideCur * 8;
    }
  }
  else if (cols < rows && (rows & 15) == 0 && (cols & 7) == 0)
  {
    for (y = 0; y < rows; y += 16)
    {
      for (x = 0; x < cols; x += 8)
      {
        if( vext >= AVX2 )
        {
          sum += xCalcHAD8x16_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
        }
        else
        {
          sum += xCalcHAD8x16_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
        }
      }
      piOrg += strideOrg * 16;
      piCur += strideCur * 16;
    }
  }
  else if (cols > rows && (cols & 7) == 0 && (rows & 3) == 0)
  {
    for (y = 0; y < rows; y += 4)
    {
      for (x = 0; x < cols; x += 8)
      {
        sum += xCalcHAD8x4_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
      }
      piOrg += strideOrg * 4;
      piCur += strideCur * 4;
    }
  }
  else if (cols < rows && (rows & 7) == 0 && (cols & 3) == 0)
  {
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 4)
      {
        sum += xCalcHAD4x8_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
      }
      piOrg += strideOrg * 8;
      piCur += strideCur * 8;
    }
  }
  else if (vext >= AVX2 && (((rows | cols) & 15) == 0) && (rows == cols))
  {
    ptrdiff_t offsetOrg = strideOrg << 4;
    ptrdiff_t offsetCur = strideCur << 4;
    for (y = 0; y < rows; y += 16)
    {
      for (x = 0; x < cols; x += 16)
      {
        sum += xCalcHAD16x16_AVX2(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
      }
      piOrg += offsetOrg;
      piCur += offsetCur;
    }
  }
  else if ((((rows | cols) & 7) == 0) && (rows == cols))
  {
    ptrdiff_t offsetOrg = strideOrg << 3;
    ptrdiff_t offsetCur = strideCur << 3;
    for (y = 0; y < rows; y += 8)
    {
      for (x = 0; x < cols; x += 8)
      {
        sum += xCalcHAD8x8_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur, iBitDepth);
      }
      piOrg += offsetOrg;
      piCur += offsetCur;
    }
  }
  else if ((rows % 4 == 0) && (cols % 4 == 0))
  {
    ptrdiff_t offsetOrg = strideOrg << 2;
    ptrdiff_t offsetCur = strideCur << 2;
    for (y = 0; y < rows; y += 4)
    {
      for (x = 0; x < cols; x += 4)
      {
        sum += xCalcHAD4x4_SSE(&piOrg[x], &piCur[x], strideOrg, strideCur);
      }
      piOrg += offsetOrg;
      piCur += offsetCur;
    }
  }
  else if ((rows % 2 == 0) && (cols % 2 == 0))
  {
    ptrdiff_t offsetOrg = strideOrg << 1;
    ptrdiff_t offsetCur = strideCur << 1;
    for (y = 0; y < rows; y += 2)
    {
      for (x = 0; x < cols; x += 2)
      {
        sum += xCalcHADs2x2(&piOrg[x], &piCur[x * rcDtParam.step], strideOrg, strideCur, rcDtParam.step);
      }
      piOrg += offsetOrg;
      piCur += offsetCur;
    }
  }
  else
  {
    THROW( "Unsupported size" );
  }
  return sum >> DISTORTION_PRECISION_ADJUSTMENT(rcDtParam.bitDepth);
}
#endif

template< X86_VEXT vext >
void RdCost::_initRdCostX86()
{
#if RExt__HIGH_BIT_DEPTH_SUPPORT
  // Wire the high bit-depth SIMD kernels into the distortion function table.
  m_distortionFunc[DFunc::SAD]    = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD2]   = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD4]   = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD8]   = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD16]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD32]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD64]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD16N] = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD12]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD24]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD48]  = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD_INTERMEDIATE_BITDEPTH] = xGetSAD_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD_WITH_MASK] = xGetSADwMask_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD]    = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD2]   = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD4]   = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD8]   = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD16]  = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD32]  = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD64]  = xGetHADs_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD16N] = xGetHADs_HBD_SIMD<vext>;
#if FULL_NBIT
  m_distortionFunc[DFunc::SSE]    = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE2]   = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE4]   = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE8]   = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE16]  = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE32]  = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE64]  = xGetSSE_HBD_SIMD<vext>;
  m_distortionFunc[DFunc::SSE16N] = xGetSSE_HBD_SIMD<vext>;
#endif
#else
  // Standard bit-depth SIMD kernels.
  m_distortionFunc[DFunc::SSE]    = xGetSSE_SIMD<vext>;
  m_distortionFunc[DFunc::SSE2]   = xGetSSE_NxN_SIMD<2, vext>;
  m_distortionFunc[DFunc::SSE4]   = xGetSSE_NxN_SIMD<4, vext>;
  m_distortionFunc[DFunc::SSE8]   = xGetSSE_NxN_SIMD<8, vext>;
  m_distortionFunc[DFunc::SSE16]  = xGetSSE_NxN_SIMD<16, vext>;
  m_distortionFunc[DFunc::SSE32]  = xGetSSE_NxN_SIMD<32, vext>;
  m_distortionFunc[DFunc::SSE64]  = xGetSSE_NxN_SIMD<64, vext>;
  m_distortionFunc[DFunc::SSE16N] = xGetSSE_SIMD<vext>;
  m_distortionFunc[DFunc::SAD]    = xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD2]   = xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD4]   = xGetSAD_NxN_SIMD<4, vext>;
  m_distortionFunc[DFunc::SAD8]   = xGetSAD_NxN_SIMD<8, vext>;
  m_distortionFunc[DFunc::SAD16]  = xGetSAD_NxN_SIMD<16, vext>;
  m_distortionFunc[DFunc::SAD32]  = xGetSAD_NxN_SIMD<32, vext>;
  m_distortionFunc[DFunc::SAD64]  = xGetSAD_NxN_SIMD<64, vext>;
  m_distortionFunc[DFunc::SAD16N] = xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD12]  = RdCost::xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD24]  = RdCost::xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD48]  = RdCost::xGetSAD_SIMD<vext>;
  m_distortionFunc[DFunc::HAD]    = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD2]   = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD4]   = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD8]   = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD16]  = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD32]  = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD64]  = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::HAD16N] = RdCost::xGetHADs_SIMD<vext>;
  m_distortionFunc[DFunc::SAD_INTERMEDIATE_BITDEPTH] = RdCost::xGetSAD_IBD_SIMD<vext>;
  m_distortionFunc[DFunc::SAD_WITH_MASK] = xGetSADwMask_SIMD<vext>;
#endif
}

// Explicit instantiation for the SIMD extension level this translation unit is built for.
template void RdCost::_initRdCostX86<SIMDX86>();

#endif //#if TARGET_SIMD_X86

//! \}
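// ---------------------------------------------------------------------------
// Editor's note (illustrative only, not part of the original implementation):
// the xCalcHAD*_SSE/_AVX2 kernels above compute a Hadamard-transformed SAD
// (SATD): the residual block is passed through horizontal and vertical
// butterfly stages (a+b / a-b pairs), the absolute transformed coefficients
// are summed, and non-square kernels such as 16x8 and 8x16 additionally scale
// the total by 1/sqrt(2) in fixed point via "sad * INV_SQRT_2 >> 32", which
// presumes INV_SQRT_2 is approximately 2^32 / sqrt(2).  The scalar sketch
// below, with the hypothetical name satd4x4Ref, shows the same idea for a
// 4x4 block; it omits the final rounding and the JVET_R0164 mean-scaled DC
// adjustment applied by the real kernels and is not used anywhere in this
// file.
//
//   static inline uint32_t satd4x4Ref(const Pel *org, const Pel *cur,
//                                     ptrdiff_t strideOrg, ptrdiff_t strideCur)
//   {
//     int d[16], m[16];
//     for (int y = 0; y < 4; y++)
//     {
//       for (int x = 0; x < 4; x++)
//       {
//         d[4 * y + x] = org[y * strideOrg + x] - cur[y * strideCur + x];
//       }
//     }
//     // horizontal 4-point Hadamard butterfly on each row
//     for (int r = 0; r < 16; r += 4)
//     {
//       const int a = d[r] + d[r + 2], b = d[r + 1] + d[r + 3];
//       const int c = d[r] - d[r + 2], e = d[r + 1] - d[r + 3];
//       m[r] = a + b;  m[r + 1] = a - b;  m[r + 2] = c + e;  m[r + 3] = c - e;
//     }
//     // vertical 4-point Hadamard butterfly on each column, then sum |coeff|
//     uint32_t sum = 0;
//     for (int col = 0; col < 4; col++)
//     {
//       const int a = m[col] + m[col + 8], b = m[col + 4] + m[col + 12];
//       const int c = m[col] - m[col + 8], e = m[col + 4] - m[col + 12];
//       sum += abs(a + b) + abs(a - b) + abs(c + e) + abs(c - e);
//     }
//     return sum;
//   }
// ---------------------------------------------------------------------------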