/* The copyright in this software is being made available under the BSD
 * License, included below. This software may be subject to other third party
 * and contributor rights, including patent rights, and no such rights are
 * granted under this license.
 *
 * Copyright (c) 2010-2023, ITU/ISO/IEC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/** \file     CommonDefX86.h
    \brief    common x86 SIMD helper macros and inline functions
*/

#ifndef __COMMONDEFX86__
#define __COMMONDEFX86__


#include "CommonLib/CommonDef.h"

//! \ingroup CommonLib
//! \{

#ifdef TARGET_SIMD_X86

#include <immintrin.h>

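// map the strongest enabled USE_* build flag to a single token naming the
// active x86 SIMD level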
#ifdef USE_AVX512
#define SIMDX86 AVX512
#elif defined USE_AVX2
#define SIMDX86 AVX2
#elif defined USE_AVX
#define SIMDX86 AVX
#elif defined USE_SSE42
#define SIMDX86 SSE42
#elif defined USE_SSE41
#define SIMDX86 SSE41
#endif


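// transpose a 4x4 matrix of 32-bit elements held in four __m128i registers,
// using one 32-bit and one 64-bit unpack round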
#define TRANSPOSE4x4(T) \
{\
  __m128i a01b01 = _mm_unpacklo_epi32(T[0], T[1]);\
  __m128i a23b23 = _mm_unpackhi_epi32(T[0], T[1]);\
  __m128i c01d01 = _mm_unpacklo_epi32(T[2], T[3]);\
  __m128i c23d23 = _mm_unpackhi_epi32(T[2], T[3]);\
\
  T[0] = _mm_unpacklo_epi64(a01b01, c01d01);\
  T[1] = _mm_unpackhi_epi64(a01b01, c01d01);\
  T[2] = _mm_unpacklo_epi64(a23b23, c23d23);\
  T[3] = _mm_unpackhi_epi64(a23b23, c23d23);\
}

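// transpose an 8x8 matrix of 16-bit elements held in eight __m128i registers,
// via three unpack rounds (16-, 32- and 64-bit granularity)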
#define TRANSPOSE8x8(T) \
{\
  __m128i a03b03 = _mm_unpacklo_epi16(T[0], T[1]);\
  __m128i c03d03 = _mm_unpacklo_epi16(T[2], T[3]);\
  __m128i e03f03 = _mm_unpacklo_epi16(T[4], T[5]);\
  __m128i g03h03 = _mm_unpacklo_epi16(T[6], T[7]);\
  __m128i a47b47 = _mm_unpackhi_epi16(T[0], T[1]);\
  __m128i c47d47 = _mm_unpackhi_epi16(T[2], T[3]);\
  __m128i e47f47 = _mm_unpackhi_epi16(T[4], T[5]);\
  __m128i g47h47 = _mm_unpackhi_epi16(T[6], T[7]);\
\
  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);\
  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);\
  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);\
  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);\
  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);\
  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);\
  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);\
  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);\
\
  T[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);\
  T[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);\
  T[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);\
  T[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);\
  T[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);\
  T[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);\
  T[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);\
  T[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);\
}

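// 4x4 transpose of 32-bit elements followed by saturation of each lane to the
// signed 16-bit range (pack with saturation, then sign-extend back to 32 bit);
// expects an all-zero __m128i named `vzero` in the enclosing scope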
#define TRANSPOSESAT4x4(T)\
{\
  TRANSPOSE4x4(T);\
  T[0] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[0], vzero));\
  T[1] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[1], vzero));\
  T[2] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[2], vzero));\
  T[3] = _mm_cvtepi16_epi32(_mm_packs_epi32(T[3], vzero));\
}

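// add four 32-bit residuals in `res` (saturated to 16 bit) to the 16-bit
// samples at dstptr, clamp to [min, max] and store the low four results back;
// dstptr may be unaligned (lddqu); expects `vzero` in the enclosing scope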
#define ADDCLIP4(dstptr, res, min, max)\
{\
  __m128i vdst = _mm_lddqu_si128((__m128i*) dstptr);\
  vdst = _mm_add_epi16(vdst,  _mm_packs_epi32(res, vzero));\
  vdst = _mm_min_epi16(max, _mm_max_epi16(min, vdst));\
  _mm_storel_epi64((__m128i*) dstptr, vdst);\
}

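// transpose T in place and store its eight rows to D with the given stride;
// the row stores are aligned (_mm_store_si128), so D and stride must preserve
// 16-byte alignment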
#define TRANSPOSESTORE8x8_ALGN(T, D, stride)\
{\
  TRANSPOSE8x8(T); \
\
  _mm_store_si128((__m128i*)&D[0*stride], T[0]);\
  _mm_store_si128((__m128i*)&D[1*stride], T[1]);\
  _mm_store_si128((__m128i*)&D[2*stride], T[2]);\
  _mm_store_si128((__m128i*)&D[3*stride], T[3]);\
  _mm_store_si128((__m128i*)&D[4*stride], T[4]);\
  _mm_store_si128((__m128i*)&D[5*stride], T[5]);\
  _mm_store_si128((__m128i*)&D[6*stride], T[6]);\
  _mm_store_si128((__m128i*)&D[7*stride], T[7]);\
}

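// dst = clip(dst + res, min, max) on eight 16-bit lanes; dstptr must be
// 16-byte aligned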
#define ADDCLIP(dstptr, res, min, max)\
{\
  __m128i vdst = _mm_load_si128((__m128i*) dstptr);\
  vdst = _mm_add_epi16(vdst, res ); \
  vdst = _mm_min_epi16(max,_mm_max_epi16(min, vdst));\
  _mm_store_si128((__m128i*)dstptr, vdst);\
}

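// transpose T in place, then add-and-clip each row into D (aligned)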
#define TRANSPOSEADDCLIPSTORE8x8_ALGN(T, D, stride, min, max)\
{\
  TRANSPOSE8x8(T); \
\
  ADDCLIP(&D[0*stride], T[0], min, max);\
  ADDCLIP(&D[1*stride], T[1], min, max);\
  ADDCLIP(&D[2*stride], T[2], min, max);\
  ADDCLIP(&D[3*stride], T[3], min, max);\
  ADDCLIP(&D[4*stride], T[4], min, max);\
  ADDCLIP(&D[5*stride], T[5], min, max);\
  ADDCLIP(&D[6*stride], T[6], min, max);\
  ADDCLIP(&D[7*stride], T[7], min, max);\
}


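// bitwise select: returns the bits of b where mask is set and the bits of a
// elsewhere; the SSE4.1 per-byte blend and the and/andnot/or fallback agree
// when mask bytes are all-ones or all-zeros (e.g. comparison results)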
static inline __m128i _mm_sel_si128(__m128i a, __m128i b, __m128i mask)
{
#ifdef USE_SSE41
  return _mm_blendv_epi8( a, b, mask);
#else
  return _mm_or_si128( _mm_andnot_si128( mask, a ), _mm_and_si128( b, mask ));
#endif
}


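// clamp each signed 8-bit lane of v to [low, hi]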
static inline __m128i _mm_clip_epi8(__m128i v, __m128i low, __m128i hi)
{
#ifdef USE_SSE41
  return _mm_min_epi8(_mm_max_epi8(v, low), hi);
#else
  __m128i vlowm = _mm_cmplt_epi8(v, low);
  __m128i vhighm = _mm_cmpgt_epi8(v, hi);
  return _mm_sel_si128(_mm_sel_si128(v, low, vlowm), hi, vhighm);
#endif
}


#ifdef USE_AVX2


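// transpose T in place and store its 16 rows to D (32-byte aligned) with the
// given stride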
#define TRANSPOSESTORE16x16_ALGN(T, D, stride)\
{\
  TRANSPOSE16x16_AVX2(T); \
\
  for (int _i_=0; _i_ < 16; _i_++)\
    _mm256_store_si256((__m256i*)&D[_i_*stride], T[_i_]);\
}

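// dst = clip(dst + res, min, max) on sixteen 16-bit lanes; note the
// saturating add (_mm256_adds_epi16), unlike the plain add in ADDCLIP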
#define ADDCLIPAVX2(dstptr, res, min, max)\
{\
  __m256i vdst = _mm256_load_si256((__m256i*) dstptr);\
  vdst = _mm256_adds_epi16(vdst, res ); \
  vdst = _mm256_min_epi16(max,_mm256_max_epi16(min, vdst));\
  _mm256_store_si256((__m256i*)dstptr, vdst);\
}

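// transpose T in place, then add-and-clip each row into D (aligned)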
#define TRANSPOSEADDCLIPSTORE16x16_ALGN(T, D, stride, min, max)\
{\
  TRANSPOSE16x16_AVX2(T); \
\
  for (int _i_=0; _i_ < 16; _i_++)\
    ADDCLIPAVX2(&D[_i_*stride], T[_i_], min, max);\
}

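// in-place transpose of a 16x16 matrix of 16-bit elements held in sixteen
// __m256i: three unpack rounds transpose each 128-bit lane, and the final
// _mm256_permute2x128_si256 recombines the lane halves into full rows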
static inline void TRANSPOSE16x16_AVX2(__m256i T[16])
{
  __m256i T_03[8];
  __m256i T_47[8];
  for (int i = 0; i < 8; i++)
  {
    T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]);
    T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]);
  }

  __m256i T_01[4];
  __m256i T_23[4];
  __m256i T_45[4];
  __m256i T_67[4];
  for (int i = 0; i < 4; i++)
  {
    T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]);
    T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]);
    T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]);
    T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]);
  }
  __m256i TR[8][2];

  for (int i = 0; i < 2; i++)
  {
    TR[0][i] = _mm256_unpacklo_epi64(T_01[2*i], T_01[2*i+1]);
    TR[1][i] = _mm256_unpackhi_epi64(T_01[2*i], T_01[2*i+1]);
    TR[2][i] = _mm256_unpacklo_epi64(T_23[2*i], T_23[2*i+1]);
    TR[3][i] = _mm256_unpackhi_epi64(T_23[2*i], T_23[2*i+1]);
    TR[4][i] = _mm256_unpacklo_epi64(T_45[2*i], T_45[2*i+1]);
    TR[5][i] = _mm256_unpackhi_epi64(T_45[2*i], T_45[2*i+1]);
    TR[6][i] = _mm256_unpacklo_epi64(T_67[2*i], T_67[2*i+1]);
    TR[7][i] = _mm256_unpackhi_epi64(T_67[2*i], T_67[2*i+1]);
  }
  for (int i = 0; i < 8; i++)
  {
    T[i] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x20);
    T[i+8] = _mm256_permute2x128_si256(TR[i][0], TR[i][1], 0x31);
  }
}

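// transpose a 16-column x 8-row block of 16-bit elements held in eight
// __m256i; no cross-lane permute is needed: on output, T[i] holds transposed
// row i in its low 128-bit lane and transposed row i+8 in its high lane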
static inline void TRANSPOSE16x8_AVX2(__m256i T[8])
{
  __m256i T_03[4];
  __m256i T_47[4];
  for (int i = 0; i < 4; i++)
  {
    T_03[i] = _mm256_unpacklo_epi16(T[2*i], T[2*i+1]);
    T_47[i] = _mm256_unpackhi_epi16(T[2*i], T[2*i+1]);
  }

  __m256i T_01[2];
  __m256i T_23[2];
  __m256i T_45[2];
  __m256i T_67[2];
  for (int i = 0; i < 2; i++)
  {
    T_01[i] = _mm256_unpacklo_epi32(T_03[2*i], T_03[2*i+1]);
    T_23[i] = _mm256_unpackhi_epi32(T_03[2*i], T_03[2*i+1]);
    T_45[i] = _mm256_unpacklo_epi32(T_47[2*i], T_47[2*i+1]);
    T_67[i] = _mm256_unpackhi_epi32(T_47[2*i], T_47[2*i+1]);
  }
  T[0] = _mm256_unpacklo_epi64(T_01[0], T_01[1]);
  T[1] = _mm256_unpackhi_epi64(T_01[0], T_01[1]);
  T[2] = _mm256_unpacklo_epi64(T_23[0], T_23[1]);
  T[3] = _mm256_unpackhi_epi64(T_23[0], T_23[1]);
  T[4] = _mm256_unpacklo_epi64(T_45[0], T_45[1]);
  T[5] = _mm256_unpackhi_epi64(T_45[0], T_45[1]);
  T[6] = _mm256_unpacklo_epi64(T_67[0], T_67[1]);
  T[7] = _mm256_unpackhi_epi64(T_67[0], T_67[1]);
}

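// in-place transpose of an 8x8 matrix of 32-bit elements held in eight
// __m256i: two unpack rounds plus a cross-lane permute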
static inline void TRANSPOSE8x8_32b_AVX2(__m256i T[8])
{
  __m256i T_03[4];
  __m256i T_47[4];
  for (int i=0; i<4; i++){
    T_03[i] = _mm256_unpacklo_epi32(T[2*i], T[2*i+1]);
    T_47[i] = _mm256_unpackhi_epi32(T[2*i], T[2*i+1]);
  }
  __m256i T_01[2];
  __m256i T_23[2];
  __m256i T_45[2];
  __m256i T_67[2];
  for (int i = 0; i < 2; i++)
  {
    T_01[i] = _mm256_unpacklo_epi64(T_03[2*i], T_03[2*i+1]);
    T_23[i] = _mm256_unpackhi_epi64(T_03[2*i], T_03[2*i+1]);
    T_45[i] = _mm256_unpacklo_epi64(T_47[2*i], T_47[2*i+1]);
    T_67[i] = _mm256_unpackhi_epi64(T_47[2*i], T_47[2*i+1]);
  }

  T[0] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x20);
  T[4] = _mm256_permute2x128_si256(T_01[0], T_01[1], 0x31);
  T[1] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x20);
  T[5] = _mm256_permute2x128_si256(T_23[0], T_23[1], 0x31);
  T[2] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x20);
  T[6] = _mm256_permute2x128_si256(T_45[0], T_45[1], 0x31);
  T[3] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x20);
  T[7] = _mm256_permute2x128_si256(T_67[0], T_67[1], 0x31);
}


#endif

#ifdef USE_AVX512

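// fallback for toolchains whose <immintrin.h> does not provide the
// _mm512_set_epi16 intrinsic: packs adjacent 16-bit arguments into 32-bit
// lanes and defers to _mm512_set_epi32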
ALWAYS_INLINE inline __m512i
_mm512_set_epi16( int16_t x31, int16_t x30, int16_t x29, int16_t x28,
                  int16_t x27, int16_t x26, int16_t x25, int16_t x24,
                  int16_t x23, int16_t x22, int16_t x21, int16_t x20,
                  int16_t x19, int16_t x18, int16_t x17, int16_t x16,
                  int16_t x15, int16_t x14, int16_t x13, int16_t x12,
                  int16_t x11, int16_t x10,  int16_t x9,  int16_t x8,
                  int16_t  x7,  int16_t x6,  int16_t x5,  int16_t x4,
                  int16_t  x3,  int16_t x2,  int16_t x1,  int16_t x0 )
{
  return _mm512_set_epi32( (x31<<16) + (0xffff & x30), (x29<<16) + (0xffff & x28),
                           (x27<<16) + (0xffff & x26), (x25<<16) + (0xffff & x24),
                           (x23<<16) + (0xffff & x22), (x21<<16) + (0xffff & x20),
                           (x19<<16) + (0xffff & x18), (x17<<16) + (0xffff & x16),
                           (x15<<16) + (0xffff & x14), (x13<<16) + (0xffff & x12),
                           (x11<<16) + (0xffff & x10), ( x9<<16) + (0xffff &  x8),
                           ( x7<<16) + (0xffff &  x6), ( x5<<16) + (0xffff &  x4),
                           ( x3<<16) + (0xffff &  x2), ( x1<<16) + (0xffff &  x0) );
}
#endif

#ifdef ENABLE_REGISTER_PRINTING

#include <algorithm>
#include <iomanip>
#include <iostream>

/* note for gcc: this helper triggers a compilation error due to name
 * mangling when it is instantiated with different types for R in the same
 * translation unit; the workaround is to compile with -fabi-version=4 or
 * higher (needs gcc >= 4.5) */
template< class T, class R >
static void _printReg( const R var, const char* varname, uint8_t count = sizeof( R ) )
{
    T *val = (T*)&var;
    const int varcnt = std::min(count, (uint8_t)(sizeof(R)/sizeof(T)));
    std::cout << varname << ":";
    for( int i = 0; i < varcnt; ++i )
    {
        std::cout << " " << std::setw(sizeof(T)*2 + 1)<< val[i];
    }
    std::cout << std::endl;
}

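/* PREG prints SIMD register `var`, interpreted as elements of type `t`, for
 * the first `cnt` executions of the statement, each print prefixed with its
 * invocation index; e.g. PREG( vsum, int16_t, 4 ) would print the 16-bit
 * lanes of a (hypothetical) register `vsum` the first four times the line
 * is reached */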
#define PREG( var, t, cnt ) \
{ \
  static unsigned c = cnt; \
  if( c ) \
  { \
    std::cout << cnt - c << " "; \
    _printReg<t>( var, #var ); \
    c--;  \
  } \
}
#else
#define PREG( var, t, cnt )
#endif

#endif // TARGET_SIMD_X86

#endif // __COMMONDEFX86__