Line data Source code
1 : /*
2 : * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
13 : #define AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
14 :
15 : #include <immintrin.h>
16 :
17 : #include "aom_dsp/txfm_common.h"
18 : #include "aom_dsp/x86/txfm_common_avx2.h"
19 :
20 0 : static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
21 : #if CONFIG_HIGHBITDEPTH
22 0 : *in = _mm256_setr_epi16(
23 0 : (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
24 0 : (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
25 0 : (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
26 0 : (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
27 0 : (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
28 0 : (int16_t)coeff[15]);
29 : #else
30 : *in = _mm256_loadu_si256((const __m256i *)coeff);
31 : #endif
32 0 : }
33 :
34 0 : static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
35 0 : int i = 0;
36 0 : while (i < 16) {
37 0 : load_coeff(coeff + (i << 4), &in[i]);
38 0 : i += 1;
39 : }
40 0 : }
41 :
42 0 : static INLINE void recon_and_store(const __m256i *res, uint8_t *output) {
43 0 : const __m128i zero = _mm_setzero_si128();
44 0 : __m128i x = _mm_loadu_si128((__m128i const *)output);
45 0 : __m128i p0 = _mm_unpacklo_epi8(x, zero);
46 0 : __m128i p1 = _mm_unpackhi_epi8(x, zero);
47 :
48 0 : p0 = _mm_add_epi16(p0, _mm256_castsi256_si128(*res));
49 0 : p1 = _mm_add_epi16(p1, _mm256_extractf128_si256(*res, 1));
50 0 : x = _mm_packus_epi16(p0, p1);
51 : _mm_storeu_si128((__m128i *)output, x);
52 0 : }
53 :
54 : #define IDCT_ROUNDING_POS (6)
55 0 : static INLINE void store_buffer_16xN(__m256i *in, const int stride,
56 : uint8_t *output, int num) {
57 0 : const __m256i rounding = _mm256_set1_epi16(1 << (IDCT_ROUNDING_POS - 1));
58 0 : int i = 0;
59 :
60 0 : while (i < num) {
61 0 : in[i] = _mm256_adds_epi16(in[i], rounding);
62 0 : in[i] = _mm256_srai_epi16(in[i], IDCT_ROUNDING_POS);
63 0 : recon_and_store(&in[i], output + i * stride);
64 0 : i += 1;
65 : }
66 0 : }
67 :
68 0 : static INLINE void unpack_butter_fly(const __m256i *a0, const __m256i *a1,
69 : const __m256i *c0, const __m256i *c1,
70 : __m256i *b0, __m256i *b1) {
71 : __m256i x0, x1;
72 0 : x0 = _mm256_unpacklo_epi16(*a0, *a1);
73 0 : x1 = _mm256_unpackhi_epi16(*a0, *a1);
74 0 : *b0 = butter_fly(&x0, &x1, c0);
75 0 : *b1 = butter_fly(&x0, &x1, c1);
76 0 : }
77 :
78 : void av1_idct16_avx2(__m256i *in);
79 :
80 : #endif // AOM_DSP_X86_INV_TXFM_COMMON_AVX2_H
|