Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_BLEND_SSE4_H_
13 : #define AOM_DSP_X86_BLEND_SSE4_H_
14 :
15 : #include "aom_dsp/blend.h"
16 : #include "aom_dsp/x86/synonyms.h"
17 :
18 : //////////////////////////////////////////////////////////////////////////////
19 : // Common kernels
20 : //////////////////////////////////////////////////////////////////////////////
21 :
22 0 : static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
23 : const __m128i v_m0_w, const __m128i v_m1_w) {
24 0 : const __m128i v_s0_b = xx_loadl_32(src0);
25 0 : const __m128i v_s1_b = xx_loadl_32(src1);
26 0 : const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
27 0 : const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
28 :
29 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
30 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
31 :
32 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
33 :
34 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
35 :
36 0 : return v_res_w;
37 : }
38 :
39 0 : static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
40 : const __m128i v_m0_w, const __m128i v_m1_w) {
41 0 : const __m128i v_s0_b = xx_loadl_64(src0);
42 0 : const __m128i v_s1_b = xx_loadl_64(src1);
43 0 : const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
44 0 : const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
45 :
46 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
47 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
48 :
49 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
50 :
51 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
52 :
53 0 : return v_res_w;
54 : }
55 :
56 : #if CONFIG_HIGHBITDEPTH
// Function-pointer type whose signature matches the high-bitdepth blend
// kernels defined in this header (blend_4_b10, blend_8_b10, blend_4_b12 and
// blend_8_b12): two 16-bit source pointers plus one weight vector per source,
// returning the blended vector.
// NOTE(review): presumably lets callers select a kernel by width/bit depth at
// run time -- the call sites are not visible from this header.
57 : typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
58 : const __m128i v_m0_w, const __m128i v_m1_w);
59 :
60 0 : static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
61 : const __m128i v_m0_w, const __m128i v_m1_w) {
62 0 : const __m128i v_s0_w = xx_loadl_64(src0);
63 0 : const __m128i v_s1_w = xx_loadl_64(src1);
64 :
65 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
66 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
67 :
68 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
69 :
70 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
71 :
72 0 : return v_res_w;
73 : }
74 :
75 0 : static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
76 : const __m128i v_m0_w, const __m128i v_m1_w) {
77 0 : const __m128i v_s0_w = xx_loadu_128(src0);
78 0 : const __m128i v_s1_w = xx_loadu_128(src1);
79 :
80 0 : const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
81 0 : const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
82 :
83 0 : const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
84 :
85 0 : const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
86 :
87 0 : return v_res_w;
88 : }
89 :
90 0 : static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
91 : const __m128i v_m0_w, const __m128i v_m1_w) {
92 0 : const __m128i v_s0_w = xx_loadl_64(src0);
93 0 : const __m128i v_s1_w = xx_loadl_64(src1);
94 :
95 : // Interleave
96 0 : const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
97 0 : const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
98 :
99 : // Multiply-Add
100 0 : const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
101 :
102 : // Scale
103 0 : const __m128i v_ssum_d =
104 : _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
105 :
106 : // Pack
107 0 : const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
108 :
109 : // Round
110 0 : const __m128i v_res_w = xx_round_epu16(v_pssum_d);
111 :
112 0 : return v_res_w;
113 : }
114 :
115 0 : static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
116 : const __m128i v_m0_w, const __m128i v_m1_w) {
117 0 : const __m128i v_s0_w = xx_loadu_128(src0);
118 0 : const __m128i v_s1_w = xx_loadu_128(src1);
119 :
120 : // Interleave
121 0 : const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
122 0 : const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
123 0 : const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
124 0 : const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
125 :
126 : // Multiply-Add
127 0 : const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
128 0 : const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
129 :
130 : // Scale
131 0 : const __m128i v_ssuml_d =
132 : _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
133 0 : const __m128i v_ssumh_d =
134 : _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
135 :
136 : // Pack
137 0 : const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
138 :
139 : // Round
140 0 : const __m128i v_res_w = xx_round_epu16(v_pssum_d);
141 :
142 0 : return v_res_w;
143 : }
144 : #endif // CONFIG_HIGHBITDEPTH
145 :
146 : #endif // AOM_DSP_X86_BLEND_SSE4_H_
|