Line data Source code
1 :
2 : /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
3 : *
4 : * Copyright (c) 2016-2017 Glenn Randers-Pehrson
5 : * Written by Mike Klein and Matt Sarett
6 : * Derived from arm/filter_neon_intrinsics.c
7 : *
8 : * Last changed in libpng 1.6.29 [March 16, 2017]
9 : *
10 : * This code is released under the libpng license.
11 : * For conditions of distribution and use, see the disclaimer
12 : * and license in png.h
13 : */
14 :
15 : #include "../pngpriv.h"
16 :
17 : #ifdef PNG_READ_SUPPORTED
18 :
19 : #if PNG_INTEL_SSE_IMPLEMENTATION > 0
20 :
#include <immintrin.h>
#include <string.h>
22 :
23 : /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
24 : * They're positioned like this:
25 : * prev: c b
26 : * row: a d
27 : * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
28 : * whichever of a, b, or c is closest to p=a+b-c.
29 : */
30 :
/* Load the 4 bytes at p into the low 32 bits of an SSE register; the
 * remaining lanes are zero (see _mm_cvtsi32_si128).
 *
 * The bytes are copied with memcpy instead of being read through a cast to
 * `const int*`.  The 3-byte-per-pixel filters call this with a pointer that
 * advances 3 bytes at a time, so p is generally not int-aligned, and reading
 * byte data through an int lvalue also breaks the strict-aliasing rule.
 * Optimizing compilers lower this memcpy to a single 32-bit load.
 */
static __m128i load4(const void* p) {
   int tmp;
   memcpy(&tmp, p, sizeof(tmp));
   return _mm_cvtsi32_si128(tmp);
}
34 :
/* Write the low 32 bits of v to the 4 bytes at p.
 *
 * The store goes through memcpy so that p needs no particular alignment and
 * no byte buffer is written through an int lvalue (strict aliasing).
 * Optimizing compilers lower this memcpy to a single 32-bit store.
 */
static void store4(void* p, __m128i v) {
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, sizeof(tmp));
}
38 :
/* Load exactly 3 bytes from p into the low 24 bits of an SSE register.
 * Byte 3 of the low lane and all upper lanes are zero.
 *
 * Only 3 bytes are touched, so this is safe on the last pixel of a row where
 * a 4-byte load would run past the end.  The bytes are copied with memcpy
 * into a zeroed int: p has no alignment guarantee, and reading it through a
 * 16-bit pointer would be both a potentially misaligned access and a
 * strict-aliasing violation.  x86 is little-endian, so the three copied bytes
 * land in the three low-order bytes of tmp.
 */
static __m128i load3(const void* p) {
   int tmp = 0;
   memcpy(&tmp, p, 3);
   return _mm_cvtsi32_si128(tmp);
}
50 :
/* Write the low 3 bytes of v to p; the byte at p+3 is left untouched.
 *
 * The value is pulled out of SSE as a 32-bit int and its three low-order
 * bytes (x86 is little-endian) are copied with memcpy.  This avoids writing
 * the byte buffer through a 16-bit pointer, which would need 2-byte
 * alignment that 3-byte pixels do not provide and would violate strict
 * aliasing.
 */
static void store3(void* p, __m128i v) {
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, 3);
}
63 :
64 0 : void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
65 : png_const_bytep prev)
66 : {
67 : /* The Sub filter predicts each pixel as the previous pixel, a.
68 : * There is no pixel to the left of the first pixel. It's encoded directly.
69 : * That works with our main loop if we just say that left pixel was zero.
70 : */
71 : png_debug(1, "in png_read_filter_row_sub3_sse2");
72 0 : __m128i a, d = _mm_setzero_si128();
73 :
74 0 : int rb = row_info->rowbytes;
75 0 : while (rb >= 4) {
76 0 : a = d; d = load4(row);
77 0 : d = _mm_add_epi8(d, a);
78 0 : store3(row, d);
79 :
80 0 : row += 3;
81 0 : rb -= 3;
82 : }
83 0 : if (rb > 0) {
84 0 : a = d; d = load3(row);
85 0 : d = _mm_add_epi8(d, a);
86 0 : store3(row, d);
87 :
88 0 : row += 3;
89 0 : rb -= 3;
90 : }
91 0 : }
92 :
93 261 : void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
94 : png_const_bytep prev)
95 : {
96 : /* The Sub filter predicts each pixel as the previous pixel, a.
97 : * There is no pixel to the left of the first pixel. It's encoded directly.
98 : * That works with our main loop if we just say that left pixel was zero.
99 : */
100 : png_debug(1, "in png_read_filter_row_sub4_sse2");
101 261 : __m128i a, d = _mm_setzero_si128();
102 :
103 261 : int rb = row_info->rowbytes;
104 8620 : while (rb > 0) {
105 8098 : a = d; d = load4(row);
106 8098 : d = _mm_add_epi8(d, a);
107 8098 : store4(row, d);
108 :
109 8098 : row += 4;
110 8098 : rb -= 4;
111 : }
112 261 : }
113 :
114 0 : void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
115 : png_const_bytep prev)
116 : {
117 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
118 : * There's no pixel to the left of the first pixel. Luckily, it's
119 : * predicted to be half of the pixel above it. So again, this works
120 : * perfectly with our loop if we make sure a starts at zero.
121 : */
122 : png_debug(1, "in png_read_filter_row_avg3_sse2");
123 0 : const __m128i zero = _mm_setzero_si128();
124 : __m128i b;
125 0 : __m128i a, d = zero;
126 :
127 0 : int rb = row_info->rowbytes;
128 0 : while (rb >= 4) {
129 0 : b = load4(prev);
130 0 : a = d; d = load4(row );
131 :
132 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
133 0 : __m128i avg = _mm_avg_epu8(a,b);
134 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
135 0 : avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
136 : _mm_set1_epi8(1)));
137 0 : d = _mm_add_epi8(d, avg);
138 0 : store3(row, d);
139 :
140 0 : prev += 3;
141 0 : row += 3;
142 0 : rb -= 3;
143 : }
144 0 : if (rb > 0) {
145 0 : b = load3(prev);
146 0 : a = d; d = load3(row );
147 :
148 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
149 0 : __m128i avg = _mm_avg_epu8(a,b);
150 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
151 0 : avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
152 : _mm_set1_epi8(1)));
153 :
154 0 : d = _mm_add_epi8(d, avg);
155 0 : store3(row, d);
156 :
157 0 : prev += 3;
158 0 : row += 3;
159 0 : rb -= 3;
160 : }
161 0 : }
162 :
163 57 : void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
164 : png_const_bytep prev)
165 : {
166 : /* The Avg filter predicts each pixel as the (truncated) average of a and b.
167 : * There's no pixel to the left of the first pixel. Luckily, it's
168 : * predicted to be half of the pixel above it. So again, this works
169 : * perfectly with our loop if we make sure a starts at zero.
170 : */
171 : png_debug(1, "in png_read_filter_row_avg4_sse2");
172 57 : const __m128i zero = _mm_setzero_si128();
173 : __m128i b;
174 57 : __m128i a, d = zero;
175 :
176 57 : int rb = row_info->rowbytes;
177 1917 : while (rb > 0) {
178 1803 : b = load4(prev);
179 1803 : a = d; d = load4(row );
180 :
181 : /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
182 1803 : __m128i avg = _mm_avg_epu8(a,b);
183 : /* ...but we can fix it up by subtracting off 1 if it rounded up. */
184 7212 : avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
185 : _mm_set1_epi8(1)));
186 :
187 1803 : d = _mm_add_epi8(d, avg);
188 1803 : store4(row, d);
189 :
190 1803 : prev += 4;
191 1803 : row += 4;
192 1803 : rb -= 4;
193 : }
194 57 : }
195 :
/* Absolute value of each signed 16-bit lane of x.
 * A lane holding -32768 has no positive counterpart and comes back unchanged.
 */
static __m128i abs_i16(__m128i x) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 2
   /* A dedicated instruction is available at this implementation level. */
   return _mm_abs_epi16(x);
#else
   /* Plain SSE2: |x| is the larger of x and -x.  The negation is computed
    * as 0 - x with ordinary wrapping 16-bit subtraction.
    */
   const __m128i negated = _mm_sub_epi16(_mm_setzero_si128(), x);
   return _mm_max_epi16(x, negated);
#endif
}
214 :
/* Per-byte select: yields t where the mask c is set and e where it is clear.
 * Callers in this file pass comparison results as c, so every byte of the
 * mask is either all ones or all zeros.
 */
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
#if PNG_INTEL_SSE_IMPLEMENTATION >= 3
   return _mm_blendv_epi8(e,t,c);
#else
   /* Start from e and flip exactly the bits in which t differs from e, but
    * only where the mask is set: e ^ ((t ^ e) & c).
    */
   const __m128i differing = _mm_xor_si128(t, e);
   return _mm_xor_si128(e, _mm_and_si128(differing, c));
#endif
}
223 :
224 0 : void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
225 : png_const_bytep prev)
226 : {
227 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
228 : * and two pixels from the previous row, b and c:
229 : * prev: c b
230 : * row: a d
231 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
232 : * p=a+b-c.
233 : *
234 : * The first pixel has no left context, and so uses an Up filter, p = b.
235 : * This works naturally with our main loop's p = a+b-c if we force a and c
236 : * to zero.
237 : * Here we zero b and d, which become c and a respectively at the start of
238 : * the loop.
239 : */
240 : png_debug(1, "in png_read_filter_row_paeth3_sse2");
241 0 : const __m128i zero = _mm_setzero_si128();
242 0 : __m128i c, b = zero,
243 0 : a, d = zero;
244 :
245 0 : int rb = row_info->rowbytes;
246 0 : while (rb >= 4) {
247 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
248 : * intermediates.
249 : */
250 0 : c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
251 0 : a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
252 :
253 : /* (p-a) == (a+b-c - a) == (b-c) */
254 0 : __m128i pa = _mm_sub_epi16(b,c);
255 :
256 : /* (p-b) == (a+b-c - b) == (a-c) */
257 0 : __m128i pb = _mm_sub_epi16(a,c);
258 :
259 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
260 0 : __m128i pc = _mm_add_epi16(pa,pb);
261 :
262 0 : pa = abs_i16(pa); /* |p-a| */
263 0 : pb = abs_i16(pb); /* |p-b| */
264 0 : pc = abs_i16(pc); /* |p-c| */
265 :
266 0 : __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
267 :
268 : /* Paeth breaks ties favoring a over b over c. */
269 0 : __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
270 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
271 : c));
272 :
273 : /* Note `_epi8`: we need addition to wrap modulo 255. */
274 0 : d = _mm_add_epi8(d, nearest);
275 0 : store3(row, _mm_packus_epi16(d,d));
276 :
277 0 : prev += 3;
278 0 : row += 3;
279 0 : rb -= 3;
280 : }
281 0 : if (rb > 0) {
282 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
283 : * intermediates.
284 : */
285 0 : c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
286 0 : a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
287 :
288 : /* (p-a) == (a+b-c - a) == (b-c) */
289 0 : __m128i pa = _mm_sub_epi16(b,c);
290 :
291 : /* (p-b) == (a+b-c - b) == (a-c) */
292 0 : __m128i pb = _mm_sub_epi16(a,c);
293 :
294 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
295 0 : __m128i pc = _mm_add_epi16(pa,pb);
296 :
297 0 : pa = abs_i16(pa); /* |p-a| */
298 0 : pb = abs_i16(pb); /* |p-b| */
299 0 : pc = abs_i16(pc); /* |p-c| */
300 :
301 0 : __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
302 :
303 : /* Paeth breaks ties favoring a over b over c. */
304 0 : __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
305 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
306 : c));
307 :
308 : /* Note `_epi8`: we need addition to wrap modulo 255. */
309 0 : d = _mm_add_epi8(d, nearest);
310 0 : store3(row, _mm_packus_epi16(d,d));
311 :
312 0 : prev += 3;
313 0 : row += 3;
314 0 : rb -= 3;
315 : }
316 0 : }
317 :
318 358 : void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
319 : png_const_bytep prev)
320 : {
321 : /* Paeth tries to predict pixel d using the pixel to the left of it, a,
322 : * and two pixels from the previous row, b and c:
323 : * prev: c b
324 : * row: a d
325 : * The Paeth function predicts d to be whichever of a, b, or c is nearest to
326 : * p=a+b-c.
327 : *
328 : * The first pixel has no left context, and so uses an Up filter, p = b.
329 : * This works naturally with our main loop's p = a+b-c if we force a and c
330 : * to zero.
331 : * Here we zero b and d, which become c and a respectively at the start of
332 : * the loop.
333 : */
334 : png_debug(1, "in png_read_filter_row_paeth4_sse2");
335 358 : const __m128i zero = _mm_setzero_si128();
336 358 : __m128i c, b = zero,
337 358 : a, d = zero;
338 :
339 358 : int rb = row_info->rowbytes;
340 12019 : while (rb > 0) {
341 : /* It's easiest to do this math (particularly, deal with pc) with 16-bit
342 : * intermediates.
343 : */
344 22606 : c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
345 22606 : a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
346 :
347 : /* (p-a) == (a+b-c - a) == (b-c) */
348 11303 : __m128i pa = _mm_sub_epi16(b,c);
349 :
350 : /* (p-b) == (a+b-c - b) == (a-c) */
351 11303 : __m128i pb = _mm_sub_epi16(a,c);
352 :
353 : /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
354 11303 : __m128i pc = _mm_add_epi16(pa,pb);
355 :
356 11303 : pa = abs_i16(pa); /* |p-a| */
357 11303 : pb = abs_i16(pb); /* |p-b| */
358 11303 : pc = abs_i16(pc); /* |p-c| */
359 :
360 22606 : __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
361 :
362 : /* Paeth breaks ties favoring a over b over c. */
363 22606 : __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
364 : if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
365 : c));
366 :
367 : /* Note `_epi8`: we need addition to wrap modulo 255. */
368 11303 : d = _mm_add_epi8(d, nearest);
369 11303 : store4(row, _mm_packus_epi16(d,d));
370 :
371 11303 : prev += 4;
372 11303 : row += 4;
373 11303 : rb -= 4;
374 : }
375 358 : }
376 :
377 : #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
378 : #endif /* READ */
|