Line data Source code
1 : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 : /* This Source Code Form is subject to the terms of the Mozilla Public
4 : * License, v. 2.0. If a copy of the MPL was not distributed with this
5 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 :
7 : #include "nscore.h"
8 : #include "nsAlgorithm.h"
9 : #include <emmintrin.h>
10 : #include <nsUTF8Utils.h>
11 :
12 : void
13 4329 : LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
14 : uint32_t aSourceLength)
15 : {
16 4329 : char* dest = mDestination;
17 :
18 : // Align source to a 16-byte boundary.
19 4329 : uint32_t i = 0;
20 : uint32_t alignLen =
21 : XPCOM_MIN<uint32_t>(aSourceLength,
22 4329 : uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
23 28855 : for (; i < alignLen; ++i) {
24 12263 : dest[i] = static_cast<unsigned char>(aSource[i]);
25 : }
26 :
27 : // Walk 64 bytes (four XMM registers) at a time.
28 4329 : __m128i vectmask = _mm_set1_epi16(0x00ff);
29 17311 : for (; aSourceLength - i > 31; i += 32) {
30 12982 : __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
31 6491 : source1 = _mm_and_si128(source1, vectmask);
32 :
33 12982 : __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
34 6491 : source2 = _mm_and_si128(source2, vectmask);
35 :
36 12982 : __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
37 6491 : source3 = _mm_and_si128(source3, vectmask);
38 :
39 12982 : __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
40 6491 : source4 = _mm_and_si128(source4, vectmask);
41 :
42 :
43 : // Pack the source data. SSE2 views this as a saturating uint16_t to
44 : // uint8_t conversion, but since we masked off the high-order byte of every
45 : // uint16_t, we're really just grabbing the low-order bytes of source1 and
46 : // source2.
47 6491 : __m128i packed1 = _mm_packus_epi16(source1, source2);
48 6491 : __m128i packed2 = _mm_packus_epi16(source3, source4);
49 :
50 : // This store needs to be unaligned since there's no guarantee that the
51 : // alignment we did above for the source will align the destination.
52 6491 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
53 6491 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
54 : }
55 :
56 : // Finish up the rest.
57 116519 : for (; i < aSourceLength; ++i) {
58 56095 : dest[i] = static_cast<unsigned char>(aSource[i]);
59 : }
60 :
61 4329 : mDestination += i;
62 4329 : }
63 :
64 : void
65 1274 : LossyConvertEncoding8to16::write_sse2(const char* aSource,
66 : uint32_t aSourceLength)
67 : {
68 1274 : char16_t* dest = mDestination;
69 :
70 : // Align source to a 16-byte boundary. We choose to align source rather than
71 : // dest because we'd rather have our loads than our stores be fast. You have
72 : // to wait for a load to complete, but you can keep on moving after issuing a
73 : // store.
74 1274 : uint32_t i = 0;
75 : uint32_t alignLen = XPCOM_MIN(aSourceLength,
76 1274 : uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
77 11552 : for (; i < alignLen; ++i) {
78 5139 : dest[i] = static_cast<unsigned char>(aSource[i]);
79 : }
80 :
81 : // Walk 32 bytes (two XMM registers) at a time.
82 1978 : for (; aSourceLength - i > 31; i += 32) {
83 704 : __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
84 704 : __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
85 :
86 : // Interleave 0s in with the bytes of source to create lo and hi.
87 704 : __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
88 704 : __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
89 704 : __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
90 704 : __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
91 :
92 : // store lo and hi into dest.
93 352 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
94 352 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
95 352 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
96 352 : _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
97 : }
98 :
99 : // Finish up whatever's left.
100 23392 : for (; i < aSourceLength; ++i) {
101 11059 : dest[i] = static_cast<unsigned char>(aSource[i]);
102 : }
103 :
104 1274 : mDestination += i;
105 1274 : }
|