LCOV - code coverage report
Current view: top level - xpcom/string - nsUTF8UtilsSSE2.cpp (source / functions) Hit Total Coverage
Test: output.info Lines: 45 45 100.0 %
Date: 2017-07-14 16:53:18 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
       2             : /* vim: set ts=8 sts=2 et sw=2 tw=80: */
       3             : /* This Source Code Form is subject to the terms of the Mozilla Public
       4             :  * License, v. 2.0. If a copy of the MPL was not distributed with this
       5             :  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
       6             : 
       7             : #include "nscore.h"
       8             : #include "nsAlgorithm.h"
       9             : #include <emmintrin.h>
      10             : #include <nsUTF8Utils.h>
      11             : // Stores the low byte of each of the aSourceLength char16_t units at aSource into mDestination, then advances mDestination.
      12             : void
      13        4329 : LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
      14             :                                       uint32_t aSourceLength)
      15             : {
      16        4329 :   char* dest = mDestination;
      17             :   // Copy leading units one at a time until aSource + i is 16-byte aligned (or the input ends).
      18             :   // NOTE(review): assumes aSource is 2-byte aligned; an odd address never reaches a 16-byte boundary -- confirm.
      19        4329 :   uint32_t i = 0;
      20             :   uint32_t alignLen =
      21             :     XPCOM_MIN<uint32_t>(aSourceLength,
      22        4329 :                         uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
      23       28855 :   for (; i < alignLen; ++i) {
      24       12263 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      25             :   }
      26             :   // i <= aSourceLength here, so the unsigned subtraction below cannot wrap.
      27             :   // Walk 64 source bytes (32 char16_t units, four XMM registers) at a time.
      28        4329 :   __m128i vectmask = _mm_set1_epi16(0x00ff);
      29       17311 :   for (; aSourceLength - i > 31; i += 32) {
      30       12982 :     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
      31        6491 :     source1 = _mm_and_si128(source1, vectmask);
      32             : 
      33       12982 :     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
      34        6491 :     source2 = _mm_and_si128(source2, vectmask);
      35             : 
      36       12982 :     __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
      37        6491 :     source3 = _mm_and_si128(source3, vectmask);
      38             : 
      39       12982 :     __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
      40        6491 :     source4 = _mm_and_si128(source4, vectmask);
      41             : 
      42             : 
      43             :     // Pack the source data.  SSE2 views this as a saturating uint16_t to
      44             :     // uint8_t conversion, but since we masked off the high-order byte of every
      45             :     // uint16_t, we're really just grabbing the low-order bytes of source1
      46             :     // through source4 (two source registers per packed result).
      47        6491 :     __m128i packed1 = _mm_packus_epi16(source1, source2);
      48        6491 :     __m128i packed2 = _mm_packus_epi16(source3, source4);
      49             : 
      50             :     // These stores need to be unaligned since there's no guarantee that the
      51             :     // alignment we did above for the source will align the destination.
      52        6491 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      packed1);
      53        6491 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
      54             :   }
      55             : 
      56             :   // Finish up the rest (fewer than 32 units remain) one unit at a time.
      57      116519 :   for (; i < aSourceLength; ++i) {
      58       56095 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      59             :   }
      60             : 
      61        4329 :   mDestination += i;  // i == aSourceLength here
      62        4329 : }
      63             : // Zero-extends each of the aSourceLength bytes at aSource to a char16_t in mDestination, then advances mDestination.
      64             : void
      65        1274 : LossyConvertEncoding8to16::write_sse2(const char* aSource,
      66             :                                       uint32_t aSourceLength)
      67             : {
      68        1274 :   char16_t* dest = mDestination;
      69             : 
      70             :   // Align source to a 16-byte boundary.  We choose to align source rather than
      71             :   // dest because we'd rather have our loads than our stores be fast. You have
      72             :   // to wait for a load to complete, but you can keep on moving after issuing a
      73             :   // store.
      74        1274 :   uint32_t i = 0;
      75             :   uint32_t alignLen = XPCOM_MIN(aSourceLength,
      76        1274 :                                 uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
      77       11552 :   for (; i < alignLen; ++i) {
      78        5139 :     dest[i] = static_cast<unsigned char>(aSource[i]);
      79             :   }
      80             : 
      81             :   // Walk 32 source bytes (two XMM loads, four XMM stores) at a time.
      82        1978 :   for (; aSourceLength - i > 31; i += 32) {
      83         704 :     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
      84         704 :     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
      85             : 
      86             :     // Interleave 0s in with the bytes of each source register: lo gets its first 8 bytes, hi its last 8, as 16-bit units.
      87         704 :     __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
      88         704 :     __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
      89         704 :     __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
      90         704 :     __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
      91             : 
      92             :     // Store lo and hi into dest; the stores are unaligned because only the source was aligned above.
      93         352 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      lo1);
      94         352 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8),  hi1);
      95         352 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
      96         352 :     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
      97             :   }
      98             : 
      99             :   // Finish up whatever's left (fewer than 32 bytes) one byte at a time.
     100       23392 :   for (; i < aSourceLength; ++i) {
     101       11059 :     dest[i] = static_cast<unsigned char>(aSource[i]);
     102             :   }
     103             : 
     104        1274 :   mDestination += i;  // i == aSourceLength here
     105        1274 : }

Generated by: LCOV version 1.13