Line data Source code
1 : /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "nsCharSetProber.h"
7 :
8 : //This filter applies to all scripts which do not use English characters
9 0 : bool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
10 : {
11 : char *newptr;
12 : char *prevPtr, *curPtr;
13 :
14 0 : bool meetMSB = false;
15 0 : newptr = *newBuf = (char*)malloc(aLen);
16 0 : if (!newptr)
17 0 : return false;
18 :
19 0 : for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
20 : {
21 0 : if (*curPtr & 0x80)
22 : {
23 0 : meetMSB = true;
24 : }
25 0 : else if (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z')
26 : {
27 : //current char is a symbol, most likely a punctuation. we treat it as segment delimiter
28 0 : if (meetMSB && curPtr > prevPtr)
29 : //this segment contains more than single symbol, and it has upper ASCII, we need to keep it
30 : {
31 0 : while (prevPtr < curPtr) *newptr++ = *prevPtr++;
32 0 : prevPtr++;
33 0 : *newptr++ = ' ';
34 0 : meetMSB = false;
35 : }
36 : else //ignore current segment. (either because it is just a symbol or just an English word)
37 0 : prevPtr = curPtr+1;
38 : }
39 : }
40 0 : if (meetMSB && curPtr > prevPtr)
41 0 : while (prevPtr < curPtr) *newptr++ = *prevPtr++;
42 :
43 0 : newLen = newptr - *newBuf;
44 :
45 0 : return true;
46 : }
47 :
48 : //This filter applies to all scripts which contain both English characters and upper ASCII characters.
49 0 : bool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, uint32_t aLen, char** newBuf, uint32_t& newLen)
50 : {
51 : //do filtering to reduce load to probers
52 : char *newptr;
53 : char *prevPtr, *curPtr;
54 0 : bool isInTag = false;
55 :
56 0 : newptr = *newBuf = (char*)malloc(aLen);
57 0 : if (!newptr)
58 0 : return false;
59 :
60 0 : for (curPtr = prevPtr = (char*)aBuf; curPtr < aBuf+aLen; curPtr++)
61 : {
62 0 : if (*curPtr == '>')
63 0 : isInTag = false;
64 0 : else if (*curPtr == '<')
65 0 : isInTag = true;
66 :
67 0 : if (!(*curPtr & 0x80) &&
68 0 : (*curPtr < 'A' || (*curPtr > 'Z' && *curPtr < 'a') || *curPtr > 'z') )
69 : {
70 0 : if (curPtr > prevPtr && !isInTag) // Current segment contains more than just a symbol
71 : // and it is not inside a tag, keep it.
72 : {
73 0 : while (prevPtr < curPtr) *newptr++ = *prevPtr++;
74 0 : prevPtr++;
75 0 : *newptr++ = ' ';
76 : }
77 : else
78 0 : prevPtr = curPtr+1;
79 : }
80 : }
81 :
82 : // If the current segment contains more than just a symbol
83 : // and it is not inside a tag then keep it.
84 0 : if (!isInTag)
85 0 : while (prevPtr < curPtr)
86 0 : *newptr++ = *prevPtr++;
87 :
88 0 : newLen = newptr - *newBuf;
89 :
90 0 : return true;
91 : }
|