Beispiel #1
0
// Compares the Latin-1 characters starting at |a| with the UTF-8 bytes in
// [b, bEnd). Returns true when every UTF-8 code point in that range equals the
// corresponding Latin-1 character.
//
// Only one- and two-byte UTF-8 sequences can denote a Latin-1 character, so
// anything else makes the comparison fail.
//
// NOTE(review): |a| has no end pointer here, so it is not bounds-checked;
// callers presumably guarantee that it holds enough characters - confirm.
bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        if (isASCII(*a) || isASCII(*b)) {
            if (*a++ != *b++)
                return false;
            continue;
        }

        // A non-ASCII Latin-1 character needs exactly two UTF-8 bytes.
        if (b + 1 == bEnd)
            return false;

        if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
            return false;

        // Decode at full width. A two-byte sequence can encode up to U+07FF;
        // storing it straight into an LChar would truncate e.g. U+0180 to 0x80
        // and report a false match, so reject everything above U+00FF.
        unsigned character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
        if (character > 0xFF)
            return false;

        b += 2;

        if (*a++ != character)
            return false;
    }

    return true;
}
Beispiel #2
0
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
{
    if (!data)
        return 0;

    StringHasher stringHasher;
    dataLength = 0;
    utf16Length = 0;

    while (data < dataEnd || (!dataEnd && *data)) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            dataLength++;
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
        dataLength += utf8SequenceLength;

        if (!dataEnd) {
            for (int i = 1; i < utf8SequenceLength; ++i) {
                if (!data[i])
                    return 0;
            }
        } else if (dataEnd - data < utf8SequenceLength) {
            return 0;
        }

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else {
            return 0;
        }
    }

    return stringHasher.hashWithTop8BitsMasked();
}
Beispiel #3
0
/** Uppercase an ASCII character; any other character is returned unchanged. */
uint32 UString::toUpper(uint32 c) {
	// Only ASCII goes through std::toupper; we don't know how to uppercase the rest.
	if (isASCII(c))
		return std::toupper(c);

	return c;
}
void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain)
{
    // This is like adding (.|^)domain$ by adding two Vector<Term>'s,
    // but interpreting domain as a series of characters, not a regular expression.
    // This way a domain of "webkit.org" will match "bugs.webkit.org" and "webkit.org".
    // FIXME: Add support for matching only subdomains or no subdomains.
    Vector<Term> prependDot;
    Vector<Term> prependBeginningOfLine;
    prependDot.reserveInitialCapacity(domain.length() + 3);
    prependBeginningOfLine.reserveInitialCapacity(domain.length() + 1); // This is just no .* at the beginning.
    
    Term canonicalDotStar(Term::UniversalTransition);
    canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore);
    prependDot.uncheckedAppend(canonicalDotStar);
    prependDot.uncheckedAppend(Term('.', true));
    
    for (unsigned i = 0; i < domain.length(); i++) {
        ASSERT(isASCII(domain[i]));
        ASSERT(!isASCIIUpper(domain[i]));
        prependDot.uncheckedAppend(Term(domain[i], true));
        prependBeginningOfLine.uncheckedAppend(Term(domain[i], true));
    }
    prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm);
    prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm);
    
    addPattern(actionId, prependDot);
    addPattern(actionId, prependBeginningOfLine);
}
// Name-start code point test, see
// http://dev.w3.org/csswg/css-syntax/#name-start-code-point
// True for an ASCII letter, an underscore, or any non-ASCII code unit.
static bool isNameStart(UChar c)
{
    return isASCIIAlpha(c) || c == '_' || !isASCII(c);
}
static bool containsOnlyASCIIWithNoUppercase(const String& domain)
{
    for (unsigned i = 0; i < domain.length(); ++i) {
        UChar c = domain.at(i);
        if (!isASCII(c) || isASCIIUpper(c))
            return false;
    }
    return true;
}
Beispiel #7
0
/* Returns toCTRL(source), i.e. the character denoted by "\c" followed by
 * 'source', emitting diagnostics along the way:
 *   - under utf8, a non-ASCII 'source' croaks;
 *   - otherwise a non-ASCII 'source' raises a default-on deprecation warning;
 *   - when 'output_warning' is set and the result is not a control character,
 *     a warning suggests the clearer way to write it. */
STATIC char
S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning)
{
    U8 result;

    /* Trying to deprecate non-ASCII usages.  This construct has never
     * worked for a utf8 variant.  So, even though non-ASCII Latin1 is
     * accepted in 5.14, there is no need to make it work under utf8 */
    if (utf8 && ! isASCII(source)) {
	Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII");
    }

    result = toCTRL(source);

    if (! isASCII(source)) {
	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX),
			"Character following \"\\c\" must be ASCII");
	return result;
    }

    /* Nothing more to say for a genuine control, or when told to be quiet */
    if (isCNTRL(result) || ! output_warning) {
	return result;
    }

    if (source == '{') {
	Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX),
			"\"\\c{\" is deprecated and is more clearly written as \";\"");
	return result;
    }

    {
	/* Build the suggested spelling: the result itself, backslash-escaped
	 * unless it is alphanumeric.  Worst case is '\\', result, NUL. */
	U8 suggestion[3];
	U8 length = 0;

	if (! isALNUM(result)) {
	    suggestion[length++] = '\\';
	}
	suggestion[length++] = result;
	suggestion[length++] = '\0';

	Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
			"\"\\c%c\" is more clearly written simply as \"%s\"",
			source,
			suggestion);
    }

    return result;
}
Beispiel #8
0
/*
 * Copies s1 into s2, expanding shorthand ranges such as "a-d" into "abcd".
 *
 * A range is expanded only when both end characters satisfy isASCII() and the
 * first one is smaller than the second; the characters in between are produced
 * with nextASCII(). Everything else - including a leading or trailing '-' and
 * descending "ranges" like "z-a" - is copied literally. Chained ranges such as
 * "a-c-e" expand to "abcde" because the end of one range is re-examined as the
 * possible start of the next one.
 *
 * s2 must be large enough for the expanded text plus the terminating '\0';
 * no bounds checking is (or can be) done here.
 */
void expand(char s1[], char s2[])
{
    int i = 0, j = 0;
    char k;

    while (s1[i] != '\0')
    {
        /* s1[i+1] is only read when s1[i] is not the terminator, and s1[i+2]
           only when s1[i+1] is '-', so no read goes past the end of s1. */
        if (isASCII(s1[i]) && s1[i+1] == '-' && isASCII(s1[i+2]) && s1[i] < s1[i+2]) {
            /* emit start .. end-1 */
            for (k = s1[i]; k < s1[i+2]; k = nextASCII(k)) {
                s2[j++] = k;
            }

            /* Continue at the end character without copying it yet, so that
               it can itself start another range ("a-c-e"). */
            i += 2;
            continue;
        }

        /* Literal character. This also covers the last character of the
           string and a trailing '-', which must not be dropped. */
        s2[j++] = s1[i];
        i++;
    }

    s2[j] = '\0';
}
Beispiel #9
0
// Hashes the UTF-16 form of the UTF-8 bytes in [data, dataEnd) without
// materializing the UTF-16 string.
//
// On success |utf16Length| receives the number of UTF-16 code units the input
// corresponds to and the hash is returned. Returns 0 when |data| is null
// (|utf16Length| is left untouched in that case) or when the input is
// truncated, not legal UTF-8, or encodes a surrogate or an out-of-range code
// point.
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length)
{
    if (!data)
        return 0;

    WTF::StringHasher stringHasher;
    utf16Length = 0;

    while (data < dataEnd) {
        if (isASCII(*data)) {
            stringHasher.addCharacter(*data++);
            utf16Length++;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);

        // Truncated sequence. The function returns a hash, not a bool, so
        // report failure as 0 like every other failure path.
        if (dataEnd - data < utf8SequenceLength)
            return 0;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
            return 0;

        UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return 0;
            stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
            utf16Length++;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // Supplementary code points hash as their surrogate pair.
            stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
                                       static_cast<UChar>(U16_TRAIL(character)));
            utf16Length += 2;
        } else
            return 0;
    }

    return stringHasher.hash();
}
Beispiel #10
0
/*
 * Returns 1 when the text after the first '.' in filename is exactly the
 * single character c, 0 otherwise. Also returns 0 when there is no '.' or
 * when isASCII() rejects any position before it.
 */
static int checkEnding(char c, char *filename) {
	char *cursor = filename;

	/* advance to the first '.', validating every position on the way */
	for (; *cursor != '.'; cursor++) {
		if (*cursor == '\0' || isASCII(cursor) == 0)
			return 0;
	}

	/* cursor[0] is the '.', cursor[1] must be c and end the string */
	if (cursor[1] != c)
		return 0;

	return cursor[2] == '\0' ? 1 : 0;
}
// Builds a regular-expression source that matches |text| literally by putting
// a backslash in front of every character listed in regexSpecialCharacters.
static String createSearchRegexSource(const String& text)
{
    StringBuilder result;

    for (unsigned i = 0; i < text.length(); i++) {
        UChar character = text[i];
        // strchr() treats the terminating NUL as part of the searched string,
        // so a NUL in |text| would always be "found" and wrongly escaped;
        // exclude it explicitly. The isASCII() check keeps non-ASCII code
        // units from being narrowed into a false match.
        if (character && isASCII(character) && strchr(regexSpecialCharacters, character))
            result.append('\\');
        result.append(character);
    }

    return result.toString();
}
Beispiel #12
0
// Decodes one multi-byte UTF-8 sequence of |length| (2, 3 or 4) bytes and
// returns its code point, or nonCharacter when the bytes are not a valid
// sequence. The constants subtracted at the end remove the UTF-8 tag bits in
// one step, e.g. 0x3080 == (0xC0 << 6) + 0x80 for the two-byte form.
static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
{
    ASSERT(!isASCII(sequence[0]));

    const uint8_t lead = sequence[0];

    // Allowed range of the second byte; narrowed below for the lead bytes that
    // would otherwise permit overlong, surrogate or too-large encodings.
    uint8_t secondMin = 0x80;
    uint8_t secondMax = 0xBF;

    if (length == 2) {
        ASSERT(lead <= 0xDF);
        if (lead < 0xC2)
            return nonCharacter;
        if (sequence[1] < secondMin || sequence[1] > secondMax)
            return nonCharacter;
        return ((lead << 6) + sequence[1]) - 0x00003080;
    }

    if (length == 3) {
        ASSERT(lead >= 0xE0 && lead <= 0xEF);
        if (lead == 0xE0)
            secondMin = 0xA0;
        else if (lead == 0xED)
            secondMax = 0x9F;

        if (sequence[1] < secondMin || sequence[1] > secondMax)
            return nonCharacter;
        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
            return nonCharacter;
        return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    }

    ASSERT(length == 4);
    ASSERT(lead >= 0xF0 && lead <= 0xF4);
    if (lead == 0xF0)
        secondMin = 0x90;
    else if (lead == 0xF4)
        secondMax = 0x8F;

    if (sequence[1] < secondMin || sequence[1] > secondMax)
        return nonCharacter;
    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
        return nonCharacter;
    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
        return nonCharacter;
    return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
Beispiel #13
0
// Compares the code units in [a, aEnd) with the UTF-8 bytes in [b, bEnd).
// Returns true only when the UTF-8 input is well formed and decodes to exactly
// the code units of |a| (supplementary code points compare as a surrogate
// pair). |a| is never read at or beyond |aEnd|.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        // |b| still has input but |a| is exhausted: the strings differ, and
        // dereferencing |a| would read past its end.
        if (a == aEnd)
            return false;

        if (isASCII(*b)) {
            if (*a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (*a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A supplementary code point needs two code units from |a|.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
Beispiel #14
0
// 16-bit variant: works through the bytes held in m_partialSequence, pulling
// any bytes still missing from [source, end) and writing decoded characters to
// |destination|. |source| and |destination| are advanced in place.
//
// Errors are passed to handleError(); when |stopOnError| is set the function
// returns right after the first one. Every return path of this specialization
// returns false.
bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(m_partialSequenceSize);
    do {
        // A buffered ASCII byte is emitted as-is.
        // NOTE(review): consumePartialSequenceByte() presumably drops that byte
        // from the buffer - confirm against its definition.
        if (isASCII(m_partialSequence[0])) {
            *destination++ = m_partialSequence[0];
            consumePartialSequenceByte();
            continue;
        }
        int count = nonASCIISequenceLength(m_partialSequence[0]);
        // A length of 0 for the lead byte is treated as an error.
        if (!count) {
            handleError(destination, stopOnError, sawError);
            if (stopOnError)
                return false;
            continue;
        }
        // The buffer holds fewer bytes than the sequence needs.
        if (count > m_partialSequenceSize) {
            if (count - m_partialSequenceSize > end - source) {
                if (!flush) {
                    // The new data is not enough to complete the sequence, so
                    // add it to the existing partial sequence.
                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                    m_partialSequenceSize += end - source;
                    return false;
                }
                // An incomplete partial sequence at the end is an error.
                handleError(destination, stopOnError, sawError);
                if (stopOnError)
                    return false;
                continue;
            }
            // Enough input is available: top the buffer up to a full sequence.
            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
            source += count - m_partialSequenceSize;
            m_partialSequenceSize = count;
        }
        int character = decodeNonASCIISequence(m_partialSequence, count);
        if (character == nonCharacter) {
            handleError(destination, stopOnError, sawError);
            if (stopOnError)
                return false;
            continue;
        }

        // The whole sequence decoded; it no longer counts as buffered.
        m_partialSequenceSize -= count;
        destination = appendCharacter(destination, character);
    } while (m_partialSequenceSize);

    return false;
}
Beispiel #15
0
/*
 * Prints the text runs found in the file named by argv[1].
 *
 * A run is a sequence of consecutive characters accepted by isASCII(). A run
 * is printed when it is at least MIN_RUN characters long, or - whatever its
 * length - when it is ended by NEWLINE, in which case the newline is printed
 * too. Shorter runs ended by any other character are discarded.
 *
 * Only the first MIN_RUN characters of a run are buffered; once a run is known
 * to be long enough, the rest is printed as it is read, so runs of any length
 * are handled without overflowing the buffer.
 *
 * Returns 0 on success, 1 when no file name was given or the file cannot be
 * opened.
 */
int main(int argc, char *argv[]){

	enum { MIN_RUN = 4 };

	FILE *fp;
	char buffer[MIN_RUN], curChar;
	int c, i, run = 0;

	if(argc < 2){
		fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "program");
		return 1;
	}

	fp = fopen(argv[1], "r");
	if(fp == NULL){
		perror(argv[1]);
		return 1;
	}

	printf("\nFilename: %s\n", argv[1]);
	printf("-----------------------\n");

	/* fgetc() returns an int so that EOF stays distinct from every byte value */
	while((c = fgetc(fp)) != EOF){

		curChar = (char)c;

		if(curChar == NEWLINE){
			/* a run that is still buffered (too short so far) is printed as well */
			if(run < MIN_RUN){
				for(i = 0; i < run; i++){
					printf("%c", buffer[i]);
				}
			}
			printf("%c", curChar);
			run = 0;
		}else if(isASCII(curChar)){
			if(run < MIN_RUN){
				buffer[run] = curChar;
				run++;
				/* the run just became long enough: flush what was held back */
				if(run == MIN_RUN){
					for(i = 0; i < MIN_RUN; i++){
						printf("%c", buffer[i]);
					}
				}
			}else{
				/* already printing this run; run stays at MIN_RUN */
				printf("%c", curChar);
			}
		}else{
			/* the run is over: either already printed or too short to keep */
			run = 0;
		}
	}

	fclose(fp);

return 0;	
}
Beispiel #16
0
// Helper for readfile(): prints linebuf together with the file name when
// mygrep() reports that it contains searchstr.
static void readfile_report_match(const char *pathname, char *searchstr, char *linebuf)
{
	if ( mygrep(searchstr, strlen(searchstr), linebuf, strlen(linebuf)) == 1 ) {
		my_printf("Line: %s\nFile: %s\n", linebuf, pathname);
	}
}

// Reads a regular file one byte at a time and prints every line that matches
// searchstr.
//
//  - A line ends at '\n'. Lines longer than LINE_MAX-1 bytes are searched in
//    chunks of LINE_MAX-1 bytes; no byte is dropped at a chunk boundary.
//  - A last line without a trailing newline is searched as well.
//  - Reading stops at the first byte rejected by isASCII(); the rest of the
//    file, including the line in progress, is skipped.
//
// statptr and type are unused. Always returns 0, also when the file cannot be
// opened (the error is reported through my_errprintf).
static int readfile(const char *pathname, char *searchstr, const struct stat *statptr, int type)
{
	// file descriptor
	int fd;
	char charbuf[1];
	char linebuf[LINE_MAX];
	int lbpos = 0;
	int errnum;
	int nonascii = 0;	// set when a non-ascii byte ended the scan

	// error handling: if open fails there is nothing to read or close
	if ( (fd = open(pathname, O_RDONLY)) < 0 ) {
		errnum = errno;
		my_errprintf("Error opening file: %s\n", strerror(errnum) );
		return 0;
	}

	while ( read(fd, charbuf, 1) > 0 ) {
		if ( isASCII(charbuf) == 0 ) { // contains non-ascii char, skip file
			nonascii = 1;
			break;
		}

		// New line character: terminate the line, search it, start over
		if ( charbuf[0] == '\n' ) {
			linebuf[lbpos] = '\0';
			readfile_report_match(pathname, searchstr, linebuf);
			lbpos = 0;
			continue;
		}

		// Buffer full: search what has been collected so far, then keep
		// the current byte as the first byte of the next chunk
		if ( lbpos >= LINE_MAX-1 ) {
			linebuf[lbpos] = '\0';
			readfile_report_match(pathname, searchstr, linebuf);
			lbpos = 0;
		}

		// store byte in linebuf
		linebuf[lbpos] = charbuf[0];
		lbpos++ ;
	}

	// Last line without a terminating newline
	if ( !nonascii && lbpos > 0 ) {
		linebuf[lbpos] = '\0';
		readfile_report_match(pathname, searchstr, linebuf);
	}

	close(fd);
	return 0;
}
/*
 * Extracts a length-prefixed string: the first byte of data is the length,
 * followed by that many characters.
 *
 * Returns a newly malloc()ed, NUL-terminated copy that the caller must
 * free(), or NULL when the length is not positive, when any character is
 * rejected by isASCII(), or when memory cannot be allocated.
 *
 * NOTE(review): maxData is not used, so the length byte is trusted and reads
 * are not bounded. Whether maxData is the last valid byte or one past it
 * cannot be told from here; confirm with the callers before adding a bounds
 * check.
 */
static char *findString(char *data, char *maxData)
{
    int length = *(data++);
    int i;
    char *name;

    if (length <= 0)
        return NULL;

    for (i = 0; i < length; i++) {
        if (!isASCII(data[i]))
            return NULL;
    }

    name = (char *)malloc(length + 1);
    if (name == NULL)   /* out of memory: do not memcpy() into NULL */
        return NULL;

    memcpy(name, data, length);
    name[length] = '\0';
    return name;
}
Beispiel #18
0
/*
 * Walks backwards from p (exclusive) towards base and counts the bytes that
 * are neither accepted by isASCII() nor by SJIS_isHankakuKatakana(); the walk
 * stops at the first byte that is, or at base. Returns true when that count
 * is odd.
 */
Znk_INLINE bool
SJIS_isSecondByte( const char* base, const char* p )
{
	int run_length = 0;

	/* the increment is skipped when the loop is left through break */
	for( ; p > base; ++run_length ){
		--p;
		/* second byte, ascii or hankaku katakana: the run ends here */
		if( isASCII(*p) || SJIS_isHankakuKatakana(*p) ){
			break;
		}
		/* otherwise: a second or a first byte, keep counting */
	}
	return (bool)(run_length & 1);
}
Beispiel #19
0
// 8-bit variant: works through the bytes held in m_partialSequence, pulling
// any bytes still missing from [source, end) and writing decoded characters to
// the 8-bit |destination|. |source| and |destination| are advanced in place.
//
// Returns true when the buffered data cannot be represented in 8 bits or is
// erroneous (bad lead byte, incomplete sequence at flush, invalid sequence, or
// a character above 0xff); nothing is written for that sequence, so that the
// 16-bit path can handle it. Returns false otherwise, including when the
// available input was only appended to the partial sequence.
// The last two parameters (unnamed) are not used by this variant.
bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
{
    ASSERT(m_partialSequenceSize);
    do {
        // A buffered ASCII byte is emitted as-is.
        // NOTE(review): consumePartialSequenceByte() presumably drops that byte
        // from the buffer - confirm against its definition.
        if (isASCII(m_partialSequence[0])) {
            *destination++ = m_partialSequence[0];
            consumePartialSequenceByte();
            continue;
        }
        int count = nonASCIISequenceLength(m_partialSequence[0]);
        // A length of 0 for the lead byte is an error; leave it to the 16 bit path.
        if (!count)
            return true;

        // The buffer holds fewer bytes than the sequence needs.
        if (count > m_partialSequenceSize) {
            if (count - m_partialSequenceSize > end - source) {
                if (!flush) {
                    // The new data is not enough to complete the sequence, so
                    // add it to the existing partial sequence.
                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                    m_partialSequenceSize += end - source;
                    return false;
                }
                // An incomplete partial sequence at the end is an error, but it will create
                // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
                // the error.
                return true;
            }
            // Enough input is available: top the buffer up to a full sequence.
            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
            source += count - m_partialSequenceSize;
            m_partialSequenceSize = count;
        }
        int character = decodeNonASCIISequence(m_partialSequence, count);
        // Invalid, or valid but too wide for an 8 bit string.
        if ((character == nonCharacter) || (character > 0xff))
            return true;

        // The whole sequence decoded; it no longer counts as buffered.
        m_partialSequenceSize -= count;
        *destination++ = character;
    } while (m_partialSequenceSize);

    return false;
}
Beispiel #20
0
// Decodes |length| bytes into a 16-bit string of the same length.
// ASCII bytes are copied unchanged - a machine word at a time where alignment
// allows - and every other byte is translated through |table|.
// The three trailing parameters (unnamed) are not used.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    UChar* characters;
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    const uint8_t* alignedEnd = alignToMachineWord(end);
    UChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // The word loop may have advanced source, so *source is no
                // longer the byte that was tested above and need not be ASCII.
                // Copying it verbatim would bypass the table.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;
}
Beispiel #21
0
/*
 * Prints one line to stdout: a dump of the input followed by a comma-separated
 * list of every encoding the data is compatible with - byte order marks first
 * (UTF-8, UTF-32 LE/BE, UTF-16 LE/BE), then ASCII and UTF-8 content checks -
 * or "<unknown>" when nothing matched.
 */
void guess_encoding(const char *data, size_t size)
{
/* Appends one name to the list, preceded by ", " for all but the first. */
#define FOUND(name) \
    do { \
        if (found) \
            fputs(", ", stdout); \
        fputs(name, stdout); \
        found = 1; \
    } while (0)

    int found = 0;

    dump_byte_string(stdout, "guess_encoding(\"", data, size, "\"): ");

    /* Byte order marks; each one is only compared when enough bytes exist. */
    if (size >= 3 && memcmp(data, UTF_8_BOM, 3) == 0)
        FOUND("UTF-8 (BOM)");

    if (size >= 4 && memcmp(data, UTF_32_LE_BOM, 4) == 0)
        FOUND("UTF-32-LE (BOM)");
    if (size >= 4 && memcmp(data, UTF_32_BE_BOM, 4) == 0)
        FOUND("UTF-32-BE (BOM)");

    if (size >= 2 && memcmp(data, UTF_16_LE_BOM, 2) == 0)
        FOUND("UTF-16-LE (BOM)");
    if (size >= 2 && memcmp(data, UTF_16_BE_BOM, 2) == 0)
        FOUND("UTF-16-BE (BOM)");

    /* Content-based checks. */
    if (isASCII(data, size))
        FOUND("ASCII");

    if (isUTF8(data, size))
        FOUND("UTF-8");

    if (!found)
        fputs("<unknown>", stdout);
    fputc('\n', stdout);

}
CSSParserToken CSSTokenizer::nextToken()
{
    // Unlike the HTMLTokenizer, the CSS Syntax spec is written
    // as a stateless, (fixed-size) look-ahead tokenizer.
    // We could move to the stateful model and instead create
    // states for all the "next 3 codepoints are X" cases.
    // State-machine tokenizers are easier to write to handle
    // incremental tokenization of partial sources.
    // However, for now we follow the spec exactly.
    UChar cc = consume();

    // Non-ASCII code units always go to nameStart; ASCII ones are dispatched
    // through the codePoints table, whose entries may be null.
    CodePoint handler = &CSSTokenizer::nameStart;
    if (isASCII(cc)) {
        ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
        handler = codePoints[cc];
    }

    // No handler registered: the code unit is a plain delimiter.
    if (!handler)
        return CSSParserToken(DelimiterToken, cc);

    return (this->*handler)(cc);
}
// Decodes |length| bytes into a string of the same length.
//
// Decoding starts into an 8-bit string: ASCII bytes are copied unchanged - a
// machine word at a time where alignment allows - and other bytes are
// translated through |table|. As soon as a table entry does not fit in 8 bits
// (> 0xff), the characters produced so far are widened into a fresh 16-bit
// string and decoding continues there.
// The three trailing parameters (unnamed) are not used.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    LChar* characters;
    if (!length)
        return emptyString();
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // The word loop may have advanced source, so *source is no
                // longer the byte that was tested above and need not be ASCII.
                // Copying it verbatim would bypass the table.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            if (table[*source] > 0xff)
                goto upConvertTo16Bit;

            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;
    
upConvertTo16Bit:
    UChar* characters16;
    String result16 = String::createUninitialized(length, characters16);

    UChar* destination16 = characters16;

    // Zero extend and copy already processed 8 bit data
    LChar* ptr8 = characters;
    LChar* endPtr8 = destination;

    while (ptr8 < endPtr8)
        *destination16++ = *ptr8++;

    // Handle the character that triggered the 16 bit path
    *destination16 = table[*source];
    ++source;
    ++destination16;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                    
                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable16;
                    
                    copyASCIIMachineWord(destination16, source);
                    source += sizeof(MachineWord);
                    destination16 += sizeof(MachineWord);
                }
                
                if (source == end)
                    break;

                // Same as in the 8 bit loop: after the word loop *source may
                // no longer be ASCII and must then go through the table.
                if (!isASCII(*source))
                    goto useLookupTable16;
            }
            *destination16 = *source;
        } else {
useLookupTable16:
            *destination16 = table[*source];
        }
        
        ++source;
        ++destination16;
    }
    
    return result16;
}
Beispiel #24
0
// Decodes |length| bytes of UTF-8, continuing any sequence left over in
// m_partialSequence from an earlier call.
//
// Decoding starts into an 8-bit buffer. It switches to the 16-bit path
// (upConvertTo16Bit) when the 8-bit handlePartialSequence() returns true, when
// a decoded character is above 0xff, or on a decoding error while
// |stopOnError| is false. The 16-bit path first widens what was already
// decoded and then resumes at |source|.
//
// A sequence cut off by the end of the input is saved in m_partialSequence.
// Errors set |sawError|; with |stopOnError| set decoding stops at the first
// one, otherwise the 16-bit path emits replacementCharacter for it.
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            // A true result means the partial sequence needs the 16 bit path.
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // source may have moved in the word loop; re-dispatch if it
                    // no longer points at an ASCII byte.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The input ends inside this sequence: save the bytes seen
                    // so far in m_partialSequence.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // source is left on the offending byte; the 16 bit path
                // decodes it again and emits the replacement character.
                goto upConvertTo16Bit;
            }
            // Valid, but too wide for the 8 bit buffer.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

upConvertTo16Bit:
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            // The return value is not used on the 16 bit path.
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // source may have moved in the word loop; re-dispatch if it
                    // no longer points at an ASCII byte.
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The input ends inside this sequence: save the bytes seen
                    // so far in m_partialSequence.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
Beispiel #25
0
/** Is the character an ASCII control character? Always false outside ASCII. */
bool UString::isCntrl(uint32 c) {
	if (!isASCII(c))
		return false;

	return std::iscntrl(c) != 0;
}
Beispiel #26
0
// True when |c| may appear inside an identifier: for ASCII that is a letter,
// a digit, '$' or '_'; for anything else isNonASCIIIdentPart() decides.
static inline bool isIdentPart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentPart(c);

    return isASCIIAlphanumeric(c) || c == '$' || c == '_';
}
Beispiel #27
0
/** Is the character an ASCII digit? Always false outside ASCII. */
bool UString::isDigit(uint32 c) {
	if (!isASCII(c))
		return false;

	return std::isdigit(c) != 0;
}
Beispiel #28
0
/** Is the character an ASCII letter? Always false outside ASCII. */
bool UString::isAlpha(uint32 c) {
	if (!isASCII(c))
		return false;

	return std::isalpha(c) != 0;
}
Beispiel #29
0
// Runs every entry of testCases through the transitional (trans) and
// nontransitional (nontrans) IDNA instances and cross-checks the results:
//   1. nameToASCII/nameToUnicode against the expected string and error bits,
//   2. round trips (ToASCII of ToUnicode and vice versa),
//   3. the labelToXYZ variants against the nameToXYZ variants,
//   4. isTransitionalDifferent() against the test case mode,
//   5. the UTF-8 entry points against the UTF-16 ones.
// The first character of testCase.o selects the mode: 'N' and 'T' check the
// nontransitional resp. transitional expectations, 'B' checks both and
// additionally requires that both kinds of processing agree.
// The first failed check for a test case is reported via errln() and the
// remaining checks for that test case are skipped.
void UTS46Test::TestSomeCases() {
    IcuTestErrorCode errorCode(*this, "TestSomeCases");
    char buffer[400], buffer2[400];
    int32_t i;
    for(i=0; i<UPRV_LENGTHOF(testCases); ++i) {
        const TestCase &testCase=testCases[i];
        UnicodeString input(ctou(testCase.s));
        UnicodeString expected(ctou(testCase.u));
        // ToASCII/ToUnicode, transitional/nontransitional
        UnicodeString aT, uT, aN, uN;
        IDNAInfo aTInfo, uTInfo, aNInfo, uNInfo;
        trans->nameToASCII(input, aT, aTInfo, errorCode);
        trans->nameToUnicode(input, uT, uTInfo, errorCode);
        nontrans->nameToASCII(input, aN, aNInfo, errorCode);
        nontrans->nameToUnicode(input, uN, uNInfo, errorCode);
        if(errorCode.logIfFailureAndReset("first-level processing [%d/%s] %s",
                                          (int)i, testCase.o, testCase.s)
        ) {
            continue;
        }
        // ToUnicode does not set length-overflow errors.
        uint32_t uniErrors=testCase.errors&~
            (UIDNA_ERROR_LABEL_TOO_LONG|
             UIDNA_ERROR_DOMAIN_NAME_TOO_LONG);
        char mode=testCase.o[0];
        if(mode=='B' || mode=='N') {
            if(uNInfo.getErrors()!=uniErrors) {
                errln("N.nameToUnicode([%d] %s) unexpected errors %04lx",
                      (int)i, testCase.s, (long)uNInfo.getErrors());
                continue;
            }
            if(uN!=expected) {
                prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                errln("N.nameToUnicode([%d] %s) unexpected string %s",
                      (int)i, testCase.s, buffer);
                continue;
            }
            if(aNInfo.getErrors()!=testCase.errors) {
                errln("N.nameToASCII([%d] %s) unexpected errors %04lx",
                      (int)i, testCase.s, (long)aNInfo.getErrors());
                continue;
            }
        }
        if(mode=='B' || mode=='T') {
            if(uTInfo.getErrors()!=uniErrors) {
                errln("T.nameToUnicode([%d] %s) unexpected errors %04lx",
                      (int)i, testCase.s, (long)uTInfo.getErrors());
                continue;
            }
            if(uT!=expected) {
                prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                errln("T.nameToUnicode([%d] %s) unexpected string %s",
                      (int)i, testCase.s, buffer);
                continue;
            }
            if(aTInfo.getErrors()!=testCase.errors) {
                errln("T.nameToASCII([%d] %s) unexpected errors %04lx",
                      (int)i, testCase.s, (long)aTInfo.getErrors());
                continue;
            }
        }
        // ToASCII is all-ASCII if no severe errors
        // Note: error bitsets are cast to long wherever they are printed so
        // that the argument type matches the %04lx conversion.
        if((aNInfo.getErrors()&severeErrors)==0 && !isASCII(aN)) {
            prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            errln("N.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
                  (int)i, testCase.s, (long)aNInfo.getErrors(), buffer);
            continue;
        }
        if((aTInfo.getErrors()&severeErrors)==0 && !isASCII(aT)) {
            prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            errln("T.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s",
                  (int)i, testCase.s, (long)aTInfo.getErrors(), buffer);
            continue;
        }
        if(verbose) {
            char m= mode=='B' ? mode : 'N';
            prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            logln("%c.nameToASCII([%d] %s) (errors %04lx) result string: %s",
                  m, (int)i, testCase.s, (long)aNInfo.getErrors(), buffer);
            if(mode!='B') {
                prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                logln("T.nameToASCII([%d] %s) (errors %04lx) result string: %s",
                      (int)i, testCase.s, (long)aTInfo.getErrors(), buffer);
            }
        }
        // second-level processing
        // Feed the first-level outputs back through the nontransitional
        // instance; the round trips must reproduce the first-level results.
        UnicodeString aTuN, uTaN, aNuN, uNaN;
        IDNAInfo aTuNInfo, uTaNInfo, aNuNInfo, uNaNInfo;
        nontrans->nameToUnicode(aT, aTuN, aTuNInfo, errorCode);
        nontrans->nameToASCII(uT, uTaN, uTaNInfo, errorCode);
        nontrans->nameToUnicode(aN, aNuN, aNuNInfo, errorCode);
        nontrans->nameToASCII(uN, uNaN, uNaNInfo, errorCode);
        if(errorCode.logIfFailureAndReset("second-level processing [%d/%s] %s",
                                          (int)i, testCase.o, testCase.s)
        ) {
            continue;
        }
        if(aN!=uNaN) {
            prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            prettify(uNaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
            errln("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "
                  "(errors %04lx) %s vs. %s",
                  (int)i, testCase.s, (long)aNInfo.getErrors(), buffer, buffer2);
            continue;
        }
        if(aT!=uTaN) {
            prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            prettify(uTaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
            // Report the transitional ToASCII errors, matching the message.
            errln("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "
                  "(errors %04lx) %s vs. %s",
                  (int)i, testCase.s, (long)aTInfo.getErrors(), buffer, buffer2);
            continue;
        }
        if(uN!=aNuN) {
            prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            prettify(aNuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
            errln("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "
                  "(errors %04lx) %s vs. %s",
                  (int)i, testCase.s, (long)uNInfo.getErrors(), buffer, buffer2);
            continue;
        }
        if(uT!=aTuN) {
            prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
            prettify(aTuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
            // Report the transitional ToUnicode errors, matching the message.
            errln("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "
                  "(errors %04lx) %s vs. %s",
                  (int)i, testCase.s, (long)uTInfo.getErrors(), buffer, buffer2);
            continue;
        }
        // labelToUnicode
        UnicodeString aTL, uTL, aNL, uNL;
        IDNAInfo aTLInfo, uTLInfo, aNLInfo, uNLInfo;
        trans->labelToASCII(input, aTL, aTLInfo, errorCode);
        trans->labelToUnicode(input, uTL, uTLInfo, errorCode);
        nontrans->labelToASCII(input, aNL, aNLInfo, errorCode);
        nontrans->labelToUnicode(input, uNL, uNLInfo, errorCode);
        if(errorCode.logIfFailureAndReset("labelToXYZ processing [%d/%s] %s",
                                          (int)i, testCase.o, testCase.s)
        ) {
            continue;
        }
        // For each of the four results: if the name result contains no
        // U+002E dot, the label function must agree with the name function;
        // otherwise the label function must flag UIDNA_ERROR_LABEL_HAS_DOT.
        if(aN.indexOf((UChar)0x2e)<0) {
            if(aN!=aNL || aNInfo.getErrors()!=aNLInfo.getErrors()) {
                prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                prettify(aNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
                errln("N.nameToASCII([%d] %s)!=N.labelToASCII() "
                      "(errors %04lx vs %04lx) %s vs. %s",
                      (int)i, testCase.s,
                      (long)aNInfo.getErrors(), (long)aNLInfo.getErrors(), buffer, buffer2);
                continue;
            }
        } else {
            if((aNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
                errln("N.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
                      (int)i, testCase.s, (long)aNLInfo.getErrors());
                continue;
            }
        }
        if(aT.indexOf((UChar)0x2e)<0) {
            if(aT!=aTL || aTInfo.getErrors()!=aTLInfo.getErrors()) {
                prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                prettify(aTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
                errln("T.nameToASCII([%d] %s)!=T.labelToASCII() "
                      "(errors %04lx vs %04lx) %s vs. %s",
                      (int)i, testCase.s,
                      (long)aTInfo.getErrors(), (long)aTLInfo.getErrors(), buffer, buffer2);
                continue;
            }
        } else {
            if((aTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
                errln("T.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
                      (int)i, testCase.s, (long)aTLInfo.getErrors());
                continue;
            }
        }
        if(uN.indexOf((UChar)0x2e)<0) {
            if(uN!=uNL || uNInfo.getErrors()!=uNLInfo.getErrors()) {
                prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                prettify(uNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
                errln("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "
                      "(errors %04lx vs %04lx) %s vs. %s",
                      (int)i, testCase.s,
                      (long)uNInfo.getErrors(), (long)uNLInfo.getErrors(), buffer, buffer2);
                continue;
            }
        } else {
            if((uNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
                errln("N.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
                      (int)i, testCase.s, (long)uNLInfo.getErrors());
                continue;
            }
        }
        if(uT.indexOf((UChar)0x2e)<0) {
            if(uT!=uTL || uTInfo.getErrors()!=uTLInfo.getErrors()) {
                prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer));
                prettify(uTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2));
                errln("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "
                      "(errors %04lx vs %04lx) %s vs. %s",
                      (int)i, testCase.s,
                      (long)uTInfo.getErrors(), (long)uTLInfo.getErrors(), buffer, buffer2);
                continue;
            }
        } else {
            if((uTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) {
                errln("T.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT",
                      (int)i, testCase.s, (long)uTLInfo.getErrors());
                continue;
            }
        }
        // Differences between transitional and nontransitional processing
        if(mode=='B') {
            // Mode 'B': no result may report a transitional difference, and
            // both kinds of processing must yield the same strings and errors.
            if( aNInfo.isTransitionalDifferent() ||
                aTInfo.isTransitionalDifferent() ||
                uNInfo.isTransitionalDifferent() ||
                uTInfo.isTransitionalDifferent() ||
                aNLInfo.isTransitionalDifferent() ||
                aTLInfo.isTransitionalDifferent() ||
                uNLInfo.isTransitionalDifferent() ||
                uTLInfo.isTransitionalDifferent()
            ) {
                errln("B.process([%d] %s) isTransitionalDifferent()", (int)i, testCase.s);
                continue;
            }
            if( aN!=aT || uN!=uT || aNL!=aTL || uNL!=uTL ||
                aNInfo.getErrors()!=aTInfo.getErrors() || uNInfo.getErrors()!=uTInfo.getErrors() ||
                aNLInfo.getErrors()!=aTLInfo.getErrors() || uNLInfo.getErrors()!=uTLInfo.getErrors()
            ) {
                errln("N.process([%d] %s) vs. T.process() different errors or result strings",
                      (int)i, testCase.s);
                continue;
            }
        } else {
            // Any other mode: every result must report a transitional
            // difference, and the result strings must actually differ.
            if( !aNInfo.isTransitionalDifferent() ||
                !aTInfo.isTransitionalDifferent() ||
                !uNInfo.isTransitionalDifferent() ||
                !uTInfo.isTransitionalDifferent() ||
                !aNLInfo.isTransitionalDifferent() ||
                !aTLInfo.isTransitionalDifferent() ||
                !uNLInfo.isTransitionalDifferent() ||
                !uTLInfo.isTransitionalDifferent()
            ) {
                errln("%s.process([%d] %s) !isTransitionalDifferent()",
                      testCase.o, (int)i, testCase.s);
                continue;
            }
            if(aN==aT || uN==uT || aNL==aTL || uNL==uTL) {
                errln("N.process([%d] %s) vs. T.process() same result strings",
                      (int)i, testCase.s);
                continue;
            }
        }
        // UTF-8
        // Repeat the name conversions through the UTF-8 entry points and
        // compare errors, strings and transitional flags with the UTF-16 runs.
        std::string input8, aT8, uT8, aN8, uN8;
        StringByteSink<std::string> aT8Sink(&aT8), uT8Sink(&uT8), aN8Sink(&aN8), uN8Sink(&uN8);
        IDNAInfo aT8Info, uT8Info, aN8Info, uN8Info;
        input.toUTF8String(input8);
        trans->nameToASCII_UTF8(input8, aT8Sink, aT8Info, errorCode);
        trans->nameToUnicodeUTF8(input8, uT8Sink, uT8Info, errorCode);
        nontrans->nameToASCII_UTF8(input8, aN8Sink, aN8Info, errorCode);
        nontrans->nameToUnicodeUTF8(input8, uN8Sink, uN8Info, errorCode);
        if(errorCode.logIfFailureAndReset("UTF-8 processing [%d/%s] %s",
                                          (int)i, testCase.o, testCase.s)
        ) {
            continue;
        }
        UnicodeString aT16(UnicodeString::fromUTF8(aT8));
        UnicodeString uT16(UnicodeString::fromUTF8(uT8));
        UnicodeString aN16(UnicodeString::fromUTF8(aN8));
        UnicodeString uN16(UnicodeString::fromUTF8(uN8));
        if( aN8Info.getErrors()!=aNInfo.getErrors() ||
            uN8Info.getErrors()!=uNInfo.getErrors()
        ) {
            errln("N.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
                  (int)i, testCase.s,
                  (long)aN8Info.getErrors(), (long)aNInfo.getErrors());
            continue;
        }
        if( aT8Info.getErrors()!=aTInfo.getErrors() ||
            uT8Info.getErrors()!=uTInfo.getErrors()
        ) {
            errln("T.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx",
                  (int)i, testCase.s,
                  (long)aT8Info.getErrors(), (long)aTInfo.getErrors());
            continue;
        }
        if(aT16!=aT || uT16!=uT || aN16!=aN || uN16!=uN) {
            errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different string results",
                  testCase.o, (int)i, testCase.s);
            continue;
        }
        if( aT8Info.isTransitionalDifferent()!=aTInfo.isTransitionalDifferent() ||
            uT8Info.isTransitionalDifferent()!=uTInfo.isTransitionalDifferent() ||
            aN8Info.isTransitionalDifferent()!=aNInfo.isTransitionalDifferent() ||
            uN8Info.isTransitionalDifferent()!=uNInfo.isTransitionalDifferent()
        ) {
            errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different isTransitionalDifferent()",
                  testCase.o, (int)i, testCase.s);
            continue;
        }
    }
}
Beispiel #30
0
// Returns true when c is an ASCII code point that std::isalnum() classifies
// as alphanumeric. Non-ASCII values are rejected before the <cctype>
// classifier is consulted.
bool UString::isAlNum(uint32 c) {
	if (!isASCII(c))
		return false;

	return std::isalnum(c) != 0;
}