Example #1
0
/**
 * Validate a byte string as UTF-8, hex-escaping anything invalid.
 *
 * The input is scanned byte by byte.  Well-formed UTF-8 sequences are copied
 * through unchanged; every other byte — a malformed or truncated sequence, an
 * overlong encoding, a code point rejected by invalid_utf8unichar(), a
 * backslash, or an ASCII control character — is replaced by its hex escape
 * via hexesc().  The result is therefore guaranteed to be valid UTF-8.
 *
 * @param input  candidate UTF-8 byte string (taken by value; callers' copies
 *               are untouched).
 * @return       escaped, valid-UTF-8 version of the input.
 */
std::string validateOrEscapeUTF8(std::string input)
{
    std::string output;
    std::string::size_type i = 0;
    while( i < input.length() ) {
        uint8_t ch = (uint8_t)input.at(i);

        // utf8 1 byte (0xxx xxxx): plain ASCII, 0x00 .. 0x7f
        if((ch & 0x80)==0x00){
            if(ch=='\\'){               // escape the escape character itself
                output += hexesc(ch);
                i++;
                continue;
            }

            if( ch < ' '){              // control characters are not printable
                output += hexesc(ch);
                i++;
                continue;
            }

            output += ch;               // printable ASCII passes through
            i++;
            continue;
        }

        // utf8 2 bytes (110x xxxx prefix + one continuation byte)
        if(((ch & 0xe0)==0xc0)
           && (i+1 < input.length())
           && utf8cont((uint8_t)input.at(i+1))){
            // use uint32_t, not wchar_t: wchar_t is 16 bits on some platforms
            uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6) | (((uint8_t)input.at(i+1) & 0x3f));

            // Escape invalid code points and overlong encodings; a 2-byte
            // sequence must encode at least U+0080, which also rejects the
            // always-overlong lead byte 0xC0.
            if(invalid_utf8unichar(unichar) || (unichar <= 0x7f)){
                output += hexesc((uint8_t)input.at(i++));
                output += hexesc((uint8_t)input.at(i++));
                continue;
            }

            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            continue;
        }

        // utf8 3 bytes (1110 xxxx prefix + two continuation bytes)
        if(((ch & 0xf0) == 0xe0)
           && (i+2 < input.length())
           && utf8cont((uint8_t)input.at(i+1))
           && utf8cont((uint8_t)input.at(i+2))){
            uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
                | (((uint8_t)input.at(i+1) & 0x3f) << 6)
                | (((uint8_t)input.at(i+2) & 0x3f));

            // a 3-byte sequence must encode at least U+0800 (overlong check)
            if(invalid_utf8unichar(unichar) || unichar<=0x7ff){
                output += hexesc((uint8_t)input.at(i++));
                output += hexesc((uint8_t)input.at(i++));
                continue;
            }

            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            output += (uint8_t)input.at(i++);   // byte3
            continue;
        }

        // utf8 4 bytes (1111 0xxx prefix + three continuation bytes)
        if((( ch & 0xf8) == 0xf0)
           && (i+3 < input.length())
           && utf8cont((uint8_t)input.at(i+1))
           && utf8cont((uint8_t)input.at(i+2))
           && utf8cont((uint8_t)input.at(i+3))){
            uint32_t unichar =( (((uint8_t)input.at(i) & 0x07) << 18)
                                |(((uint8_t)input.at(i+1) & 0x3f) << 12)
                                |(((uint8_t)input.at(i+2) & 0x3f) <<  6)
                                |(((uint8_t)input.at(i+3) & 0x3f)));

            // BUGFIX: also reject overlong 4-byte encodings — a 4-byte
            // sequence must encode at least U+10000 (Unicode Table 3-7).
            // Previously sequences such as F0 80 80 80 passed as valid.
            if(invalid_utf8unichar(unichar) || unichar < 0x10000){
                output += hexesc((uint8_t)input.at(i++)); // byte 1
                output += hexesc((uint8_t)input.at(i++)); // byte 2
                output += hexesc((uint8_t)input.at(i++)); // byte 3
                output += hexesc((uint8_t)input.at(i++)); // byte 4
                continue;
            }
            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            output += (uint8_t)input.at(i++);   // byte3
            output += (uint8_t)input.at(i++);   // byte4
            continue;
        }

        // Anything else (bad lead byte, stray continuation byte, truncated
        // sequence at end of input): escape the single byte and move on.
        output += hexesc((uint8_t)input.at(i++));
    }
    return output;
}
Example #2
0
/**
 * Validate |input| as UTF-8, optionally escaping bad bytes and backslashes.
 *
 * Scans the input byte by byte.  Well-formed UTF-8 sequences (as judged by
 * valid_utf8codepoint() plus the standard overlong-encoding minimums) are
 * copied through unchanged.  Invalid bytes are hex-escaped via hexesc() when
 * escape_bad_utf8 is set; otherwise they are dropped (and, under
 * DEBUG_PEDANTIC, the offending input is dumped to bad_unicode.txt and the
 * program aborts).  ASCII control characters are always escaped; backslashes
 * are escaped only when escape_backslash is set.
 *
 * @param input             candidate UTF-8 byte string.
 * @param escape_bad_utf8   hex-escape invalid bytes instead of dropping them.
 * @param escape_backslash  hex-escape '\\' so the output is unambiguous.
 * @return                  validated (and possibly escaped) string.
 */
std::string validateOrEscapeUTF8(const std::string &input, bool escape_bad_utf8,bool escape_backslash)
{
    //
    // Fast path: if we are not escaping anything and not pedantically
    // validating, the input can be returned unchanged.
    if (escape_bad_utf8==false && escape_backslash==false && ((debug & DEBUG_PEDANTIC) == 0)){
        return input;
    }

    // validate or escape input
    std::string output;
    for(std::string::size_type i =0; i< input.length(); ) {
        uint8_t ch = (uint8_t)input.at(i);

        // utf8 1 byte prefix (0xxx xxxx)
        if((ch & 0x80)==0x00){          // 00 .. 0x7f
            if(ch=='\\' && escape_backslash){   // escape the escape character as \x5C
                output += hexesc(ch);
                i++;
                continue;
            }

            if( ch < ' '){              // non-printable control chars are escaped
                output += hexesc(ch);
                i++;
                continue;
            }
            output += ch;               // printable is not escaped
            i++;
            continue;
        }

        // utf8 2 bytes  (110x xxxx) prefix
        if(((ch & 0xe0)==0xc0)  // 2-byte prefix
           && (i+1 < input.length())
           && utf8cont((uint8_t)input.at(i+1))){
            uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6) | (((uint8_t)input.at(i+1) & 0x3f));

            // valid 2-byte encoding: code point must be at least U+0080
            // (this overlong check also rejects the 0xC0 lead byte, whose
            // sequences can only encode values below 0x80)
            if(valid_utf8codepoint(unichar)
               && (unichar >= 0x80)){
                output += (uint8_t)input.at(i++);       // byte1
                output += (uint8_t)input.at(i++);       // byte2
                continue;
            }
        }

        // utf8 3 bytes (1110 xxxx prefix)
        if(((ch & 0xf0) == 0xe0)
           && (i+2 < input.length())
           && utf8cont((uint8_t)input.at(i+1))
           && utf8cont((uint8_t)input.at(i+2))){
            uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
                | (((uint8_t)input.at(i+1) & 0x3f) << 6)
                | (((uint8_t)input.at(i+2) & 0x3f));

            // valid 3-byte encoding: code point must be at least U+0800
            if(valid_utf8codepoint(unichar)
               && unichar>=0x800){
                output += (uint8_t)input.at(i++);       // byte1
                output += (uint8_t)input.at(i++);       // byte2
                output += (uint8_t)input.at(i++);       // byte3
                continue;
            }
        }

        // utf8 4 bytes (1111 0xxx prefix)
        if((( ch & 0xf8) == 0xf0)
           && (i+3 < input.length())
           && utf8cont((uint8_t)input.at(i+1))
           && utf8cont((uint8_t)input.at(i+2))
           && utf8cont((uint8_t)input.at(i+3))){
            uint32_t unichar =( (((uint8_t)input.at(i) & 0x07) << 18)
                                |(((uint8_t)input.at(i+1) & 0x3f) << 12)
                                |(((uint8_t)input.at(i+2) & 0x3f) <<  6)
                                |(((uint8_t)input.at(i+3) & 0x3f)));

            // BUGFIX: the minimum code point for a 4-byte sequence is U+10000,
            // not 0x1000000.  The old test (unichar>=0x1000000) could never be
            // true (max Unicode code point is U+10FFFF), so every well-formed
            // 4-byte sequence was wrongly treated as invalid.
            if(valid_utf8codepoint(unichar) && unichar>=0x10000){
                output += (uint8_t)input.at(i++);       // byte1
                output += (uint8_t)input.at(i++);       // byte2
                output += (uint8_t)input.at(i++);       // byte3
                output += (uint8_t)input.at(i++);       // byte4
                continue;
            }
        }

        if (escape_bad_utf8) {
            // Just escape the next byte and carry on
            output += hexesc((uint8_t)input.at(i++));
        } else {
            // fatal if we are debug pedantic, otherwise just ignore
            // note: we shouldn't be here anyway, since if we are not escaping and we are not
            // pedantic we should have returned above
            if(debug & DEBUG_PEDANTIC){
                std::ofstream os("bad_unicode.txt");
                os << input << "\n";
                os.close();
                std::cerr << "INTERNAL ERROR: bad unicode stored in bad_unicode.txt\n";
                assert(0);
            }
        }
    }
    return output;
}