#include <cassert>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>

// helpers defined elsewhere in the program (a sketch of plausible
// implementations follows after this function):
std::string hexesc(uint8_t ch);
bool utf8cont(uint8_t ch);
bool invalid_utf8unichar(uint32_t unichar);
bool valid_utf8codepoint(uint32_t unichar);

std::string validateOrEscapeUTF8(std::string input)
{
    std::string output;
    std::string::size_type i = 0;
    while (i < input.length()) {
        uint8_t ch = (uint8_t)input.at(i);

        // utf8 1 byte prefix (0xxx xxxx)
        if ((ch & 0x80) == 0x00) {              // 00 .. 0x7f
            if (ch == '\\') {                   // escape the escape character
                output += hexesc(ch);
                i++;
                continue;
            }
            if (ch < ' ') {                     // not printable
                output += hexesc(ch);
                i++;
                continue;
            }
            output += ch;                       // printable
            i++;
            continue;
        }

        // utf8 2 bytes (110x xxxx prefix)
        if (((ch & 0xe0) == 0xc0)               // 2-byte prefix
            && (i + 1 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6)
                             | ((uint8_t)input.at(i + 1) & 0x3f);

            // check for invalid code points for this encoding;
            // unichar <= 0x7f catches the overlong forms 0xc0/0xc1 xx
            if (invalid_utf8unichar(unichar)
                || ((uint8_t)input.at(i) == 0xc0)
                || (unichar <= 0x7f)) {
                output += hexesc((uint8_t)input.at(i++));
                output += hexesc((uint8_t)input.at(i++));
                continue;
            }
            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            continue;
        }

        // utf8 3 bytes (1110 xxxx prefix)
        if (((ch & 0xf0) == 0xe0)
            && (i + 2 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))
            && utf8cont((uint8_t)input.at(i + 2))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
                             | (((uint8_t)input.at(i + 1) & 0x3f) << 6)
                             | ((uint8_t)input.at(i + 2) & 0x3f);

            // invalid code points; unichar <= 0x7ff catches overlong 3-byte forms.
            // only bytes 1 and 2 are escaped here; byte 3 is a lone
            // continuation byte and is escaped by the fall-through below
            if (invalid_utf8unichar(unichar) || unichar <= 0x7ff) {
                output += hexesc((uint8_t)input.at(i++));
                output += hexesc((uint8_t)input.at(i++));
                continue;
            }
            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            output += (uint8_t)input.at(i++);   // byte3
            continue;
        }

        // utf8 4 bytes (1111 0xxx prefix)
        if (((ch & 0xf8) == 0xf0)
            && (i + 3 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))
            && utf8cont((uint8_t)input.at(i + 2))
            && utf8cont((uint8_t)input.at(i + 3))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x07) << 18)
                             | (((uint8_t)input.at(i + 1) & 0x3f) << 12)
                             | (((uint8_t)input.at(i + 2) & 0x3f) << 6)
                             | ((uint8_t)input.at(i + 3) & 0x3f);

            // invalid code points; unichar < 0x10000 catches overlong
            // 4-byte forms, matching the 2- and 3-byte checks above
            if (invalid_utf8unichar(unichar) || unichar < 0x10000) {
                output += hexesc((uint8_t)input.at(i++)); // byte 1
                output += hexesc((uint8_t)input.at(i++)); // byte 2
                output += hexesc((uint8_t)input.at(i++)); // byte 3
                output += hexesc((uint8_t)input.at(i++)); // byte 4
                continue;
            }
            output += (uint8_t)input.at(i++);   // byte1
            output += (uint8_t)input.at(i++);   // byte2
            output += (uint8_t)input.at(i++);   // byte3
            output += (uint8_t)input.at(i++);   // byte4
            continue;
        }

        // Just escape it
        output += hexesc((uint8_t)input.at(i++));
    }
    return output;
}
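// The helpers declared above are not part of this excerpt. What follows is
// a minimal sketch of plausible implementations, not the originals:
// hexesc() is assumed to render a byte as a \xNN escape, utf8cont() to
// test for a 10xx xxxx continuation byte, and invalid_utf8unichar() /
// valid_utf8codepoint() to reject code points UTF-8 may not carry
// (UTF-16 surrogate halves, the non-characters 0xFFFE and 0xFFFF, and
// anything above 0x10FFFF).

#include <cstdio>

std::string hexesc(uint8_t ch)              // render one byte as \xNN
{
    char buf[8];
    snprintf(buf, sizeof(buf), "\\x%02X", ch);
    return std::string(buf);
}

bool utf8cont(uint8_t ch)                   // true for a 10xx xxxx continuation byte
{
    return (ch & 0xc0) == 0x80;
}

bool invalid_utf8unichar(uint32_t unichar)
{
    return (unichar >= 0xd800 && unichar <= 0xdfff)   // UTF-16 surrogate halves
        || (unichar == 0xfffe) || (unichar == 0xffff) // non-characters
        || (unichar > 0x10ffff);                      // beyond the Unicode range
}

bool valid_utf8codepoint(uint32_t unichar)  // the second version tests the complement
{
    return !invalid_utf8unichar(unichar);
}

// debug and DEBUG_PEDANTIC are likewise assumed to be a global flag word
// and a bit mask defined elsewhere in the program.
extern int debug;
#ifndef DEBUG_PEDANTIC
#define DEBUG_PEDANTIC 0x0001   // assumed value for this sketch
#endif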
std::string validateOrEscapeUTF8(const std::string &input,
                                 bool escape_bad_utf8, bool escape_backslash)
{
    // skip the validation if not escaping and not DEBUG_PEDANTIC
    if (escape_bad_utf8 == false && escape_backslash == false
        && ((debug & DEBUG_PEDANTIC) == 0)) {
        return input;
    }

    // validate or escape input
    std::string output;
    for (std::string::size_type i = 0; i < input.length();) {
        uint8_t ch = (uint8_t)input.at(i);

        // utf8 1 byte prefix (0xxx xxxx)
        if ((ch & 0x80) == 0x00) {              // 00 .. 0x7f
            if (ch == '\\' && escape_backslash) { // escape the escape character as \x5C
                output += hexesc(ch);
                i++;
                continue;
            }
            if (ch < ' ') {                     // non-printables are escaped
                output += hexesc(ch);
                i++;
                continue;
            }
            output += ch;                       // printable is not escaped
            i++;
            continue;
        }

        // utf8 2 bytes (110x xxxx prefix)
        if (((ch & 0xe0) == 0xc0)               // 2-byte prefix
            && (i + 1 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x1f) << 6)
                             | ((uint8_t)input.at(i + 1) & 0x3f);

            // check for a valid 2-byte encoding
            // (0xc0 xx and unichar < 0x80 are overlong forms)
            if (valid_utf8codepoint(unichar)
                && ((uint8_t)input.at(i) != 0xc0)
                && (unichar >= 0x80)) {
                output += (uint8_t)input.at(i++); // byte1
                output += (uint8_t)input.at(i++); // byte2
                continue;
            }
        }

        // utf8 3 bytes (1110 xxxx prefix)
        if (((ch & 0xf0) == 0xe0)
            && (i + 2 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))
            && utf8cont((uint8_t)input.at(i + 2))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x0f) << 12)
                             | (((uint8_t)input.at(i + 1) & 0x3f) << 6)
                             | ((uint8_t)input.at(i + 2) & 0x3f);

            // check for a valid 3-byte code point (values below 0x800 are overlong)
            if (valid_utf8codepoint(unichar) && unichar >= 0x800) {
                output += (uint8_t)input.at(i++); // byte1
                output += (uint8_t)input.at(i++); // byte2
                output += (uint8_t)input.at(i++); // byte3
                continue;
            }
        }

        // utf8 4 bytes (1111 0xxx prefix)
        if (((ch & 0xf8) == 0xf0)
            && (i + 3 < input.length())
            && utf8cont((uint8_t)input.at(i + 1))
            && utf8cont((uint8_t)input.at(i + 2))
            && utf8cont((uint8_t)input.at(i + 3))) {
            uint32_t unichar = (((uint8_t)input.at(i) & 0x07) << 18)
                             | (((uint8_t)input.at(i + 1) & 0x3f) << 12)
                             | (((uint8_t)input.at(i + 2) & 0x3f) << 6)
                             | ((uint8_t)input.at(i + 3) & 0x3f);

            // check for a valid 4-byte code point; the minimum value for a
            // 4-byte encoding is 0x10000 (anything lower is overlong)
            if (valid_utf8codepoint(unichar) && unichar >= 0x10000) {
                output += (uint8_t)input.at(i++); // byte1
                output += (uint8_t)input.at(i++); // byte2
                output += (uint8_t)input.at(i++); // byte3
                output += (uint8_t)input.at(i++); // byte4
                continue;
            }
        }

        if (escape_bad_utf8) {
            // Just escape the next byte and carry on
            output += hexesc((uint8_t)input.at(i++));
        } else {
            // fatal if we are debug pedantic; otherwise drop the bad byte.
            // (this branch is reachable when escape_backslash is set but
            // escape_bad_utf8 is not)
            if (debug & DEBUG_PEDANTIC) {
                std::ofstream os("bad_unicode.txt");
                os << input << "\n";
                os.close();
                std::cerr << "INTERNAL ERROR: bad unicode stored in bad_unicode.txt\n";
                assert(0);
            }
            i++;    // advance past the bad byte so the loop terminates
        }
    }
    return output;
}
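// A small usage sketch under the helper assumptions above: valid UTF-8
// passes through unchanged, while stray continuation bytes and control
// characters come back as \xNN escapes.
static void validateOrEscapeUTF8_demo()
{
    // a valid 2-byte sequence (0xc3 0xa9, U+00E9 "é") is passed through
    assert(validateOrEscapeUTF8("caf\xc3\xa9", true, true) == "caf\xc3\xa9");

    // a lone continuation byte is escaped
    assert(validateOrEscapeUTF8("a\x80" "b", true, true) == "a\\x80b");

    // a control character is escaped
    assert(validateOrEscapeUTF8("tab\tend", true, true) == "tab\\x09end");
}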