// Reads uRawSize raw bytes through Read() and turns them into a string.
//
// Unicode builds:
//   - If the bytes start with EF BB BF, the remainder is decoded with
//     ByteStreamToWideChar().
//   - Otherwise, if bOptUTF8 is set, a UTF-8 decode is attempted with utf8towc().
//   - If neither applies, or the decoder reports a negative result, the bytes are
//     handed to CStringW(char*, len) (conversion with the local codepage).
//   Inputs up to SHORT_RAW_ED2K_UTF8_STR bytes use fixed-size local arrays; larger
//   inputs use Array<> buffers sized by uRawSize.
//
// Non-Unicode builds: the bytes are read straight into a CStringA and returned.
CString CFileDataIO::ReadString(bool bOptUTF8, UINT uRawSize)
{
#ifdef _UNICODE
	const UINT uMaxShortRawSize = SHORT_RAW_ED2K_UTF8_STR;

	if (uRawSize > uMaxShortRawSize)
	{
		// Long string: buffers sized at run time.
		Array<char> acRaw(uRawSize);
		Read(acRaw, uRawSize);

		const bool bHasBOM = uRawSize >= 3
			&& (UCHAR)acRaw[0] == 0xEFU
			&& (UCHAR)acRaw[1] == 0xBBU
			&& (UCHAR)acRaw[2] == 0xBFU;

		if (bHasBOM)
		{
			Array<WCHAR> awc(uRawSize);
			int iChars = ByteStreamToWideChar(acRaw + 3, uRawSize - 3, awc, uRawSize);
			if (iChars >= 0)
				return CStringW(awc, iChars);
		}
		else if (bOptUTF8)
		{
			Array<WCHAR> awc(uRawSize);
			int iChars = utf8towc(acRaw, uRawSize, awc, uRawSize);
			if (iChars >= 0)
				return CStringW(awc, iChars);
		}
		return CStringW(acRaw, uRawSize);	// use local codepage
	}

	// Short string: fixed-size local buffers.
	char acRaw[uMaxShortRawSize];
	Read(acRaw, uRawSize);

	const bool bHasBOM = uRawSize >= 3
		&& (UCHAR)acRaw[0] == 0xEFU
		&& (UCHAR)acRaw[1] == 0xBBU
		&& (UCHAR)acRaw[2] == 0xBFU;

	if (bHasBOM)
	{
		WCHAR awc[uMaxShortRawSize];
		int iChars = ByteStreamToWideChar(acRaw + 3, uRawSize - 3, awc, ARRSIZE(awc));
		if (iChars >= 0)
			return CStringW(awc, iChars);
	}
	else if (bOptUTF8)
	{
		WCHAR awc[uMaxShortRawSize];
		int iChars = utf8towc(acRaw, uRawSize, awc, ARRSIZE(awc));
		if (iChars >= 0)
			return CStringW(awc, iChars);
	}
	return CStringW(acRaw, uRawSize);	// use local codepage
#else
	CStringA strA;
	Read(strA.GetBuffer(uRawSize), uRawSize);
	strA.ReleaseBuffer(uRawSize);
	return strA;
#endif
}
// Tries to undo a "double encoded" UTF-8 file name: a wide string whose characters
// are really the individual bytes of a UTF-8 sequence.
//
// pszFileName  NUL-terminated wide string (must not be NULL; it is passed to wcslen).
//
// Returns the decoded string when every character fits into one byte AND the
// resulting byte sequence is accepted by utf8towc(). In every other case the
// original string is returned unchanged.
CStringW DecodeDoubleEncodedUtf8(LPCWSTR pszFileName)
{
	const size_t nChars = wcslen(pszFileName);

	// Check if all characters are valid for UTF-8 value range
	//
	for (size_t i = 0; i < nChars; i++) {
		if ((_TUCHAR)pszFileName[i] > 0xFFU)
			return pszFileName; // string is already using Unicode character value range; return original
	}

	// The string buffer APIs take 'int' lengths and the decoder is called with
	// 32-bit sizes elsewhere. Narrow explicitly once instead of silencing C4267
	// with '#pragma warning(disable)', which was never restored and therefore
	// leaked into the rest of the translation unit.
	const int iChars = static_cast<int>(nChars);
	const UINT uChars = static_cast<UINT>(nChars);

	// Transform Unicode string to UTF-8 byte sequence
	//
	CStringA strA;
	LPSTR pszA = strA.GetBuffer(iChars);
	for (size_t i = 0; i < nChars; i++)
		pszA[i] = (CHAR)pszFileName[i];
	strA.ReleaseBuffer(iChars);

	// Decode the string with UTF-8
	//
	CStringW strW;
	LPWSTR pszW = strW.GetBuffer(iChars);
	int iNewChars = utf8towc(strA, uChars, pszW, uChars);
	if (iNewChars < 0) {
		strW.ReleaseBuffer(0);
		return pszFileName;		// conversion error (not a valid UTF-8 string); return original
	}
	strW.ReleaseBuffer(iNewChars);

	return strW;
}
void Parser::parse(const char* src) { const char* ptr = src; mNodeStart = ptr; ERRNO(mbtowc(NULL, NULL, 0)); // reset shift state. while(*ptr) { // skip invalid utf-8 sequences. wchar_t w; int res = utf8towc(&w, ptr, 4); if(res <= 0) { if(res < 0) { printf("Invalid UTF-8 0x%x @ pos %" PRIuPTR "\n", (unsigned char)*ptr, ptr - src); } FLUSH; ptr++; mNodeStart = ptr; continue; } else if(res > 1) { // valid utf-8 beyond ascii ptr += res; continue; } if(STREQ(ptr, "http://")) { // unescaped link FLUSH; ptr = parseUnescapedUrl(ptr); mNodeStart = ptr; continue; } char c = *ptr; ptr++; if(c == '[') { // start tag while(*ptr == '[') { ptr++; } const char* endPtr = strchr(ptr, ']'); if(!endPtr) { break; } const char* newEndPtr = fixTag(ptr, endPtr); bool diff = newEndPtr != endPtr; endPtr = newEndPtr; flush(ptr-1); if(!diff) { mNodeStart = ptr; parseTag(ptr, endPtr - ptr); } ptr = endPtr; if(!diff) { ptr++; } mNodeStart = ptr; } else if(c == '\\' && *ptr == 'n') { flush(ptr-1); ptr++; mNodeStart = ptr; addLinebreakNode(); } } FLUSH; }
/*
 * Decode one character from the first n bytes of str via utf8towc().
 *
 * *up is written only when utf8towc() reports a positive result. The decoder's
 * int result is returned converted to size_t, so a negative result comes back
 * as a very large unsigned value.
 */
size_t utf8toutf32(uint32_t *up, const char *str, size_t n)
{
	wchar_t decoded;
	const int consumed = utf8towc(&decoded, str, n);

	if (consumed > 0)
		*up = (uint32_t)decoded;

	return consumed;
}
// Converts the first iLen bytes of psz to a string, preferring UTF-8.
//
// psz   source bytes
// iLen  number of bytes of psz to consider
//
// If utf8towc() decodes at least one character, the decoded text is returned.
// Otherwise (result <= 0) the bytes are converted with the local codepage.
//
// The fallback is limited to iLen bytes and stops at the first NUL inside that
// range. It used to assign psz directly, which ignored iLen: it read past the
// end of buffers that are not NUL-terminated and returned more text than the
// caller asked for (including the whole string when iLen == 0).
CString OptUtf8ToStr(LPCSTR psz, int iLen)
{
	CStringW wstr;
	int iMaxWideStrLen = iLen;
	LPWSTR pwsz = wstr.GetBuffer(iMaxWideStrLen);
	int iWideChars = utf8towc(psz, iLen, pwsz, iMaxWideStrLen);
	if (iWideChars <= 0)
	{
		// invalid UTF8 string...
		wstr.ReleaseBuffer(0);

		// Length of the text inside [psz, psz + iLen), up to the first NUL.
		int iAnsiLen = 0;
		while (iAnsiLen < iLen && psz[iAnsiLen] != '\0')
			iAnsiLen++;

		wstr = CStringW(psz, iAnsiLen);	// convert with local codepage
	}
	else
		wstr.ReleaseBuffer(iWideChars);
	return wstr;					// just return the string
}
// Converts rastr to a string, preferring UTF-8.
//
// The whole of rastr is offered to utf8towc(). If that yields at least one
// character, the decoded text is returned. Otherwise rastr is assigned to the
// wide string directly (conversion with the local codepage).
CString OptUtf8ToStr(const CStringA& rastr)
{
	const int iSrcLen = rastr.GetLength();

	CStringW wstrResult;
	LPWSTR pwszOut = wstrResult.GetBuffer(iSrcLen);
	const int iConverted = utf8towc(rastr, iSrcLen, pwszOut, iSrcLen);

	if (iConverted > 0)
	{
		wstrResult.ReleaseBuffer(iConverted);
		return wstrResult;
	}

	// invalid UTF8 string (or nothing decoded)
	wstrResult.ReleaseBuffer(0);
	wstrResult = rastr;				// convert with local codepage
	return wstrResult;
}
// Converts a byte stream to wide characters.
//
// First tries utf8towc(). If that reports a negative result, every source byte
// is widened one-to-one into pwc instead, until either size is used up or a
// zero byte has been copied.
//
// Returns the utf8towc() result when it is non-negative. Otherwise returns the
// number of wide characters written by the fallback, which includes the zero
// character if one was copied.
int ByteStreamToWideChar(LPCSTR pcUtf8, UINT uUtf8Size, LPWSTR pwc, UINT uWideCharSize)
{
	const int iDecoded = utf8towc(pcUtf8, uUtf8Size, pwc, uWideCharSize);
	if (iDecoded >= 0)
		return iDecoded;

	// Fallback: widen each byte as-is.
	LPWSTR pwcOut = pwc;
	for (; uUtf8Size != 0 && uWideCharSize != 0; uUtf8Size--, uWideCharSize--)
	{
		*pwcOut = (BYTE)*pcUtf8++;
		if (*pwcOut++ == L'\0')
			break;
	}
	return static_cast<int>(pwcOut - pwc);
}
wchar_t *CPLRecodeToWCharStub( const char *pszSource, const char *pszSrcEncoding, const char *pszDstEncoding ) { char *pszUTF8Source = (char *) pszSource; if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0 && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 ) { pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 ); if( pszUTF8Source == NULL ) return NULL; } /* -------------------------------------------------------------------- */ /* We try to avoid changes of character set. We are just */ /* providing for unicode to unicode. */ /* -------------------------------------------------------------------- */ if( strcmp(pszDstEncoding,"WCHAR_T") != 0 && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0 && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0 && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 ) { CPLError( CE_Failure, CPLE_AppDefined, "Stub recoding implementation does not support\n" "CPLRecodeToWCharStub(...,%s,%s)", pszSrcEncoding, pszDstEncoding ); return NULL; } /* -------------------------------------------------------------------- */ /* Do the UTF-8 to UCS-2 recoding. */ /* -------------------------------------------------------------------- */ int nSrcLen = strlen(pszUTF8Source); wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1); utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 ); if( pszUTF8Source != pszSource ) CPLFree( pszUTF8Source ); return pwszResult; }
// Converts a byte stream to wide characters, never failing.
//
// utf8towc() is tried first. On a negative result the source bytes are copied
// one-to-one into pwc until either size reaches zero or a zero byte has been
// copied. In that case the return value is the number of wide characters
// written (the zero character counts if it was copied).
int ByteStreamToWideChar(LPCSTR pcUtf8, UINT uUtf8Size, LPWSTR pwc, UINT uWideCharSize)
{
	int iResult = utf8towc(pcUtf8, uUtf8Size, pwc, uWideCharSize);
	if (iResult < 0)
	{
		LPWSTR const pwcBegin = pwc;
		for (;;)
		{
			if (uUtf8Size == 0 || uWideCharSize == 0)
				break;

			*pwc = (BYTE)*pcUtf8;
			const bool bTerminator = (*pwc == L'\0');
			++pwc;
			++pcUtf8;
			if (bTerminator)
				break;

			--uUtf8Size;
			--uWideCharSize;
		}
#pragma warning(disable : 4244)
		iResult = pwc - pwcBegin;
#pragma warning(default : 4244)
	}
	return iResult;
}
// Runs `query` against the comment database and builds a "comments" tab from
// the result rows.
//
// query  SQL text passed to sqlite3_prepare_v2(). Columns are read as
//        0 = user, 1 = body, 2 = rating, 3 = date, 4 = indent, 5 = id.
// log    passed to the Formatter; when set, each comment id is also printed.
//
// The database is opened on first use and closeDb is registered with atexit().
// Returns a heap-allocated commentTabChtml; the caller receives ownership.
// If formatting a comment raises Exception, the statement is finalized, the
// partially built tab is deleted and the same exception object is rethrown.
static Tab* getComments(const char* query, bool log) {
	sqlite3_stmt* stmt = NULL;
	if(!sDB) {
		SQLT(sqlite3_open("../wowfoot-import/imports.db", &sDB));
		atexit(&closeDb);
	}
	SQLT(sqlite3_prepare_v2(sDB, query, -1, &stmt, NULL));

	int res;
	commentTabChtml* ct = new commentTabChtml();
	Formatter f(log);
	while((res = sqlite3_step(stmt)) == SQLITE_ROW) {
		Comment c;
		try {
			c.user = (const char*)sqlite3_column_text(stmt, 0);
			c.originalBody = (const char*)sqlite3_column_text(stmt, 1);

			// Sanitize the copy of the body kept in originalBody.
			for(size_t i=0; i<c.originalBody.size(); ) {
#if 0
				// blank out invalid utf-8 sequences.
				wchar_t w;
				int rs = utf8towc(&w, c.originalBody.c_str() + i, c.originalBody.size() - i);
				if(rs <= 0) {
					c.originalBody[i] = ' ';
					i++;
					continue;
				}
#else
				// blank out all non-printable ascii characters.
				// The range test comes first so iscntrl() only ever sees 0..127.
				if((unsigned int)c.originalBody[i] > 127 || iscntrl(c.originalBody[i]))
					c.originalBody[i] = ' ';
#endif
				// transform '-' to avoid the HTML end-comment combo "-->".
				if(c.originalBody[i] == '-')
					c.originalBody[i] = '_';
				// transform EOL for readability.
				if(i >= 1)
					if(c.originalBody[i] == 'n' && c.originalBody[i-1] == '\\')
						c.originalBody[i] = '\n';
				i++;
			}

			c.rating = sqlite3_column_int(stmt, 2);
			c.date = (const char*)sqlite3_column_text(stmt, 3);
			c.indent = sqlite3_column_int(stmt, 4);
			c.id = sqlite3_column_int(stmt, 5);
			if(log)
				printf("Comment %i\n", c.id);
			// The formatted body is produced from the raw column text, not from
			// the sanitized originalBody.
			c.body = f.formatComment((const char*)sqlite3_column_text(stmt, 1));
			ct->mComments.push_back(c);
		} catch(Exception&) {
			printf("Exception in comment %i\n", c.id);
			// Nobody else holds the tab yet; free it so it does not leak.
			delete ct;
			SQLT(sqlite3_finalize(stmt));
			// Plain rethrow keeps the original exception object (no copy, no slicing).
			throw;
		}
	}
	if(res != SQLITE_DONE) {
		SQLT(res);
	}
	SQLT(sqlite3_finalize(stmt));

	ct->id = "comments";
	ct->title = "Comments";
	ct->count = ct->mComments.size();
	return ct;
}