예제 #1
0
// Reads uRawSize raw bytes from the stream and returns them as a string.
//
// Unicode builds pick the decoding as follows:
//   1. bytes start with the UTF-8 BOM (EF BB BF) -> the remainder is decoded
//      with ByteStreamToWideChar();
//   2. no BOM, but bOptUTF8 is set               -> the bytes are decoded with
//      utf8towc();
//   3. otherwise, or when the chosen decoder reports a negative count, the
//      bytes are converted by CStringW's narrow-string constructor
//      (local codepage).
// Strings of at most SHORT_RAW_ED2K_UTF8_STR bytes use stack buffers, longer
// ones use Array<> buffers.
//
// Non-Unicode builds return the raw bytes unchanged; bOptUTF8 is ignored.
CString CFileDataIO::ReadString(bool bOptUTF8, UINT uRawSize)
{
#ifndef _UNICODE
	CStringA strRaw;
	Read(strRaw.GetBuffer(uRawSize), uRawSize);
	strRaw.ReleaseBuffer(uRawSize);
	return strRaw;
#else
	const UINT uStackLimit = SHORT_RAW_ED2K_UTF8_STR;

	if (uRawSize > uStackLimit)
	{
		// Long string: too big for the fixed-size stack buffers.
		Array<char> rawBytes(uRawSize);
		Read(rawBytes, uRawSize);

		const bool bHasBom = uRawSize >= 3
			&& (UCHAR)rawBytes[0] == 0xEFU
			&& (UCHAR)rawBytes[1] == 0xBBU
			&& (UCHAR)rawBytes[2] == 0xBFU;

		if (bHasBom)
		{
			// Skip the three BOM bytes before decoding.
			Array<WCHAR> wideBuf(uRawSize);
			const int nWide = ByteStreamToWideChar(rawBytes + 3, uRawSize - 3, wideBuf, uRawSize);
			if (nWide >= 0)
				return CStringW(wideBuf, nWide);
		}
		else if (bOptUTF8)
		{
			Array<WCHAR> wideBuf(uRawSize);
			const int nWide = utf8towc(rawBytes, uRawSize, wideBuf, uRawSize);
			if (nWide >= 0)
				return CStringW(wideBuf, nWide);
		}

		return CStringW(rawBytes, uRawSize); // use local codepage
	}

	// Short string: everything fits into stack buffers.
	char rawBytes[uStackLimit];
	Read(rawBytes, uRawSize);

	const bool bHasBom = uRawSize >= 3
		&& (UCHAR)rawBytes[0] == 0xEFU
		&& (UCHAR)rawBytes[1] == 0xBBU
		&& (UCHAR)rawBytes[2] == 0xBFU;

	if (bHasBom)
	{
		// Skip the three BOM bytes before decoding.
		WCHAR wideBuf[uStackLimit];
		const int nWide = ByteStreamToWideChar(rawBytes + 3, uRawSize - 3, wideBuf, ARRSIZE(wideBuf));
		if (nWide >= 0)
			return CStringW(wideBuf, nWide);
	}
	else if (bOptUTF8)
	{
		WCHAR wideBuf[uStackLimit];
		const int nWide = utf8towc(rawBytes, uRawSize, wideBuf, ARRSIZE(wideBuf));
		if (nWide >= 0)
			return CStringW(wideBuf, nWide);
	}

	return CStringW(rawBytes, uRawSize); // use local codepage
#endif
}
예제 #2
0
// Repairs a "double encoded" file name, i.e. a wide string in which every
// character actually holds one byte of a UTF-8 sequence.
//
// pszFileName  NUL-terminated wide string to examine.
//
// Returns the UTF-8 decoded string when every character is <= 0xFF and the
// resulting byte sequence is accepted by utf8towc(); otherwise returns a copy
// of the original string.
CStringW DecodeDoubleEncodedUtf8(LPCWSTR pszFileName)
{
    const size_t nChars = wcslen(pszFileName);

    // Check if all characters are valid for UTF-8 value range.
    // The wide character is compared directly: casting it to a TCHAR-dependent
    // type first would truncate it to 8 bits in non-Unicode builds and make
    // this test always pass.
    for (size_t i = 0; i < nChars; i++) {
        if (pszFileName[i] > 0xFFU)
            return pszFileName; // string is already using Unicode character value range; return original
    }

    // The CString buffer API works with 'int' lengths. Narrow once, explicitly,
    // instead of disabling warning C4267 (which the original never re-enabled,
    // leaving it switched off for the rest of the translation unit).
    const int iChars = static_cast<int>(nChars);

    // Transform Unicode string to UTF-8 byte sequence: one byte per character.
    CStringA strA;
    LPSTR pszA = strA.GetBuffer(iChars);
    for (int i = 0; i < iChars; i++)
        pszA[i] = (CHAR)pszFileName[i];
    strA.ReleaseBuffer(iChars);

    // Decode the string with UTF-8. The output buffer is sized to the input
    // length, exactly as before.
    CStringW strW;
    LPWSTR pszW = strW.GetBuffer(iChars);
    const int iNewChars = utf8towc(strA, iChars, pszW, iChars);
    if (iNewChars < 0) {
        strW.ReleaseBuffer(0);
        return pszFileName;		// conversion error (not a valid UTF-8 string); return original
    }
    strW.ReleaseBuffer(iNewChars);

    return strW;
}
예제 #3
0
파일: parser.cpp 프로젝트: Masken3/wowfoot
// Walks the NUL-terminated text `src` one byte/character at a time and splits
// it into nodes: plain text, unescaped "http://" links, [tags] and the literal
// two-character sequence backslash + 'n', which becomes a line-break node.
//
// mNodeStart is updated throughout to mark where the text that has not been
// flushed yet begins. FLUSH, STREQ and ERRNO are macros defined outside this
// function; presumably FLUSH flushes pending text up to the current `ptr` and
// STREQ tests whether `ptr` starts with the given literal -- confirm against
// their definitions.
void Parser::parse(const char* src) {
	const char* ptr = src;
	mNodeStart = ptr;
	ERRNO(mbtowc(NULL, NULL, 0));	// reset shift state.
	while(*ptr) {
		// skip invalid utf-8 sequences.
		// `res` is used below as the number of bytes making up the character
		// at `ptr`; at most 4 bytes are offered to the decoder.
		wchar_t w;
		int res = utf8towc(&w, ptr, 4);
		if(res <= 0) {
			// Only a negative result is reported; a zero result is skipped silently.
			if(res < 0) {
				printf("Invalid UTF-8 0x%x @ pos %" PRIuPTR "\n", (unsigned char)*ptr, ptr - src);
			}
			// Flush what came before, then restart the pending text after the
			// offending byte.
			FLUSH;
			ptr++;
			mNodeStart = ptr;
			continue;
		} else if(res > 1) {	// valid utf-8 beyond ascii
			// Multi-byte characters are never markup; step over them as a unit.
			ptr += res;
			continue;
		}

		// From here on the current character is a single byte (res == 1).
		if(STREQ(ptr, "http://")) {	// unescaped link
			FLUSH;
			// parseUnescapedUrl() returns the position at which parsing resumes.
			ptr = parseUnescapedUrl(ptr);
			mNodeStart = ptr;
			continue;
		}

		char c = *ptr;
		ptr++;
		if(c == '[') {	// start tag
			// Collapse a run of '[' so that `ptr` ends up just past the last one.
			while(*ptr == '[') {
				ptr++;
			}
			const char* endPtr = strchr(ptr, ']');
			if(!endPtr) {
				// No closing ']' anywhere in the rest of the input: stop scanning.
				// The FLUSH after the loop still runs.
				break;
			}
			// fixTag() may return a different end position; `diff` records
			// whether it did.
			const char* newEndPtr = fixTag(ptr, endPtr);
			bool diff = newEndPtr != endPtr;
			endPtr = newEndPtr;
			// Flush up to the last '[' (ptr-1), i.e. the text before the tag.
			flush(ptr-1);
			if(!diff) {
				// End position unchanged: parse the text between '[' and ']' as a tag.
				mNodeStart = ptr;
				parseTag(ptr, endPtr - ptr);
			}
			ptr = endPtr;
			if(!diff) {
				// Step over the closing ']'. When fixTag() moved the end, parsing
				// resumes exactly at the position it returned.
				ptr++;
			}
			mNodeStart = ptr;
		} else if(c == '\\' && *ptr == 'n') {
			// Literal backslash followed by 'n': flush the text before the
			// backslash, skip both characters and add a line-break node.
			flush(ptr-1);
			ptr++;
			mNodeStart = ptr;
			addLinebreakNode();
		}
	}
	// Flush whatever is still pending at the end of the input.
	FLUSH;
}
예제 #4
0
파일: utf8toutf32.c 프로젝트: att/ast
/*
 * Decode one character from the first n bytes of str via utf8towc().
 *
 * When utf8towc() reports a positive result, the decoded character is stored
 * in *up as a 32-bit value; otherwise *up is left untouched. The utf8towc()
 * result is returned converted to size_t, so a negative result reaches the
 * caller as a very large size_t value.
 */
size_t utf8toutf32(uint32_t *up, const char *str, size_t n) {
    wchar_t decoded;
    int consumed = utf8towc(&decoded, str, n);

    if (consumed > 0) {
        *up = (uint32_t)decoded;
    }
    return consumed;
}
예제 #5
0
// Converts the first iLen bytes of psz to a string, treating them as UTF-8
// when possible.
//
// psz   Source bytes.
// iLen  Number of bytes of psz to convert.
//
// If utf8towc() accepts the bytes and yields at least one character, the
// decoded text is returned. Otherwise (invalid UTF-8, or nothing converted)
// the same iLen bytes are converted by CStringW's narrow-string constructor
// (local codepage).
CString OptUtf8ToStr(LPCSTR psz, int iLen)
{
	CStringW wstr;
	// The output buffer is sized to the input length, one wide char per byte.
	int iMaxWideStrLen = iLen;
	LPWSTR pwsz = wstr.GetBuffer(iMaxWideStrLen);
	int iWideChars = utf8towc(psz, iLen, pwsz, iMaxWideStrLen);
	if (iWideChars <= 0)
	{
		// invalid UTF8 string...
		wstr.ReleaseBuffer(0);
		// Convert with local codepage, honoring iLen. Assigning psz directly
		// would read up to the first NUL instead, which over-reads buffers
		// that are not NUL-terminated and disagrees with the UTF-8 path.
		wstr = CStringW(psz, iLen);
	}
	else
		wstr.ReleaseBuffer(iWideChars);
	return wstr;					// just return the string
}
예제 #6
0
// Converts rastr to a string, treating its content as UTF-8 when possible.
//
// If utf8towc() accepts the bytes and yields at least one character, the
// decoded text is returned; otherwise rastr is assigned to the wide string
// as-is (local codepage conversion).
CString OptUtf8ToStr(const CStringA& rastr)
{
	const int iSrcLen = rastr.GetLength();

	// One wide character per source byte is the size used for the output.
	CStringW wstrResult;
	LPWSTR pwszOut = wstrResult.GetBuffer(iSrcLen);
	const int iConverted = utf8towc(rastr, iSrcLen, pwszOut, iSrcLen);

	if (iConverted > 0)
	{
		wstrResult.ReleaseBuffer(iConverted);
		return wstrResult;
	}

	// Invalid UTF-8 (or nothing converted): discard the buffer content and
	// fall back to the local codepage.
	wstrResult.ReleaseBuffer(0);
	wstrResult = rastr;
	return wstrResult;
}
예제 #7
0
// Converts uUtf8Size bytes at pcUtf8 into wide characters stored at pwc
// (capacity uWideCharSize) and returns the number of wide characters written.
//
// utf8towc() is tried first. If it reports a negative result, every byte is
// instead widened unchanged to one wide character, stopping when either
// buffer is exhausted or right after a NUL byte has been copied (that NUL is
// included in the returned count).
int ByteStreamToWideChar(LPCSTR pcUtf8, UINT uUtf8Size, LPWSTR pwc, UINT uWideCharSize)
{
	const int iDecoded = utf8towc(pcUtf8, uUtf8Size, pwc, uWideCharSize);
	if (iDecoded >= 0)
		return iDecoded;

	// Fallback: plain byte-to-wide-char widening.
	LPWSTR pwcOut = pwc;
	for (; uUtf8Size != 0 && uWideCharSize != 0; --uUtf8Size, --uWideCharSize)
	{
		*pwcOut = (BYTE)*pcUtf8++;
		if (*pwcOut++ == L'\0')
			break;
	}

	int iWidened = pwcOut - pwc;
	return iWidened;
}
예제 #8
0
/* -------------------------------------------------------------------- */
/*      Convert a NUL-terminated string from pszSrcEncoding to a        */
/*      wchar_t string.                                                 */
/*                                                                      */
/*      pszSource      - source string.                                 */
/*      pszSrcEncoding - encoding of pszSource. Anything other than     */
/*                       UTF-8 / ASCII is first recoded to UTF-8 with   */
/*                       CPLRecodeStub().                               */
/*      pszDstEncoding - must be "WCHAR_T", UCS-2, UCS-4 or UTF-16.     */
/*                                                                      */
/*      Returns a newly allocated (CPLCalloc) wide string, or NULL      */
/*      if the intermediate recoding fails or the destination           */
/*      encoding is not supported (a CPLError is emitted in the         */
/*      latter case).                                                   */
/* -------------------------------------------------------------------- */
wchar_t *CPLRecodeToWCharStub( const char *pszSource,
                               const char *pszSrcEncoding, 
                               const char *pszDstEncoding )

{
    char *pszUTF8Source = (char *) pszSource;

    if( strcmp(pszSrcEncoding,CPL_ENC_UTF8) != 0 
        && strcmp(pszSrcEncoding,CPL_ENC_ASCII) != 0 )
    {
        pszUTF8Source = CPLRecodeStub( pszSource, pszSrcEncoding, CPL_ENC_UTF8 );
        if( pszUTF8Source == NULL )
            return NULL;
    }

/* -------------------------------------------------------------------- */
/*      We try to avoid changes of character set.  We are just          */
/*      providing for unicode to unicode.                               */
/* -------------------------------------------------------------------- */
    if( strcmp(pszDstEncoding,"WCHAR_T") != 0
        && strcmp(pszDstEncoding,CPL_ENC_UCS2) != 0
        && strcmp(pszDstEncoding,CPL_ENC_UCS4) != 0 
        && strcmp(pszDstEncoding,CPL_ENC_UTF16) != 0 )
    {
        CPLError( CE_Failure, CPLE_AppDefined,
                  "Stub recoding implementation does not support\n"
                  "CPLRecodeToWCharStub(...,%s,%s)", 
                  pszSrcEncoding, pszDstEncoding );

        /* Release the intermediate UTF-8 copy, if one was made above;  */
        /* it was previously leaked on this error path.                 */
        if( pszUTF8Source != pszSource )
            CPLFree( pszUTF8Source );
        return NULL;
    }

/* -------------------------------------------------------------------- */
/*      Do the UTF-8 to UCS-2 recoding.                                 */
/*      The output never needs more wide characters than there are      */
/*      source bytes; the extra zero-initialized slot keeps the         */
/*      result terminated.                                              */
/* -------------------------------------------------------------------- */
    int nSrcLen = strlen(pszUTF8Source);
    wchar_t *pwszResult = (wchar_t *) CPLCalloc(sizeof(wchar_t),nSrcLen+1);

    utf8towc( pszUTF8Source, nSrcLen, pwszResult, nSrcLen+1 );

    if( pszUTF8Source != pszSource )
        CPLFree( pszUTF8Source );

    return pwszResult;
}
예제 #9
0
// Converts uUtf8Size bytes at pcUtf8 into wide characters stored at pwc
// (capacity uWideCharSize) and returns the number of wide characters written.
//
// utf8towc() is tried first. If it reports a negative result, every byte is
// instead widened unchanged to one wide character, stopping when either
// buffer is exhausted or right after a NUL byte has been copied (that NUL is
// included in the returned count).
int ByteStreamToWideChar(LPCSTR pcUtf8, UINT uUtf8Size, LPWSTR pwc, UINT uWideCharSize)
{
    int iWideChars = utf8towc(pcUtf8, uUtf8Size, pwc, uWideCharSize);
    if (iWideChars < 0)
    {
        // Not valid UTF-8: fall back to plain byte-to-wide-char widening.
        LPWSTR pwc0 = pwc;
        while (uUtf8Size && uWideCharSize)
        {
            if ((*pwc++ = (BYTE)*pcUtf8++) == L'\0')
                break;
            uUtf8Size--;
            uWideCharSize--;
        }
        // The pointer difference is bounded by the UINT buffer sizes. An explicit
        // cast replaces the former '#pragma warning(disable/default : 4244)'
        // pair, which reset C4244 to the compiler default instead of restoring
        // whatever state was active before.
        iWideChars = static_cast<int>(pwc - pwc0);
    }
    return iWideChars;
}
예제 #10
0
// Runs `query` against the imports database and builds a comment tab from the
// result rows.
//
// The database is opened on first use and closeDb is registered with atexit().
// Each row is read as: column 0 = user, 1 = body, 2 = rating, 3 = date,
// 4 = indent, 5 = id. A sanitized copy of the body is kept in originalBody,
// and the formatted body is produced by Formatter::formatComment() from the
// raw column text.
//
// query  SQL statement to execute.
// log    Passed to the Formatter; when true, each comment id is also printed.
//
// Returns a newly allocated commentTabChtml (owned by the caller).
// If an Exception is thrown while processing a row, the statement is
// finalized, the partially built tab is freed and the exception is rethrown.
// SQLT is a macro defined elsewhere; presumably it checks an SQLite result
// code -- confirm against its definition.
static Tab* getComments(const char* query, bool log) {
	sqlite3_stmt* stmt = NULL;
	if(!sDB) {
		SQLT(sqlite3_open("../wowfoot-import/imports.db", &sDB));
		atexit(&closeDb);
	}

	SQLT(sqlite3_prepare_v2(sDB, query, -1, &stmt, NULL));

	int res;
	commentTabChtml* ct = new commentTabChtml();
	Formatter f(log);
	while((res = sqlite3_step(stmt)) == SQLITE_ROW) {
		Comment c;
		try {
			// NOTE(review): sqlite3_column_text() can return NULL (e.g. for SQL
			// NULL values); the assignments below assume it does not -- confirm
			// against the schema.
			c.user = (const char*)sqlite3_column_text(stmt, 0);
			c.originalBody = (const char*)sqlite3_column_text(stmt, 1);
			for(size_t i=0; i<c.originalBody.size(); ) {
#if 0
				// blank out invalid utf-8 sequences.
				wchar_t w;
				int rs = utf8towc(&w, c.originalBody.c_str() + i, c.originalBody.size() - i);
				if(rs <= 0) {
					c.originalBody[i] = ' ';
					i++;
					continue;
				}
#else
				// blank out all non-printable ascii characters.
				// (The unsigned cast makes negative char values compare > 127 too.)
				if((unsigned int)c.originalBody[i] > 127 || iscntrl(c.originalBody[i]))
					c.originalBody[i] = ' ';
#endif
				// transform '-' to avoid the HTML end-comment combo "-->".
				if(c.originalBody[i] == '-')
					c.originalBody[i] = '_';
				// transform EOL for readability: the 'n' of a literal "\n" pair
				// becomes a real newline (the backslash itself is kept).
				if(i >= 1) if(c.originalBody[i] == 'n' && c.originalBody[i-1] == '\\')
					c.originalBody[i] = '\n';
				i++;
			}
			c.rating = sqlite3_column_int(stmt, 2);
			c.date = (const char*)sqlite3_column_text(stmt, 3);
			c.indent = sqlite3_column_int(stmt, 4);
			c.id = sqlite3_column_int(stmt, 5);
			if(log)
				printf("Comment %i\n", c.id);
			c.body = f.formatComment((const char*)sqlite3_column_text(stmt, 1));
			ct->mComments.push_back(c);
		} catch(Exception&) {
			printf("Exception in comment %i\n", c.id);
			// The tab is never handed to the caller on this path; free it
			// (it used to leak).
			delete ct;
			SQLT(sqlite3_finalize(stmt));
			// Rethrow the original exception object. 'throw e;' would throw a
			// copy of static type Exception and slice any derived type.
			throw;
		}
	}
	if(res != SQLITE_DONE) {
		SQLT(res);
	}
	SQLT(sqlite3_finalize(stmt));
	ct->id = "comments";
	ct->title = "Comments";
	ct->count = ct->mComments.size();
	return ct;
}