unsigned char* web_page::load_utf8_file( LPCWSTR path, bool is_html, LPCWSTR defEncoding ) { unsigned char* ret = NULL; HANDLE fl = CreateFile(path, GENERIC_READ, 0, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); if(fl != INVALID_HANDLE_VALUE) { DWORD size = GetFileSize(fl, NULL); ret = new unsigned char[size + 1]; DWORD cbRead = 0; if(size >= 3) { ReadFile(fl, ret, 3, &cbRead, NULL); if(ret[0] == '\xEF' && ret[1] == '\xBB' && ret[2] == '\xBF') { ReadFile(fl, ret, size - 3, &cbRead, NULL); ret[cbRead] = 0; } else { ReadFile(fl, ret + 3, size - 3, &cbRead, NULL); ret[cbRead + 3] = 0; } } CloseHandle(fl); } // try to convert encoding if(is_html) { std::wstring encoding; char* begin = StrStrIA((LPSTR) ret, "<meta"); while(begin && encoding.empty()) { char* end = StrStrIA(begin, ">"); char* s1 = StrStrIA(begin, "Content-Type"); if(s1 && s1 < end) { s1 = StrStrIA(begin, "charset"); if(s1) { s1 += strlen("charset"); while(!isalnum(s1[0]) && s1 < end) { s1++; } while((isalnum(s1[0]) || s1[0] == '-') && s1 < end) { encoding += s1[0]; s1++; } } } if(encoding.empty()) { begin = StrStrIA(begin + strlen("<meta"), "<meta"); } } if(encoding.empty() && defEncoding) { encoding = defEncoding; } if(!encoding.empty()) { if(!StrCmpI(encoding.c_str(), L"UTF-8")) { encoding.clear(); } } if(!encoding.empty()) { CoInitialize(NULL); IMultiLanguage* ml = NULL; HRESULT hr = CoCreateInstance(CLSID_CMultiLanguage, NULL, CLSCTX_INPROC_SERVER, IID_IMultiLanguage, (LPVOID*) &ml); MIMECSETINFO charset_src = {0}; MIMECSETINFO charset_dst = {0}; BSTR bstrCharSet = SysAllocString(encoding.c_str()); ml->GetCharsetInfo(bstrCharSet, &charset_src); SysFreeString(bstrCharSet); bstrCharSet = SysAllocString(L"utf-8"); ml->GetCharsetInfo(bstrCharSet, &charset_dst); SysFreeString(bstrCharSet); DWORD dwMode = 0; UINT szDst = (UINT) strlen((LPSTR) ret) * 4; LPSTR dst = new char[szDst]; if(ml->ConvertString(&dwMode, charset_src.uiInternetEncoding, charset_dst.uiInternetEncoding, (LPBYTE) ret, NULL, (LPBYTE) dst, &szDst) == S_OK) { dst[szDst] = 0; delete ret; ret = (unsigned char*) dst; } else { delete dst; } CoUninitialize(); } } return ret; }