/**
 * Builds an XWinFromUnicodeConverter for the given character set.
 *
 * An IMultiLanguage2 reference is obtained through RetainMultiLanguage() and
 * handed to the converter's constructor; this function's own reference is
 * released before returning.
 *
 * @param inCharSet  Character set passed through to the converter.
 * @return The new converter, or NULL when no IMultiLanguage2 could be
 *         retained, when allocation yielded NULL, or when the freshly built
 *         converter reports !IsValid() (in which case it is deleted here).
 */
/* static */ VFromUnicodeConverter* XWinIntlMgr::NewFromUnicodeConverter(CharSet inCharSet)
{
	IMultiLanguage2* mlang = RetainMultiLanguage();
	if (mlang == NULL)
		return NULL;

	XWinFromUnicodeConverter* result = new XWinFromUnicodeConverter(mlang, inCharSet);
	if (result != NULL && !result->IsValid())
	{
		// Unusable converter: discard it so the caller only ever sees NULL or a valid object.
		delete result;
		result = NULL;
	}

	mlang->Release();
	return result;
}
/// Releases the MLang interface pointer (when set) and then unloads the
/// library module handle (when set), in that order.
~CExconverterMLang()
{
	if (m_pmlang)
		m_pmlang->Release();

	if (m_hLibMLang)
		FreeLibrary(m_hLibMLang);
}
bool convertToUnicode(int srcCodepage, const char * src, size_t * srcbytes, wchar_t * dest, size_t *destchars) { UINT uisrcbytes = static_cast<UINT>(*srcbytes), uidestchars = static_cast<UINT>(*destchars); HRESULT hr = m_pmlang->ConvertStringToUnicode(&m_mlangcookie, srcCodepage, (char *)src, &uisrcbytes, dest, &uidestchars); *srcbytes = uisrcbytes; *destchars = uidestchars; return SUCCEEDED(hr) ? true : false; }
bool convertFromUnicode(int dstCodepage, const wchar_t * src, size_t * srcchars, char * dest, size_t *destbytes) { UINT uisrcchars = static_cast<UINT>(*srcchars), uidestbytes = static_cast<UINT>(*destbytes); HRESULT hr = m_pmlang->ConvertStringFromUnicode(&m_mlangcookie, dstCodepage, (wchar_t *)src, &uisrcchars, (char *)dest, &uidestbytes); *srcchars = uisrcchars; *destbytes = uidestbytes; return SUCCEEDED(hr) ? true : false; }
bool getCodePageInfo(int codepage, CodePageInfo *pCodePageInfo) { MIMECPINFO mcpi = {0}; HRESULT hr = m_pmlang->GetCodePageInfo(codepage, GetSystemDefaultLangID(), &mcpi); if (FAILED(hr)) return false; pCodePageInfo->fixedWidthFont = ucr::toTString(mcpi.wszFixedWidthFont); pCodePageInfo->bGDICharset = mcpi.bGDICharset; return true; }
bool getCodepageDescription(int codepage, String& sDescription) { wchar_t szDescription[256]; HRESULT hr = m_pmlang->GetCodePageDescription(codepage, GetSystemDefaultLangID(), szDescription, sizeof(szDescription)/sizeof(wchar_t)); if (FAILED(hr)) return false; sDescription = ucr::toTString(szDescription); return true; }
bool getCodepageFromCharsetName(const String& sCharsetName, int& codepage) { MIMECSETINFO charsetInfo; BSTR bstrCharsetName = SysAllocString(ucr::toUTF16(sCharsetName).c_str()); HRESULT hr = m_pmlang->GetCharsetInfo(bstrCharsetName, &charsetInfo); SysFreeString(bstrCharsetName); if (FAILED(hr)) return false; codepage = charsetInfo.uiInternetEncoding; return true; }
std::vector<CodePageInfo> enumCodePages() { std::vector<CodePageInfo> cpinfo; IEnumCodePage *pEnumCodePage = nullptr; ULONG ccpInfo; HRESULT hr = m_pmlang->EnumCodePages(MIMECONTF_SAVABLE_BROWSER | MIMECONTF_VALID | MIMECONTF_VALID_NLS, 0, &pEnumCodePage); if (FAILED(hr)) return cpinfo; std::unique_ptr<MIMECPINFO[]> pcpInfo(new MIMECPINFO[256]); if (FAILED(pEnumCodePage->Next(256, pcpInfo.get(), &ccpInfo))) return cpinfo; cpinfo.resize(ccpInfo); for (int i = 0; i < (int)ccpInfo; i++) { cpinfo[i].codepage = pcpInfo[i].uiCodePage; cpinfo[i].desc = ucr::toTString(pcpInfo[i].wszDescription); } return cpinfo; }
int detectInputCodepage(int autodetectType, int defcodepage, const char *data, size_t size) { int codepage; IMLangConvertCharset *pcc; UINT dstsize; UINT srcsize; HRESULT hr; hr = m_pmlang->CreateConvertCharset(autodetectType, ucr::CP_UCS2LE, MLCONVCHARF_AUTODETECT, &pcc); if (FAILED(hr)) return defcodepage; srcsize = static_cast<UINT>(size); dstsize = static_cast<UINT>(size * sizeof(wchar_t)); std::unique_ptr<unsigned char[]> pdst(new unsigned char[size * sizeof(wchar_t)]); SetLastError(0); hr = pcc->DoConversion((unsigned char *)data, &srcsize, pdst.get(), &dstsize); pcc->GetSourceCodePage((unsigned *)&codepage); if (FAILED(hr) || GetLastError() == ERROR_NO_UNICODE_TRANSLATION || codepage == autodetectType) { int codepagestotry[3] = {0}; if (codepage == autodetectType) { if (size < 2 || (data[0] != 0 && data[1] != 0)) { codepagestotry[0] = defcodepage; codepagestotry[1] = ucr::CP_UTF_8; } } else { if (size < 2 || (data[0] != 0 && data[1] != 0)) codepagestotry[0] = ucr::CP_UTF_8; } codepage = defcodepage; size_t i; for (i = 0; i < sizeof(codepagestotry)/sizeof(codepagestotry[0]) - 1; i++) { if (codepagestotry[i] == 0) break; pcc->Initialize(codepagestotry[i], ucr::CP_UCS2LE, 0); srcsize = static_cast<UINT>(size); dstsize = static_cast<UINT>(size * sizeof(wchar_t)); SetLastError(0); hr = pcc->DoConversion((unsigned char *)data, &srcsize, pdst.get(), &dstsize); if (FAILED(hr) || GetLastError() == ERROR_NO_UNICODE_TRANSLATION) continue; codepage = codepagestotry[i]; break; } if (codepagestotry[i] == 0 && (size % 2) == 0) { // UCS-2 int lezerocount = 0; int lecrorlf = 0; int bezerocount = 0; int becrorlf = 0; for (i = 0; i < size; i += 2) { if (data[i] == 0) { bezerocount++; if (data[i + 1] == 0x0a || data[i + 1] == 0x0d) becrorlf++; } else if (data[i + 1] == 0) { lezerocount++; if (data[i] == 0x0a || data[i] == 0x0d) lecrorlf++; } } if (lezerocount > 0 || bezerocount > 0) { if ((lecrorlf == 0 && size < 512 || (lecrorlf > 0 && (size / lecrorlf > 1024))) && lezerocount 
> bezerocount) codepage = ucr::CP_UCS2LE; else if ((becrorlf == 0 && size < 512 || (becrorlf > 0 && (size / becrorlf > 1024))) && lezerocount < bezerocount) codepage = ucr::CP_UCS2BE; } } } if (codepage == 20127) return defcodepage; return codepage; }
std::wstring load_text_file(const std::wstring &path, uint32_t codepage) { struct F { static void release(IUnknown *x) { x->Release(); } }; IStream *stream; HRESULT hr = SHCreateStreamOnFileW(path.c_str(), STGM_READ | STGM_SHARE_DENY_WRITE, &stream); if (FAILED(hr)) win32::throw_error(path, hr); std::shared_ptr<IStream> streamPtr(stream, F::release); LARGE_INTEGER li = { 0 }; ULARGE_INTEGER ui; HR(stream->Seek(li, STREAM_SEEK_END, &ui)); if (ui.QuadPart > 0x100000) { throw std::runtime_error(strutil::w2us(path + L": file too big")); } size_t fileSize = ui.LowPart; HR(stream->Seek(li, STREAM_SEEK_SET, &ui)); IMultiLanguage2 *mlang; HR(CoCreateInstance(CLSID_CMultiLanguage, 0, CLSCTX_INPROC_SERVER, IID_IMultiLanguage2, (void**)(&mlang))); std::shared_ptr<IMultiLanguage2> mlangPtr(mlang, F::release); if (!codepage) { DetectEncodingInfo encoding[5]; INT nscores = 5; HR(mlang->DetectCodepageInIStream(0, GetACP(), stream, encoding, &nscores)); /* * Usually DetectCodepageInIStream() puts the most appropriate choice * in the first place. * However, it tends to pick 8bit locale charset for the first place, * even if it is really an UTF-8 encoded file. */ codepage = encoding[0].nCodePage; for (size_t i = 0; i < nscores; ++i) if (encoding[i].nCodePage == 65001) { codepage = 65001; break; } HR(stream->Seek(li, STREAM_SEEK_SET, &ui)); } std::vector<char> ibuf(fileSize); ULONG nread; HR(stream->Read(&ibuf[0], ibuf.size(), &nread)); DWORD ctx = 0; UINT size = ibuf.size(), cnt; HR(mlang->ConvertStringToUnicode(&ctx, codepage, &ibuf[0], &size, 0, &cnt)); std::vector<wchar_t> obuf(cnt); size = ibuf.size(); HR(mlang->ConvertStringToUnicode(&ctx, codepage, &ibuf[0], &size, &obuf[0], &cnt)); obuf.push_back(0); // chop off BOM size_t bom = obuf.size() && obuf[0] == 0xfeff; return strutil::normalize_crlf(&obuf[bom], L"\n"); }