// If the file contains a BOM this function will advance the file pointer by the BOM size (either 2 or 3) static bool GetFileCodepage(const os::fs::file& File, uintptr_t DefaultCodepage, uintptr_t& Codepage, bool& SignatureFound, bool& NotUTF8, bool& NotUTF16, bool UseHeuristics) { if (GetUnicodeCpUsingBOM(File, Codepage)) { SignatureFound = true; return true; } if (!UseHeuristics) return false; // TODO: configurable const size_t Size = 32768; char_ptr Buffer(Size); size_t ReadSize = 0; const auto ReadResult = File.Read(Buffer.get(), Size, ReadSize); File.SetPointer(0, nullptr, FILE_BEGIN); if (!ReadResult || !ReadSize) return false; if (GetUnicodeCpUsingWindows(Buffer.get(), ReadSize, Codepage)) return true; NotUTF16 = true; unsigned long long FileSize = 0; const auto WholeFileRead = File.GetSize(FileSize) && ReadSize == FileSize; bool PureAscii = false; if (encoding::is_valid_utf8({ Buffer.get(), ReadSize }, !WholeFileRead, PureAscii)) { if (!PureAscii) Codepage = CP_UTF8; else if (DefaultCodepage == CP_UTF8 || DefaultCodepage == encoding::codepage::ansi() || DefaultCodepage == encoding::codepage::oem()) Codepage = DefaultCodepage; else Codepage = encoding::codepage::ansi(); return true; } NotUTF8 = true; return GetCpUsingUniversalDetectorWithExceptions({ Buffer.get(), ReadSize }, Codepage); }
// If the file contains a BOM this function will advance the file pointer by the BOM size (either 2 or 3) static bool GetUnicodeCpUsingBOM(const os::fs::file& File, uintptr_t& Codepage) { char Buffer[3]{}; size_t BytesRead = 0; if (!File.Read(Buffer, std::size(Buffer), BytesRead)) return false; std::string_view Signature(Buffer, std::size(Buffer)); if (BytesRead >= 2) { if (Signature.substr(0, 2) == encoding::get_signature_bytes(CP_UNICODE)) { Codepage = CP_UNICODE; File.SetPointer(2, nullptr, FILE_BEGIN); return true; } if (Signature.substr(0, 2) == encoding::get_signature_bytes(CP_REVERSEBOM)) { Codepage = CP_REVERSEBOM; File.SetPointer(2, nullptr, FILE_BEGIN); return true; } } if (BytesRead >= 3 && Signature == encoding::get_signature_bytes(CP_UTF8)) { Codepage = CP_UTF8; File.SetPointer(3, nullptr, FILE_BEGIN); return true; } File.SetPointer(0, nullptr, FILE_BEGIN); return false; }