void CPatternsFileProcessor::readLine() { check_logic( file.good() ); ++lineNumber; getline( file, line, '\n' ); // support windows EOL style "\r\n" if( !line.empty() && line.back() == '\r' ) { line.erase( prev( line.end() ) ); // line.pop_back(); } const string::size_type invalidByteOffset = IsValidUtf8( line ); if( invalidByteOffset == string::npos ) { // not so effective... ReplaceTabsWithSpacesInSignleLine( line ); const string::size_type invalidCharOffset = IsValidText( line ); if( invalidCharOffset != string::npos ) { errorProcessor.AddError( CError( CLineSegment( invalidCharOffset ), CSharedFileLine( line, lineNumber ), "the file is not a text file", ES_CriticalError ) ); line.clear(); } } else { errorProcessor.AddError( CError( CLineSegment( invalidByteOffset ), CSharedFileLine( line, lineNumber ), "the file is not valid UTF-8 file", ES_CriticalError ) ); line.clear(); } }
// Accepts an HTML stream and tries to determine its encoding; // if no encoding is detected, the default codec for this locale is returned. // We use this function because Qt's QTextCodec::codecForHtml() function // leaves a *lot* to be desired. const QTextCodec& HTMLEncodingResolver::GetCodecForHTML( const QByteArray &raw_text ) { // Qt docs say Qt will take care of deleting // any QTextCodec objects on application exit QString ascii_data = raw_text; int head_end = ascii_data.indexOf( QRegExp( HEAD_END ) ); if ( head_end != -1 ) { QString head = Utility::Substring( 0, head_end, ascii_data ); QRegExp encoding( ENCODING_ATTRIBUTE ); head.indexOf( encoding ); QTextCodec *encoding_codec = QTextCodec::codecForName( encoding.cap( 1 ).toAscii() ); if ( encoding_codec != 0 ) return *encoding_codec; QRegExp charset( "charset=([^\"]+)\"" ); head.indexOf( charset ); QTextCodec *charset_codec = QTextCodec::codecForName( charset .cap( 1 ).toAscii() ); if ( charset_codec != 0 ) return *charset_codec; } // This is a workaround for a bug in QTextCodec which // expects the 'charset' attribute to always come after // the 'http-equiv' attribute ascii_data.replace( QRegExp( "<\\s*meta([^>]*)http-equiv=\"Content-Type\"([^>]*)>" ), "<meta http-equiv=\"Content-Type\" \\1 \\2>" ); // If we couldn't find a codec ourselves, // we use Qt's function. QTextCodec &locale_codec = *QTextCodec::codecForLocale(); QTextCodec &detected_codec = *QTextCodec::codecForHtml( ascii_data.toAscii(), QTextCodec::codecForLocale() ); if ( detected_codec.name() != locale_codec.name() ) return detected_codec; // If that couldn't find anything, then let's test for UTF-8 if ( IsValidUtf8( raw_text ) ) return *QTextCodec::codecForName( "UTF-8" ); // If everything fails, we fall back to the locale default return locale_codec; }
// Accepts a full path to an HTML file. // Reads the file, detects the encoding // and returns the text converted to Unicode. QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath) { QFile file(fullfilepath); // Check if we can open the file if (!file.open(QFile::ReadOnly)) { std::string msg = file.fileName().toStdString() + ": " + file.errorString().toStdString(); throw (CannotOpenFile(msg)); } QByteArray data = file.readAll(); if (IsValidUtf8(data)) { data.replace("\xC2\xA0", " "); } return Utility::ConvertLineEndings(GetCodecForHTML(data)->toUnicode(data)); }
// Accepts a full path to an HTML file. // Reads the file, detects the encoding // and returns the text converted to Unicode. QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath) { QFile file(fullfilepath); // Check if we can open the file if (!file.open(QFile::ReadOnly)) { boost_throw(CannotOpenFile() << errinfo_file_fullpath(file.fileName().toStdString()) << errinfo_file_errorstring(file.errorString().toStdString()) ); } QByteArray data = file.readAll(); if (IsValidUtf8(data)) { data.replace("\xC2\xA0", " "); } return Utility::ConvertLineEndings(GetCodecForHTML(data)->toUnicode(data)); }
// Accepts an HTML stream and tries to determine its encoding; // if no encoding is detected, the default codec for this locale is returned. // We use this function because Qt's QTextCodec::codecForHtml() function // leaves a *lot* to be desired. const QTextCodec *HTMLEncodingResolver::GetCodecForHTML(const QByteArray &raw_text) { unsigned char c1; unsigned char c2; unsigned char c3; unsigned char c4; QString text; QTextCodec *codec; if (raw_text.count() < 4) { return QTextCodec::codecForName("UTF-8"); } // Check the BOM if present. c1 = raw_text.at(0); c2 = raw_text.at(1); c3 = raw_text.at(2); c4 = raw_text.at(3); if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { return QTextCodec::codecForName("UTF-8"); } else if (c1 == 0xFF && c2 == 0xFE && c3 == 0 && c4 == 0) { return QTextCodec::codecForName("UTF-32LE"); } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) { return QTextCodec::codecForName("UTF-32BE"); } else if (c1 == 0xFE && c2 == 0xFF) { return QTextCodec::codecForName("UTF-16BE"); } else if (c1 == 0xFF && c2 == 0xFE) { return QTextCodec::codecForName("UTF-16LE"); } // Alternating char followed by 0 is typical of utf 16 le without BOM. if (c1 != 0 && c2 == 0 && c3 != 0 && c4 == 0) { return QTextCodec::codecForName("UTF-16LE"); } // Try to find an ecoding specified in the file itself. text = Utility::Substring(0, 1024, raw_text); // Check if the xml encoding attribute is set. QRegularExpression enc_re(ENCODING_ATTRIBUTE); QRegularExpressionMatch enc_mo = enc_re.match(text); if (enc_mo.hasMatch()) { codec = QTextCodec::codecForName(enc_mo.captured(1).toLatin1().toUpper()); if (codec) { return codec; } } // Check if the charset is set in the head. QRegularExpression char_re(CHARSET_ATTRIBUTE); QRegularExpressionMatch char_mo = char_re.match(text); if (char_mo.hasMatch()) { codec = QTextCodec::codecForName(char_mo.captured(1).toLatin1().toUpper()); if (codec) { return codec; } } // See if all characters within this document are utf-8. if (IsValidUtf8(raw_text)) { return QTextCodec::codecForName("UTF-8"); } // Finally, let Qt guess and if it doesn't know it will return the codec // for the current locale. text = raw_text; return QTextCodec::codecForHtml(raw_text, QTextCodec::codecForLocale()); }