Esempio n. 1
0
void CPatternsFileProcessor::readLine()
{
	check_logic( file.good() );

	++lineNumber;
	getline( file, line, '\n' );

	// support windows EOL style "\r\n"
	if( !line.empty() && line.back() == '\r' ) {
		line.erase( prev( line.end() ) ); // line.pop_back();
	}

	const string::size_type invalidByteOffset = IsValidUtf8( line );
	if( invalidByteOffset == string::npos ) {
		// not so effective...
		ReplaceTabsWithSpacesInSignleLine( line );

		const string::size_type invalidCharOffset = IsValidText( line );
		if( invalidCharOffset != string::npos ) {
			errorProcessor.AddError( CError(
				CLineSegment( invalidCharOffset ),
				CSharedFileLine( line, lineNumber ),
				"the file is not a text file", ES_CriticalError ) );
			line.clear();
		}
	} else {
		errorProcessor.AddError( CError(
			CLineSegment( invalidByteOffset ),
			CSharedFileLine( line, lineNumber ),
			"the file is not valid UTF-8 file", ES_CriticalError ) );
		line.clear();
	}
}
Esempio n. 2
0
// Determines which text codec should be used to decode an HTML byte stream.
//
// Lookup order:
//   1. the ENCODING_ATTRIBUTE pattern inside the document head,
//   2. a charset="..." style declaration inside the document head,
//   3. Qt's own QTextCodec::codecForHtml() detection,
//   4. UTF-8, when the raw bytes pass IsValidUtf8(),
//   5. the codec of the current locale.
//
// This exists because QTextCodec::codecForHtml() on its own
// leaves a *lot* to be desired.
const QTextCodec& HTMLEncodingResolver::GetCodecForHTML( const QByteArray &raw_text )
{
    // According to the Qt docs, Qt itself deletes all
    // QTextCodec objects when the application exits.

    QString html_text = raw_text;

    const int head_close_pos = html_text.indexOf( QRegExp( HEAD_END ) );

    if ( head_close_pos != -1 )
    {
        QString head_section = Utility::Substring( 0, head_close_pos, html_text );

        // The capture is read regardless of whether the search matched;
        // an unusable name is expected to simply yield no codec.
        QRegExp encoding_rx( ENCODING_ATTRIBUTE );
        head_section.indexOf( encoding_rx );
        const QByteArray encoding_name = encoding_rx.cap( 1 ).toAscii();
        QTextCodec *codec_from_encoding = QTextCodec::codecForName( encoding_name );

        if ( codec_from_encoding )
        {
            return *codec_from_encoding;
        }

        QRegExp charset_rx( "charset=([^\"]+)\"" );
        head_section.indexOf( charset_rx );
        const QByteArray charset_name = charset_rx.cap( 1 ).toAscii();
        QTextCodec *codec_from_charset = QTextCodec::codecForName( charset_name );

        if ( codec_from_charset )
        {
            return *codec_from_charset;
        }
    }

    // Workaround for a QTextCodec bug: it expects the 'charset'
    // attribute to always follow the 'http-equiv' attribute,
    // so the meta tag is rewritten with 'http-equiv' first.
    const QRegExp content_type_meta( "<\\s*meta([^>]*)http-equiv=\"Content-Type\"([^>]*)>" );
    html_text.replace( content_type_meta, "<meta http-equiv=\"Content-Type\" \\1 \\2>" );

    // We found no codec ourselves, so ask Qt,
    // with the locale codec as its fallback answer.
    QTextCodec &fallback_codec = *QTextCodec::codecForLocale();
    QTextCodec &qt_codec       = *QTextCodec::codecForHtml( html_text.toAscii(), QTextCodec::codecForLocale() );

    // A name different from the locale codec's means Qt detected something.
    const bool qt_detected_codec = qt_codec.name() != fallback_codec.name();

    if ( qt_detected_codec )
    {
        return qt_codec;
    }

    // Qt found nothing either; check whether the bytes are valid UTF-8.
    if ( IsValidUtf8( raw_text ) )
    {
        return *QTextCodec::codecForName( "UTF-8" );
    }

    // Everything failed: use the locale default.
    return fallback_codec;
}
Esempio n. 3
0
// Loads the HTML file at `fullfilepath`, detects its encoding via
// GetCodecForHTML() and returns the content decoded to Unicode with
// line endings passed through Utility::ConvertLineEndings().
// Throws CannotOpenFile ("<file name>: <error string>") when the file
// cannot be opened for reading.
QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath)
{
    QFile html_file(fullfilepath);

    // Bail out early when the file cannot be opened.
    const bool opened = html_file.open(QFile::ReadOnly);
    if (!opened) {
        std::string description = html_file.fileName().toStdString() + ": " + html_file.errorString().toStdString();
        throw CannotOpenFile(description);
    }

    QByteArray raw_bytes = html_file.readAll();

    // 0xC2 0xA0 is the UTF-8 byte sequence of U+00A0 (no-break space);
    // it is swapped for a numeric character reference before decoding.
    const bool looks_like_utf8 = IsValidUtf8(raw_bytes);
    if (looks_like_utf8) {
        raw_bytes.replace("\xC2\xA0", "&#160;");
    }

    const QTextCodec *codec = GetCodecForHTML(raw_bytes);
    const QString decoded_text = codec->toUnicode(raw_bytes);
    return Utility::ConvertLineEndings(decoded_text);
}
Esempio n. 4
0
// Loads the HTML file at `fullfilepath`, detects its encoding via
// GetCodecForHTML() and returns the content decoded to Unicode with
// line endings passed through Utility::ConvertLineEndings().
// Raises CannotOpenFile (carrying the file path and the error string)
// through boost_throw when the file cannot be opened for reading.
QString HTMLEncodingResolver::ReadHTMLFile(const QString &fullfilepath)
{
    QFile html_file(fullfilepath);

    // Bail out early when the file cannot be opened.
    const bool opened = html_file.open(QFile::ReadOnly);
    if (!opened) {
        boost_throw(CannotOpenFile()
                    << errinfo_file_fullpath(html_file.fileName().toStdString())
                    << errinfo_file_errorstring(html_file.errorString().toStdString())
                   );
    }

    QByteArray raw_bytes = html_file.readAll();

    // 0xC2 0xA0 is the UTF-8 byte sequence of U+00A0 (no-break space);
    // it is swapped for the named entity before decoding.
    const bool looks_like_utf8 = IsValidUtf8(raw_bytes);
    if (looks_like_utf8) {
        raw_bytes.replace("\xC2\xA0", "&nbsp;");
    }

    const QTextCodec *codec = GetCodecForHTML(raw_bytes);
    const QString decoded_text = codec->toUnicode(raw_bytes);
    return Utility::ConvertLineEndings(decoded_text);
}
Esempio n. 5
0
// Accepts an HTML stream and tries to determine its encoding;
// if no encoding is detected, the default codec for this locale is returned.
// We use this function because Qt's QTextCodec::codecForHtml() function
// leaves a *lot* to be desired.
//
// Detection order:
//   1. fewer than 4 bytes             -> UTF-8
//   2. byte order mark                -> UTF-8 / UTF-32LE / UTF-32BE / UTF-16BE / UTF-16LE
//   3. "char, 0, char, 0" byte prefix -> UTF-16LE (no BOM)
//   4. ENCODING_ATTRIBUTE match in the leading part of the text
//   5. CHARSET_ATTRIBUTE match in the leading part of the text
//   6. whole stream passes IsValidUtf8() -> UTF-8
//   7. QTextCodec::codecForHtml() with the locale codec as fallback
//
// The result of QTextCodec::codecForName() is returned unchecked in
// steps 1-3 and 6.
const QTextCodec *HTMLEncodingResolver::GetCodecForHTML(const QByteArray &raw_text)
{
    // The BOM checks below read four bytes, so shorter input is handled first.
    if (raw_text.count() < 4) {
        return QTextCodec::codecForName("UTF-8");
    }

    // Check the BOM if present.
    const unsigned char c1 = raw_text.at(0);
    const unsigned char c2 = raw_text.at(1);
    const unsigned char c3 = raw_text.at(2);
    const unsigned char c4 = raw_text.at(3);

    // The UTF-32LE BOM (FF FE 00 00) starts with the UTF-16LE BOM (FF FE),
    // so the UTF-32 tests must come before the UTF-16 ones.
    if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
        return QTextCodec::codecForName("UTF-8");
    } else if (c1 == 0xFF && c2 == 0xFE && c3 == 0 && c4 == 0) {
        return QTextCodec::codecForName("UTF-32LE");
    } else if (c1 == 0 && c2 == 0 && c3 == 0xFE && c4 == 0xFF) {
        return QTextCodec::codecForName("UTF-32BE");
    } else if (c1 == 0xFE && c2 == 0xFF) {
        return QTextCodec::codecForName("UTF-16BE");
    } else if (c1 == 0xFF && c2 == 0xFE) {
        return QTextCodec::codecForName("UTF-16LE");
    }

    // Alternating char followed by 0 is typical of utf 16 le without BOM.
    if (c1 != 0 && c2 == 0 && c3 != 0 && c4 == 0) {
        return QTextCodec::codecForName("UTF-16LE");
    }

    // Try to find an encoding specified in the file itself;
    // only the leading part of the document is searched.
    const QString text = Utility::Substring(0, 1024, raw_text);

    // Check if the xml encoding attribute is set.
    QRegularExpression enc_re(ENCODING_ATTRIBUTE);
    QRegularExpressionMatch enc_mo = enc_re.match(text);
    if (enc_mo.hasMatch()) {
        QTextCodec *codec = QTextCodec::codecForName(enc_mo.captured(1).toLatin1().toUpper());
        if (codec) {
            return codec;
        }
    }

    // Check if the charset is set in the head.
    QRegularExpression char_re(CHARSET_ATTRIBUTE);
    QRegularExpressionMatch char_mo = char_re.match(text);
    if (char_mo.hasMatch()) {
        QTextCodec *codec = QTextCodec::codecForName(char_mo.captured(1).toLatin1().toUpper());
        if (codec) {
            return codec;
        }
    }

    // See if all characters within this document are utf-8.
    if (IsValidUtf8(raw_text)) {
        return QTextCodec::codecForName("UTF-8");
    }

    // Finally, let Qt guess and if it doesn't know it will return the codec
    // for the current locale.
    return QTextCodec::codecForHtml(raw_text, QTextCodec::codecForLocale());
}