Example #1
0
/**
 * Overwrites NUL characters in @p data with a space, in place.
 *
 * When the current codec is a 16-bit one (as reported by is16Bit()), the
 * buffer is walked in two-byte steps and only a pair of bytes that are both
 * zero is treated as a NUL; the second byte of that pair is overwritten.
 * Otherwise every zero byte in the buffer is overwritten.
 *
 * @param data  buffer to scan and modify
 * @param len   number of bytes in @p data
 * @return true if at least one byte was replaced, false otherwise
 */
bool KEncodingDetector::processNull(char *data, int len)
{
    bool bin=false;
    if(is16Bit(d->m_codec))
    {
        // i always points at the second byte of a two-byte unit
        for (int i=1; i < len; i+=2)
        {
            if ((data[i]=='\0') && (data[i-1]=='\0'))
            {
                bin=true;
                data[i]=' ';
            }
        }
        return bin;
    }
    // replace '\0' by spaces, for buggy pages
    // Start at the very last byte (len-1): the previous pre-decrementing loop
    // began at len-2 and therefore never looked at the final byte.
    for (int i = len-1; i >= 0; --i)
    {
        if(data[i]==0)
        {
            bin=true;
            data[i]=' ';
        }
    }
    return bin;
}
Example #2
0
/**
 * Selects the codec named by @p _encoding and records where the choice came from.
 *
 * An empty name is only accepted together with DefaultEncoding, in which case
 * the detector's default codec is used. The name "visual" is treated as
 * "iso8859-8". A 16-bit codec requested from a meta tag or an XML header is
 * rejected. If the resolved codec has the same MIB as the current one, only
 * the source is updated and the decoder is left alone.
 *
 * @param _encoding  encoding name; compared case-insensitively
 * @param type       origin of this encoding choice
 * @return true if the encoding was accepted (or was already active),
 *         false if it was rejected or could not be resolved
 */
bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type)
{
    QByteArray enc(_encoding);
    QTextCodec *codec;

    if (!enc.isEmpty())
    {
        // Resolve the (lower-cased) name to a codec.
        enc = enc.toLower();

        // "visual" means visually ordered Hebrew
        if (enc == "visual")
            enc = "iso8859-8";

        bool found;
        codec = KGlobal::charsets()->codecForName(QLatin1String(enc), found);
        if (!found)
            return false;
    }
    else if (type == DefaultEncoding)
    {
        // No name given: fall back to the default codec.
        codec = d->m_defaultCodec;
    }
    else
    {
        return false;
    }

    if (d->m_codec->mibEnum() == codec->mibEnum())
    {
        // Same codec as before. The source is still refreshed, because a
        // default may have been replaced by a detected encoding.
        d->m_source = type;
        return true;
    }

    // A meta tag or XML header that was readable as ASCII cannot sensibly
    // announce a 16-bit encoding such as UTF-16, so such a request is ignored.
    const bool declaredInDocument = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader);
    if (declaredInDocument && is16Bit(codec))
        return false;

    if (codec->mibEnum() == Mib8859_8)
    {
        // Qt's QHebrewCodec tries to reorder the text itself, which is NOT wanted here.
        codec = QTextCodec::codecForName("iso8859-8-i");

        // Everything except these names is considered visually ordered.
        const bool logicalOrder = enc == "iso-8859-8-i"
                               || enc == "iso_8859-8-i"
                               || enc == "csiso88598i"
                               || enc == "logical";
        if (!logicalOrder)
            d->m_visualRTL = true;
    }

    // Install the new codec and replace the decoder with one created from it.
    d->m_codec = codec;
    d->m_source = type;
    delete d->m_decoder;
    d->m_decoder = d->m_codec->makeDecoder();
#ifdef DECODE_DEBUG
    kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name();
#endif
    return true;
}
Example #3
0
/**
 * Inspects @p data and, where possible, switches the detector to the
 * encoding it finds.
 *
 * The checks run in this order:
 *  - With at least 10 bytes, and unless the encoding was chosen by the user
 *    (a user-chosen 16-bit codec is still examined): a UTF-16 / UTF-8 byte
 *    order mark, or a strict alternation of zero and non-zero bytes in the
 *    first ten bytes, which is taken as UTF-16.
 *  - A user-chosen encoding is kept; if errorsIfUtf8() returns true the
 *    default encoding is selected via setEncoding("", DefaultEncoding).
 *  - An encoding that came from the HTTP header is kept.
 *  - As long as no body tag has been seen: the XML declaration and
 *    meta tags carrying "charset=" are scanned.
 *  - With at least 20 bytes: the heuristic selected by m_autoDetectLanguage.
 *
 * All scanning done directly in this function is bounded by @p len; no byte
 * at or beyond data + len is dereferenced here (skipComment() and the
 * DECODE_DEBUG output are outside that guarantee).
 *
 * @param data  bytes to analyze
 * @param len   number of bytes in @p data
 * @return false if fewer than 20 bytes were given and nothing was detected
 *         earlier; the result of setEncoding() for most heuristic languages;
 *         true in all other cases
 */
bool KEncodingDetector::analyze(const char *data, int len)
{
    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    // maximumBOMLength = 10
    // Even if the user has chosen utf16 we still need to auto-detect the endianness
    if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) {
        // Extract the first three bytes.
        const uchar *udata = (const uchar *)data;
        uchar c1 = *udata++;
        uchar c2 = *udata++;
        uchar c3 = *udata++;

        // Check for the BOM
        const char *autoDetectedEncoding;
        if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
            autoDetectedEncoding = "UTF-16";
        } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
            autoDetectedEncoding = "UTF-8";
        } else if (c1 == 0x00 || c2 == 0x00) {
            // No BOM, but a zero among the first two bytes: look at the first
            // ten bytes (len >= 10 is guaranteed by the enclosing condition).
            uchar c4 = *udata++;
            uchar c5 = *udata++;
            uchar c6 = *udata++;
            uchar c7 = *udata++;
            uchar c8 = *udata++;
            uchar c9 = *udata++;
            uchar c10 = *udata++;

            // Despite their names, these count the NON-zero bytes at even / odd positions.
            int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);
            int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);
            // Accept only a strict pattern: all five bytes zero on one side, all five non-zero on the other.
            if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) {
                autoDetectedEncoding = "UTF-16";
            } else {
                autoDetectedEncoding = 0;
            }
        } else {
            autoDetectedEncoding = 0;
        }

        // If we found a BOM, use the encoding it implies.
        if (autoDetectedEncoding != 0) {
            d->m_source = BOM;
            d->m_codec = QTextCodec::codecForName(autoDetectedEncoding);
            assert(d->m_codec);
            //enc = d->m_codec->name();
            delete d->m_decoder;
            d->m_decoder = d->m_codec->makeDecoder();
#ifdef DECODE_DEBUG
            qWarning() << "Detection by BOM";
#endif
            if (is16Bit(d->m_codec) && c2 == 0x00) {
                // utf16LE, we need to put the decoder in LE mode
                // (done by feeding it a little-endian BOM)
                char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00};
                d->m_decoder->toUnicode(reverseUtf16, 2);
            }
            return true;
        }
    }

    //exit from routine in case it was called to only detect byte order for utf-16
    if (d->m_source == UserChosenEncoding) {
#ifdef DECODE_DEBUG
        qWarning() << "KEncodingDetector: UserChosenEncoding exit ";
#endif

        if (errorsIfUtf8(data, len)) {
            setEncoding("", DefaultEncoding);
        }
        return true;
    }

    // HTTP header takes precedence over meta-type stuff
    if (d->m_source == EncodingFromHTTPHeader) {
        return true;
    }

    if (!d->m_seenBody) {
        // we still don't have an encoding, and are in the head
        // the following tags are allowed in <head>:
        // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
        const char *ptr = data;
        const char *pEnd = data + len;

        while (ptr != pEnd) {
            if (*ptr != '<') {
                ++ptr;
                continue;
            }
            ++ptr;
            // Handle comments.
            // The length check comes first so that nothing past pEnd is read.
            if (pEnd - ptr >= 3 && ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                ptr += 3;
                skipComment(ptr, pEnd);
                continue;
            }

            // Handle XML header, which can have encoding in it.
            if (pEnd - ptr >= 4 && ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
                const char *end = ptr;
                // bounds check before dereferencing
                while (end < pEnd && *end != '>') {
                    end++;
                }
                // Declaration not closed within this buffer: stop scanning the head.
                if (end == pEnd || *end == '\0') {
                    break;
                }
                QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator
                int length;
                int pos = findXMLEncoding(str, length);
                // also handles the case when specified encoding aint correct
                if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) {
                    return true;
                }
            }

            //look for <meta>, stop if we reach <body>
            // Skip ahead to the first ASCII letter (bounds check first).
            while (
                ptr < pEnd
                && !(((*ptr >= 'a') && (*ptr <= 'z')) ||
                     ((*ptr >= 'A') && (*ptr <= 'Z')))
            ) {
                ++ptr;
            }

            // Collect at most four lower-cased alphanumeric characters of the tag name.
            char tmp[5];
            int length = 0;
            const char *max = (pEnd - ptr > 4) ? ptr + 4 : pEnd;
            while (
                ptr < max
                && (((*ptr >= 'a') && (*ptr <= 'z')) ||
                    ((*ptr >= 'A') && (*ptr <= 'Z')) ||
                    ((*ptr >= '0') && (*ptr <= '9')))
            ) {
                tmp[length] = tolower(*ptr);
                ++ptr;
                ++length;
            }
            tmp[length] = 0;
            if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') {
                // found a meta tag...
                const char *end = ptr;
                while (end < pEnd && *end != '>' && *end != '\0') {
                    end++;
                }
                //if ( *end == '\0' ) break;
                // Include the terminating '>' (or '\0') only when it actually
                // lies inside the buffer; never copy the byte at pEnd.
                const int tagLength = static_cast<int>(end - ptr) + (end < pEnd ? 1 : 0);
                const QByteArray str = QByteArray(ptr, tagLength).toLower();
                const int strLength = str.length();
                int pos = 0;
                //if( (pos = str.find("http-equiv", pos)) == -1) break;
                //if( (pos = str.find("content-type", pos)) == -1) break;
                if ((pos = str.indexOf("charset")) == -1) {
                    continue;
                }
                pos += 6;
                // skip to '='
                if ((pos = str.indexOf("=", pos)) == -1) {
                    continue;
                }

                // skip '='
                ++pos;

                // skip whitespace before encoding itself
                while (pos < strLength && str[pos] <= ' ') {
                    ++pos;
                }

                // there may also be an opening quote, if this is a charset= and not a http-equiv.
                if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) {
                    ++pos;
                }

                // skip whitespace
                while (pos < strLength && str[pos] <= ' ') {
                    ++pos;
                }

                if (pos == strLength) {
                    continue;
                }

                // The encoding name runs up to the next space, quote, ';' or '>'.
                int endpos = pos;
                while (endpos < strLength &&
                        (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
                         && str[endpos] != ';' && str[endpos] != '>')) {
                    ++endpos;
                }
#ifdef DECODE_DEBUG
                qDebug() << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data();
#endif
                if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) {
                    return true;
                }
            } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') {
                d->m_seenBody = true;
                break;
            }
        }
    }

    // Too little data for the heuristics below.
    if (len < 20) {
        return false;
    }

#ifdef DECODE_DEBUG
    qDebug() << "KEncodingDetector: using heuristics (" << strlen(data) << ")";
#endif

    switch (d->m_autoDetectLanguage) {
    case KEncodingDetector::Arabic:
        return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::Baltic:
        return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::CentralEuropean:
        return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//            break;
    case KEncodingDetector::Cyrillic:
        return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::Greek:
        return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::Hebrew:
        return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::Japanese:
        return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::Turkish:
        return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding);
//             break;
    case KEncodingDetector::WesternEuropean:
        if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) {
            return true;
        } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml
            return setEncoding("iso-8859-15", AutoDetectedEncoding);
        } else { //use default provided by eg katepart
            return setEncoding("", DefaultEncoding);
        }
//             break;
    case KEncodingDetector::SemiautomaticDetection:
    case KEncodingDetector::ChineseSimplified:
    case KEncodingDetector::ChineseTraditional:
    case KEncodingDetector::Korean:
    case KEncodingDetector::Thai:
    case KEncodingDetector::Unicode:
    case KEncodingDetector::NorthernSaami:
    case KEncodingDetector::SouthEasternEurope:
    case KEncodingDetector::None:
        // huh. somethings broken in this code ### FIXME
        //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.
        break;
    }

    return true;
}
/**
 * Creates one channel per attribute element found below @p elem and adds it
 * to @p fixtureDef and to m_channels (keyed by the attribute's ID).
 *
 * Elements without an ID and elements for which isFunction() returns true are
 * skipped. Channels whose group is NoGroup are added without capabilities.
 * For elements that is16Bit() reports as 16 bit, a second channel named
 * "<name> Fine" with the LSB control byte is added as well.
 *
 * @param elem        parent element whose attribute children are read
 * @param fixtureDef  fixture definition receiving the channels
 * @return false as soon as parseCapabilities() fails for a channel, true otherwise
 */
bool AvolitesD4Parser::parseChannels(const QDomElement& elem, QLCFixtureDef* fixtureDef)
{
    QDomElement el = elem.firstChildElement(KD4TagAttribute);
    for (; !el.isNull(); el = el.nextSiblingElement(KD4TagAttribute))
    {
        // Small integrity check
        if (el.attribute(KD4TagID).isEmpty())
            continue;

        // If this attribute is a function (i.e. an attribute used as a control variable for other attributes)
        // then we just ignore it and continue. We can check it by checking if attribute Update on a <Function/> exists
        if (isFunction(el))
            continue;

        QLCChannel* chan = new QLCChannel();
        chan->setName(el.attribute(KD4TagName));
        chan->setGroup(getGroupFromXML(el));
        chan->setColour(getColourFromXML(el));
        chan->setControlByte(QLCChannel::MSB);

        // A NoGroup channel has neither capabilities nor a 16 bit companion.
        const bool hasGroup = (chan->group() != QLCChannel::NoGroup);

        // Parse capabilities BEFORE handing the channel to fixtureDef: the
        // channel is deleted on failure, and deleting it after addChannel()
        // would leave fixtureDef holding a pointer to a destroyed object.
        // NOTE(review): assumes addChannel() keeps the pointer and that
        // parseCapabilities() only needs the channel itself — confirm.
        if (hasGroup && !parseCapabilities(el, chan))
        {
            delete chan;
            return false;
        }

        // add channel to fixture definition
        fixtureDef->addChannel(chan);
        m_channels.insert(el.attribute(KD4TagID), chan);

        // if this channel is a NoGroup then we don't need to continue
        if (!hasGroup)
            continue;

        // If we have a DMX attribute higher than 255 means we have an attribute with a 16bit precision
        // so, we add another channel, with 'Fine' appended to its name and set the LSB controlbyte

        // NOTE: this can be changed in the future, pending the revamp over adding 16bit capabilities to any channel
        // not only pan/tilt, therefore I didn't add a constant for Fine and kept it as it is.
        if (is16Bit(el))
        {
            QLCChannel* fchan = new QLCChannel();
            fchan->setName(el.attribute(KD4TagName) + " Fine");
            fchan->setGroup(getGroupFromXML(el));
            fchan->setColour(getColourFromXML(el));
            fchan->setControlByte(QLCChannel::LSB);

            // parse capabilities
            if (!parseCapabilities(el, fchan, true))
            {
                // fchan has not been registered anywhere yet, so it can simply be deleted
                delete fchan;
                return false;
            }

            // Finally add channel to fixture definition
            fixtureDef->addChannel(fchan);
            m_channels.insert(el.attribute(KD4TagID) + " Fine", fchan);
        }
    }

    return true;
}