/**
 * Replaces NUL bytes in @p data with spaces, in place, so that buggy pages
 * containing stray '\0' characters can still be handed to the decoder.
 *
 * When is16Bit() reports the current codec as 16-bit, the buffer is walked in
 * two-byte steps and only a pair of zero bytes (offsets i-1 and i, i odd) is
 * treated as a NUL; the second byte of the pair is overwritten with a space.
 * Otherwise every single zero byte is overwritten.
 *
 * @param data  buffer to sanitise; modified in place
 * @param len   number of bytes available in @p data
 * @return true if at least one NUL was found and replaced, false otherwise
 */
bool KEncodingDetector::processNull(char *data, int len)
{
    bool bin = false;

    if (is16Bit(d->m_codec)) {
        for (int i = 1; i < len; i += 2) {
            if ((data[i] == '\0') && (data[i - 1] == '\0')) {
                bin = true;
                data[i] = ' ';
            }
        }
        return bin;
    }

    // replace '\0' by spaces, for buggy pages.
    // The scan covers the whole range [0, len): the previous
    // "i = len - 1; while (--i >= 0)" form started at len - 2 and therefore
    // never looked at the final byte of the buffer.
    for (int i = len - 1; i >= 0; --i) {
        if (data[i] == '\0') {
            bin = true;
            data[i] = ' ';
        }
    }
    return bin;
}
bool KEncodingDetector::setEncoding(const char *_encoding, EncodingChoiceSource type) { QTextCodec *codec; QByteArray enc(_encoding); if(/*enc.isNull() || */enc.isEmpty()) { if (type==DefaultEncoding) codec=d->m_defaultCodec; else return false; } else { //QString->QTextCodec enc = enc.toLower(); // hebrew visually ordered if(enc=="visual") enc="iso8859-8"; bool b; codec = KGlobal::charsets()->codecForName(QLatin1String(enc), b); if (!b) return false; } if (d->m_codec->mibEnum()==codec->mibEnum()) { // We already have the codec, but we still want to re-set the type, // as we may have overwritten a default with a detected d->m_source = type; return true; } if ((type==EncodingFromMetaTag || type==EncodingFromXMLHeader) && is16Bit(codec)) { //Sometimes the codec specified is absurd, i.e. UTF-16 despite //us decoding a meta tag as ASCII. In that case, ignore it. return false; } if (codec->mibEnum() == Mib8859_8) { //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. codec = QTextCodec::codecForName("iso8859-8-i"); // visually ordered unless one of the following if(!(enc=="iso-8859-8-i"||enc=="iso_8859-8-i"||enc=="csiso88598i"||enc=="logical")) d->m_visualRTL = true; } d->m_codec = codec; d->m_source = type; delete d->m_decoder; d->m_decoder = d->m_codec->makeDecoder(); #ifdef DECODE_DEBUG kDebug(6005) << "KEncodingDetector::encoding used is" << d->m_codec->name(); #endif return true; }
bool KEncodingDetector::analyze(const char *data, int len) { // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. // maximumBOMLength = 10 // Even if the user has chosen utf16 we still need to auto-detect the endianness if (len >= 10 && ((d->m_source != UserChosenEncoding) || is16Bit(d->m_codec))) { // Extract the first three bytes. const uchar *udata = (const uchar *)data; uchar c1 = *udata++; uchar c2 = *udata++; uchar c3 = *udata++; // Check for the BOM const char *autoDetectedEncoding; if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { autoDetectedEncoding = "UTF-16"; } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { autoDetectedEncoding = "UTF-8"; } else if (c1 == 0x00 || c2 == 0x00) { uchar c4 = *udata++; uchar c5 = *udata++; uchar c6 = *udata++; uchar c7 = *udata++; uchar c8 = *udata++; uchar c9 = *udata++; uchar c10 = *udata++; int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) { autoDetectedEncoding = "UTF-16"; } else { autoDetectedEncoding = 0; } } else { autoDetectedEncoding = 0; } // If we found a BOM, use the encoding it implies. 
if (autoDetectedEncoding != 0) { d->m_source = BOM; d->m_codec = QTextCodec::codecForName(autoDetectedEncoding); assert(d->m_codec); //enc = d->m_codec->name(); delete d->m_decoder; d->m_decoder = d->m_codec->makeDecoder(); #ifdef DECODE_DEBUG qWarning() << "Detection by BOM"; #endif if (is16Bit(d->m_codec) && c2 == 0x00) { // utf16LE, we need to put the decoder in LE mode char reverseUtf16[3] = {(char)0xFF, (char)0xFE, 0x00}; d->m_decoder->toUnicode(reverseUtf16, 2); } return true; } } //exit from routine in case it was called to only detect byte order for utf-16 if (d->m_source == UserChosenEncoding) { #ifdef DECODE_DEBUG qWarning() << "KEncodingDetector: UserChosenEncoding exit "; #endif if (errorsIfUtf8(data, len)) { setEncoding("", DefaultEncoding); } return true; } // HTTP header takes precedence over meta-type stuff if (d->m_source == EncodingFromHTTPHeader) { return true; } if (!d->m_seenBody) { // we still don't have an encoding, and are in the head // the following tags are allowed in <head>: // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE const char *ptr = data; const char *pEnd = data + len; while (ptr != pEnd) { if (*ptr != '<') { ++ptr; continue; } ++ptr; // Handle comments. if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { ptr += 3; skipComment(ptr, pEnd); continue; } // Handle XML header, which can have encoding in it. if (ptr[0] == '?' 
&& ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { const char *end = ptr; while (*end != '>' && end < pEnd) { end++; } if (*end == '\0' || end == pEnd) { break; } QByteArray str(ptr, end - ptr); // qbytearray provides the \0 terminator int length; int pos = findXMLEncoding(str, length); // also handles the case when specified encoding aint correct if (pos != -1 && setEncoding(str.mid(pos, length).data(), EncodingFromXMLHeader)) { return true; } } //look for <meta>, stop if we reach <body> while ( !(((*ptr >= 'a') && (*ptr <= 'z')) || ((*ptr >= 'A') && (*ptr <= 'Z'))) && ptr < pEnd ) { ++ptr; } char tmp[5]; int length = 0; const char *max = ptr + 4; if (pEnd < max) { max = pEnd; } while ( (((*ptr >= 'a') && (*ptr <= 'z')) || ((*ptr >= 'A') && (*ptr <= 'Z')) || ((*ptr >= '0') && (*ptr <= '9'))) && ptr < max ) { tmp[length] = tolower(*ptr); ++ptr; ++length; } tmp[length] = 0; if (tmp[0] == 'm' && tmp[1] == 'e' && tmp[2] == 't' && tmp[3] == 'a') { // found a meta tag... const char *end = ptr; while (*end != '>' && *end != '\0' && end < pEnd) { end++; } //if ( *end == '\0' ) break; const QByteArray str = QByteArray(ptr, (end - ptr) + 1).toLower(); const int strLength = str.length(); int pos = 0; //if( (pos = str.find("http-equiv", pos)) == -1) break; //if( (pos = str.find("content-type", pos)) == -1) break; if ((pos = str.indexOf("charset")) == -1) { continue; } pos += 6; // skip to '=' if ((pos = str.indexOf("=", pos)) == -1) { continue; } // skip '=' ++pos; // skip whitespace before encoding itself while (pos < strLength && str[pos] <= ' ') { ++pos; } // there may also be an opening quote, if this is a charset= and not a http-equiv. 
if (pos < strLength && (str[pos] == '"' || str[pos] == '\'')) { ++pos; } // skip whitespace while (pos < strLength && str[pos] <= ' ') { ++pos; } if (pos == strLength) { continue; } int endpos = pos; while (endpos < strLength && (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\'' && str[endpos] != ';' && str[endpos] != '>')) { ++endpos; } #ifdef DECODE_DEBUG qDebug() << "KEncodingDetector: found charset in <meta>: " << str.mid(pos, endpos - pos).data(); #endif if (setEncoding(str.mid(pos, endpos - pos).data(), EncodingFromMetaTag)) { return true; } } else if (tmp[0] == 'b' && tmp[1] == 'o' && tmp[2] == 'd' && tmp[3] == 'y') { d->m_seenBody = true; break; } } } if (len < 20) { return false; } #ifdef DECODE_DEBUG qDebug() << "KEncodingDetector: using heuristics (" << strlen(data) << ")"; #endif switch (d->m_autoDetectLanguage) { case KEncodingDetector::Arabic: return setEncoding(automaticDetectionForArabic((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Baltic: return setEncoding(automaticDetectionForBaltic((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::CentralEuropean: return setEncoding(automaticDetectionForCentralEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Cyrillic: return setEncoding(automaticDetectionForCyrillic((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Greek: return setEncoding(automaticDetectionForGreek((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Hebrew: return setEncoding(automaticDetectionForHebrew((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Japanese: return setEncoding(automaticDetectionForJapanese((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::Turkish: 
return setEncoding(automaticDetectionForTurkish((const unsigned char *) data, len).data(), AutoDetectedEncoding); // break; case KEncodingDetector::WesternEuropean: if (setEncoding(automaticDetectionForWesternEuropean((const unsigned char *) data, len).data(), AutoDetectedEncoding)) { return true; } else if (d->m_defaultCodec->mibEnum() == MibLatin1) { //detection for khtml return setEncoding("iso-8859-15", AutoDetectedEncoding); } else { //use default provided by eg katepart return setEncoding("", DefaultEncoding); } // break; case KEncodingDetector::SemiautomaticDetection: case KEncodingDetector::ChineseSimplified: case KEncodingDetector::ChineseTraditional: case KEncodingDetector::Korean: case KEncodingDetector::Thai: case KEncodingDetector::Unicode: case KEncodingDetector::NorthernSaami: case KEncodingDetector::SouthEasternEurope: case KEncodingDetector::None: // huh. somethings broken in this code ### FIXME //enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback. break; } return true; }
bool AvolitesD4Parser::parseChannels(const QDomElement& elem, QLCFixtureDef* fixtureDef) { QDomElement el = elem.firstChildElement(KD4TagAttribute); for (; !el.isNull(); el = el.nextSiblingElement(KD4TagAttribute)) { // Small integrity check if (el.attribute(KD4TagID).isEmpty()) continue; // If this attribute is a function (i.e. an attribute used as a control variable for other attributes) // then we just ignore it and continue. We can check it by checking if attribute Update on a <Function/> exists if (isFunction(el)) continue; QLCChannel* chan = new QLCChannel(); chan->setName(el.attribute(KD4TagName)); chan->setGroup(getGroupFromXML(el)); chan->setColour(getColourFromXML(el)); chan->setControlByte(QLCChannel::MSB); // add channel to fixture definition fixtureDef->addChannel(chan); m_channels.insert(el.attribute(KD4TagID), chan); // if this channel is a NoGroup then we don't need to continue // no capabilities nor 16 bit channel if (chan->group() == QLCChannel::NoGroup) continue; // parse capabilities if (!parseCapabilities(el, chan)) { m_channels.remove(el.attribute(KD4TagID)); delete chan; return false; } // If we have a DMX attribute higher than 255 means we have an attribute with a 16bit precision // so, we add another channel, with 'Fine' appended to it's name and set the LSB controlbyte // NOTE: this can be changed in the future, pending the revamp over adding 16bit capabilities to any channel // not only pan/tiltm, therefore I didn't add a constant for Fine and kept it as it. if (is16Bit(el)) { QLCChannel* fchan = new QLCChannel(); fchan->setName(el.attribute(KD4TagName) + " Fine"); fchan->setGroup(getGroupFromXML(el)); fchan->setColour(getColourFromXML(el)); fchan->setControlByte(QLCChannel::LSB); // parse capabilities if (!parseCapabilities(el, fchan, true)) { delete fchan; return false; } // Finally add channel to fixture definition fixtureDef->addChannel(fchan); m_channels.insert(el.attribute(KD4TagID) + " Fine", fchan); } } return true; }