// Tries a fixed sequence of codecs on \a body and returns the first
// one that looks right, or 0 if nothing is even plausible. The order
// is: ISO-2022-JP, ASCII, UTF-8, whatever Codec::byString() suggests,
// and finally UTF-8 again with a weaker acceptance test.
//
// Every codec returned has already been run over \a body by
// toUnicode(), so the caller can inspect its state.

static Codec * guessTextCodec( const EString & body )
{
    // ISO-2022-JP is tested first: the test is very restrictive, and
    // text in that encoding would also pass the ASCII and UTF-8
    // tests below. We only try it if the body opens with one of the
    // escape sequences ESC ( or ESC $ followed by B, J or @.
    if ( body[0] == 0x1B ) {
        bool intermediate = ( body[1] == '(' || body[1] == '$' );
        if ( intermediate &&
             ( body[2] == 'B' || body[2] == 'J' || body[2] == '@' ) ) {
            Codec * jp = new Iso2022JpCodec;
            (void)jp->toUnicode( body );
            if ( jp->wellformed() )
                return jp;
        }
    }

    // Plain ASCII is next.
    Codec * ascii = new AsciiCodec;
    (void)ascii->toUnicode( body );
    if ( ascii->wellformed() )
        return ascii;

    // Multibyte encodings that use only octet values 0x01-0x7f have
    // to be tested before this point, or UTF-8 would claim them. That
    // is known to apply to ISO-2022-JP and may apply to others.

    // Does the body look good as UTF-8? If it does, but the ASCII
    // codec also considers it valid, ASCII is the answer.
    Codec * utf8 = new Utf8Codec;
    (void)utf8->toUnicode( body );
    if ( utf8->wellformed() )
        return ascii->valid() ? ascii : utf8;

    // Ask Codec for a guess based on the content. Rerunning the
    // conversion is probably unnecessary, but makes sure the state
    // we test belongs to this body.
    Codec * sniffed = Codec::byString( body );
    if ( sniffed ) {
        (void)sniffed->toUnicode( body );
        if ( sniffed->wellformed() )
            return sniffed;
    }

    // Last resort: accept UTF-8 if it is at least valid. (Whether a
    // merely valid content-based guess should be preferred here is
    // an open question.)
    if ( !utf8->valid() )
        return 0;
    return utf8;
}
// Parses the part of \a rfc2822 from index \a start up to (but not
// including) \a end as a single bodypart whose header is \a h and
// whose parent is \a parent, and returns a newly created Bodypart.
//
// \a h is modified along the way: a missing Content-Type is filled in
// from h->defaultType(), the charset parameter and the
// Content-Transfer-Encoding field are rewritten to match the form in
// which the body is kept, and h->simplify() is called at the end.
//
// Text parts are converted to Unicode (guessing the character set if
// necessary) and stored in d->text; all other parts are stored
// decoded in d->data. Multipart and message/rfc822 parts are parsed
// recursively and their children attached to the result.

Bodypart * Bodypart::parseBodypart( uint start, uint end,
                                    const EString & rfc2822,
                                    Header * h, Multipart * parent )
{
    // skip the (CR)LF that separates the header from the body.
    if ( rfc2822[start] == 13 )
        start++;
    if ( rfc2822[start] == 10 )
        start++;

    Bodypart * bp = new Bodypart;
    bp->setParent( parent );
    bp->setHeader( h );

    EString body;
    if ( end > start )
        body = rfc2822.mid( start, end-start );
    if ( !body.contains( '=' ) ) {
        // sometimes people send c-t-e: q-p _and_ c-t-e: 7bit or 8bit.
        // if they are equivalent we can accept it.
        uint i = 0;
        bool any = false;
        HeaderField * f = 0;
        while ( (f=h->field(HeaderField::ContentTransferEncoding,i)) != 0 ) {
            if ( ((ContentTransferEncoding*)f)->encoding() == EString::QP )
                any = true;
            i++;
        }
        if ( any && i > 1 )
            h->removeField( HeaderField::ContentTransferEncoding );
    }

    // undo the content-transfer-encoding. base64 and uuencode are
    // decoded as they are, everything else after crlf().
    EString::Encoding e = EString::Binary;
    ContentTransferEncoding * cte = h->contentTransferEncoding();
    if ( cte )
        e = cte->encoding();
    if ( !body.isEmpty() ) {
        if ( e == EString::Base64 || e == EString::Uuencode )
            body = body.decoded( e );
        else
            body = body.crlf().decoded( e );
    }

    // if there is no content-type, add the default one for this
    // header.
    ContentType * ct = h->contentType();
    if ( !ct ) {
        switch ( h->defaultType() ) {
        case Header::TextPlain:
            h->add( "Content-Type", "text/plain" );
            break;
        case Header::MessageRfc822:
            h->add( "Content-Type", "message/rfc822" );
            break;
        }
        ct = h->contentType();
    }
    if ( ct->type() == "text" ) {
        // specified: a usable charset parameter was supplied.
        // unknown: Codec::byName() found no codec for it.
        bool specified = false;
        bool unknown = false;
        Codec * c = 0;

        if ( ct ) {
            EString csn = ct->parameter( "charset" );
            if ( csn.lower() == "default" )
                csn = "";
            if ( !csn.isEmpty() )
                specified = true;
            c = Codec::byName( csn );
            if ( !c )
                unknown = true;
            if ( c && c->name().lower() == "us-ascii" ) {
                // Some MTAs appear to say this in case there is no
                // Content-Type field - without checking whether the
                // body actually is ASCII. If it isn't, we'd better
                // call our charset guesser.
                (void)c->toUnicode( body );
                if ( !c->valid() )
                    specified = false;
                // Not pretty.
            }
        }

        if ( !c )
            c = new AsciiCodec;

        bp->d->hasText = true;
        bp->d->text = c->toUnicode( body.crlf() );

        if ( c->name() == "GB2312" || c->name() == "ISO-2022-JP" ||
             c->name() == "KS_C_5601-1987" ) {
            // undefined code point usage in GB2312 spam is much too
            // common. (GB2312 spam is much too common, but that's
            // another matter.) Gb2312Codec turns all undefined code
            // points into U+FFFD, so here, we can take the unicode
            // form and say it's the canonical form. when a client
            // later reads the message, it gets the text in unicode,
            // including U+FFFD.

            // remember the body's state before c is reused below.
            bool bad = !c->valid();

            // the header may contain some unencoded gb2312. we bang
            // it by hand, ignoring errors.
            List<HeaderField>::Iterator hf( h->fields() );
            while ( hf ) {
                if ( !hf->valid() &&
                     hf->type() == HeaderField::Subject ) {
                    // is it right to bang only Subject?
                    c->reset();
                    hf->setValue( c->toUnicode( hf->unparsedValue() ) );
                }
                ++hf;
            }

            // if the body was bad, we prefer the (unicode) in
            // bp->d->text and pretend it arrived as UTF-8:
            if ( bad ) {
                c = new Utf8Codec;
                body = c->fromUnicode( bp->d->text );
            }
        }

        if ( ( !specified && ( !c->wellformed() ||
                               ct->subtype() == "html" ) ) ||
             ( specified && ( !c->valid() ) ) ) {
            Codec * g = 0;
            if ( ct->subtype() == "html" )
                g = guessHtmlCodec( body );
            else
                g = guessTextCodec( body );
            UString guessed;
            if ( g )
                guessed = g->toUnicode( body.crlf() );
            if ( !g ) {
                // if we couldn't guess anything, keep what we had if
                // it's valid or explicitly specified, else use
                // unknown-8bit.
                if ( !specified && !c->valid() ) {
                    c = new Unknown8BitCodec;
                    bp->d->text = c->toUnicode( body.crlf() );
                }
            }
            else {
                // if we could guess something, is our guess better
                // than what we had?
                if ( g->wellformed() && !c->wellformed() ) {
                    c = g;
                    bp->d->text = guessed;
                }
            }
        }

        if ( specified && c->state() == Codec::Invalid ) {
            // the codec was specified, and the specified codec
            // resulted in an error, but did not abort conversion. we
            // respond by forgetting the error, using the conversion
            // result (probably including one or more U+FFFD) and
            // labelling the message as UTF-8.
            c = new Utf8Codec;
            body = c->fromUnicode( bp->d->text );
        }
        else if ( !specified && c->state() == Codec::Invalid ) {
            // the codec was not specified, and we couldn't find
            // anything. we call it unknown-8bit.
            c = new Unknown8BitCodec;
            bp->d->text = c->toUnicode( body );
        }

        // if we ended up using a 16-bit codec and were using q-p, we
        // need to reevaluate without any trailing CRLF
        if ( e == EString::QP && c->name().startsWith( "UTF-16" ) )
            bp->d->text = c->toUnicode( body.stripCRLF() );

        // record a conversion error, unless one is already recorded.
        if ( !c->valid() && bp->d->error.isEmpty() ) {
            bp->d->error = "Could not convert body to Unicode";
            if ( specified ) {
                EString cs;
                if ( ct )
                    cs = ct->parameter( "charset" );
                if ( cs.isEmpty() )
                    cs = c->name();
                bp->d->error.append( " from " + cs );
            }
            if ( specified && unknown )
                bp->d->error.append( ": Character set not implemented" );
            else if ( !c->error().isEmpty() )
                bp->d->error.append( ": " + c->error() );
        }

        // make the charset parameter match the codec we settled on.
        // us-ascii is expressed by not having the parameter at all.
        if ( c->name().lower() != "us-ascii" )
            ct->addParameter( "charset", c->name().lower() );
        else if ( ct )
            ct->removeParameter( "charset" );

        // text parts use quoted-printable if they need an encoding,
        // and no content-transfer-encoding if they don't.
        body = c->fromUnicode( bp->d->text );
        bool qp = body.needsQP();

        if ( cte ) {
            if ( !qp ) {
                h->removeField( HeaderField::ContentTransferEncoding );
                cte = 0;
            }
            else if ( cte->encoding() != EString::QP ) {
                cte->setEncoding( EString::QP );
            }
        }
        else if ( qp ) {
            h->add( "Content-Transfer-Encoding", "quoted-printable" );
            cte = h->contentTransferEncoding();
        }
    }
    else {
        bp->d->data = body;
        if ( ct->type() != "multipart" && ct->type() != "message" ) {
            e = EString::Base64;
            // there may be exceptions. cases where some format really
            // needs another content-transfer-encoding:
            if ( ct->type() == "application" &&
                 ct->subtype().startsWith( "pgp-" ) &&
                 !body.needsQP() ) {
                // seems some PGP things need "Version: 1" unencoded
                e = EString::Binary;
            }
            else if ( ct->type() == "application" &&
                      ct->subtype() == "octet-stream" &&
                      body.contains( "BEGIN PGP MESSAGE" ) ) {
                // mutt cannot handle PGP in base64 (what a crock)
                e = EString::Binary;
            }
            // change c-t-e to match the encoding decided above
            if ( e == EString::Binary ) {
                h->removeField( HeaderField::ContentTransferEncoding );
                cte = 0;
            }
            else if ( cte ) {
                cte->setEncoding( e );
            }
            else {
                h->add( "Content-Transfer-Encoding", "base64" );
                cte = h->contentTransferEncoding();
            }
        }
    }

    if ( ct->type() == "multipart" ) {
        parseMultipart( start, end, rfc2822,
                        ct->parameter( "boundary" ),
                        ct->subtype() == "digest",
                        bp->children(), bp, false );
    }
    else if ( ct->type() == "message" && ct->subtype() == "rfc822" ) {
        // There are sometimes blank lines before the message. We
        // must not skip past end: if this part consists of nothing
        // but line breaks, start would overtake end, end-start would
        // wrap around (both are unsigned) and text beyond this part
        // could be handed to the parser.
        while ( start < end &&
                ( rfc2822[start] == 13 || rfc2822[start] == 10 ) )
            start++;
        EString embedded;
        if ( end > start )
            embedded = rfc2822.mid( start, end-start );
        Message * m = new Message;
        m->setParent( bp );
        m->parse( embedded );
        // the embedded message's bodyparts become children of bp.
        List<Bodypart>::Iterator it( m->children() );
        while ( it ) {
            bp->children()->append( it );
            it->setParent( bp );
            ++it;
        }
        bp->setMessage( m );
        body = m->rfc822( false );
    }

    // record the sizes before and after content-transfer-encoding.
    bp->d->numBytes = body.length();
    if ( cte )
        body = body.encoded( cte->encoding(), 72 );
    bp->d->numEncodedBytes = body.length();
    if ( bp->d->hasText ||
         ( ct->type() == "message" && ct->subtype() == "rfc822" ) ) {
        // count the lines in the encoded form. a final line without
        // a trailing LF counts, too.
        uint n = 0;
        uint i = 0;
        uint l = body.length();
        while ( i < l ) {
            if ( body[i] == '\n' )
                n++;
            i++;
        }
        if ( l && body[l-1] != '\n' )
            n++;
        bp->setNumEncodedLines( n );
    }

    h->simplify();

    return bp;
}
// Guesses a codec for the HTML document in \a body and returns it,
// or returns 0 if no candidate was found.
//
// The candidates are, in order: the result of guessTextCodec(),
// ISO-8859-1 (HTML's default), Windows codepage 1252, and any
// charset named in a <meta http-equiv="content-type"> tag in the
// document itself.

static Codec * guessHtmlCodec( const EString & body )
{
    // The general-purpose guesser gets the first shot.
    Codec * best = guessTextCodec( body );

    // HTML prescribes 8859-1 as the default, so if we have nothing
    // yet, use that, provided it can convert the body.
    if ( !best ) {
        Codec * latin1 = new Iso88591Codec;
        (void)latin1->toUnicode( body );
        if ( latin1->valid() )
            best = latin1;
    }

    // Some people believe that Windows codepage 1252 is ISO-8859-1.
    // If we still have nothing, or if what we have is a badly formed
    // 8859-1 or 8859-15, see whether cp1252 fits.
    bool tryCp1252 = !best;
    if ( best && !best->wellformed() )
        tryCp1252 = ( best->name() == "ISO-8859-1" ||
                      best->name() == "ISO-8859-15" );
    if ( tryCp1252 ) {
        Codec * cp1252 = new Cp1252Codec;
        (void)cp1252->toUnicode( body );
        if ( cp1252->wellformed() )
            best = cp1252;
    }

    // Some user-agents add a <meta http-equiv="content-type"> instead
    // of the Content-Type field. We look at each such tag in turn and
    // ask whether the charset it names is more likely to be correct
    // than what we have so far.
    EString metaTag( "<meta http-equiv=\"content-type\" content=\"" );
    EString haystack = body.lower().simplified();
    int at = haystack.find( metaTag, 0 );
    while ( at >= 0 ) {
        // The attribute value runs up to the closing quote, or to
        // the end of the document if there is none.
        int from = at + metaTag.length();
        int to = from;
        while ( to < (int)haystack.length() && haystack[to] != '"' )
            to++;

        // Let the Content-Type parser extract the charset.
        HeaderField * field
            = HeaderField::create( "Content-Type",
                                   haystack.mid( from, to-from ) );
        EString charset = ((MimeField*)field)->parameter( "charset" );
        Codec * meta = 0;
        if ( !charset.isEmpty() )
            meta = Codec::byName( charset );

        // Convert the body with both contenders, so that their
        // states describe this body when we compare them.
        UString viaMeta;
        if ( meta )
            viaMeta = meta->toUnicode( body );
        UString viaBest;
        if ( best )
            viaBest = best->toUnicode( body );

        if ( meta ) {
            bool prefer = false;
            if ( !viaMeta.isEmpty() && viaMeta == viaBest )
                prefer = true;
            else if ( meta->wellformed() &&
                      ( !best || !best->wellformed() ) )
                prefer = true;
            else if ( meta->valid() &&
                      ( !best ||
                        best->name() == "ISO-8859-1" ||
                        !best->valid() ) )
                prefer = true;

            // Only believe the tag if it can still be found once the
            // document has been converted using the codec it names.
            if ( prefer &&
                 meta->toUnicode( haystack ).ascii().contains( metaTag ) )
                best = meta;
        }

        at = haystack.find( metaTag, from );
    }

    return best;
}