void ContentType::parse( const EString &s ) { EmailParser p( s ); p.whitespace(); while ( p.present( ":" ) ) p.whitespace(); bool mustGuess = false; if ( p.atEnd() ) { t = "text"; st = "plain"; } else { uint x = p.mark(); if ( p.nextChar() == '/' ) mustGuess = true; else t = p.mimeToken().lower(); if ( p.atEnd() ) { if ( s == "text" ) { t = "text"; // elm? mailtool? someone does this, anyway. st = "plain"; } // the remainder is from RFC 1049 else if ( s == "postscript" ) { t = "application"; st = "postscript"; } else if ( s == "postscript" ) { t = "application"; st = "postscript"; } else if ( s == "sgml" ) { t = "text"; st = "sgml"; } else if ( s == "tex" ) { t = "application"; st = "x-tex"; } else if ( s == "troff" ) { t = "application"; st = "x-troff"; } else if ( s == "dvi" ) { t = "application"; st = "x-dvi"; } else if ( s.startsWith( "x-" ) ) { st = "x-rfc1049-" + s; t = "application"; } else { // scribe and undefined types setError( "Invalid Content-Type: " + s.quoted() ); } } else { if ( p.nextChar() == '/' ) { p.step(); if ( !p.atEnd() || p.nextChar() != ';' ) st = p.mimeToken().lower(); if ( st.isEmpty() ) mustGuess = true; } else if ( p.nextChar() == '=' ) { // oh no. someone skipped the content-type and // supplied only some parameters. we'll assume it's // text/plain and parse the parameters. t = "text"; st = "plain"; p.restore( x ); mustGuess = true; } else { addParameter( "original-type", t + "/" + st ); t = "application"; st = "octet-stream"; mustGuess = true; } parseParameters( &p ); } } if ( mustGuess ) { EString fn = parameter( "name" ); if ( fn.isEmpty() ) fn = parameter( "filename" ); while ( fn.endsWith( "." 
) ) fn.truncate( fn.length() - 1 ); fn = fn.lower(); if ( fn.endsWith( "jpg" ) || fn.endsWith( "jpeg" ) ) { t = "image"; st = "jpeg"; } else if ( fn.endsWith( "htm" ) || fn.endsWith( "html" ) ) { t = "text"; st = "html"; } else if ( fn.isEmpty() && st.isEmpty() && t == "text" ) { st = "plain"; } else if ( t == "text" ) { addParameter( "original-type", t + "/" + st ); st = "plain"; } else { addParameter( "original-type", t + "/" + st ); t = "application"; st = "octet-stream"; } } if ( t.isEmpty() || st.isEmpty() ) setError( "Both type and subtype must be nonempty: " + s.quoted() ); if ( valid() && t == "multipart" && st == "appledouble" && parameter( "boundary" ).isEmpty() ) { // some people send appledouble without the header. what can // we do? let's just call it application/octet-stream. whoever // wants to decode can try, or reply. t = "application"; st = "octet-steam"; } if ( valid() && !p.atEnd() && t == "multipart" && parameter( "boundary" ).isEmpty() && s.lower().containsWord( "boundary" ) ) { EmailParser csp( s.mid( s.lower().find( "boundary" ) ) ); csp.require( "boundary" ); csp.whitespace(); if ( csp.present( "=" ) ) csp.whitespace(); uint m = csp.mark(); EString b = csp.string(); if ( b.isEmpty() || !csp.ok() ) { csp.restore( m ); b = csp.input().mid( csp.pos() ).section( ";", 1 ).simplified(); if ( !b.isQuoted() ) b.replace( "\\", "" ); if ( b.isQuoted() ) b = b.unquoted(); else if ( b.isQuoted( '\'' ) ) b = b.unquoted( '\'' ); } if ( !b.isEmpty() ) addParameter( "boundary", b ); } if ( valid() && t == "multipart" && parameter( "boundary" ).isEmpty() ) setError( "Multipart entities must have a boundary parameter." 
); if ( !parameter( "charset" ).isEmpty() ) { Codec * c = Codec::byName( parameter( "charset" ) ); if ( c ) { EString cs = c->name().lower(); if ( t == "text" && cs == "us-ascii" ) removeParameter( "charset" ); else if ( cs != parameter( "charset" ).lower() ) addParameter( "charset", cs ); } } if ( valid() && !p.atEnd() && t == "text" && parameter( "charset" ).isEmpty() && s.mid( p.pos() ).lower().containsWord( "charset" ) ) { EmailParser csp( s.mid( s.lower().find( "charset" ) ) ); csp.require( "charset" ); csp.whitespace(); if ( csp.present( "=" ) ) csp.whitespace(); Codec * c = Codec::byName( csp.dotAtom() ); if ( c ) addParameter( "charset", c->name().lower() ); } if ( !valid() ) setUnparsedValue( s ); }
// Parses the part of \a rfc2822 from offset \a start up to (not
// including) \a end as a single bodypart whose header is \a h and whose
// parent is \a parent, and returns a newly created Bodypart.
//
// The body is decoded according to the header's
// Content-Transfer-Encoding. Text bodies are converted to Unicode,
// guessing the character set where the declared one is missing or does
// not work; other leaf bodies are stored as data. Multipart and
// message/rfc822 bodies are parsed recursively. Along the way \a h is
// modified: Content-Type, charset and Content-Transfer-Encoding are
// added, changed or removed to match what was actually stored, and
// h->simplify() is called at the end.
Bodypart * Bodypart::parseBodypart( uint start, uint end,
                                    const EString & rfc2822,
                                    Header * h, Multipart * parent )
{
    // skip one CR and/or LF at the start of the body
    if ( rfc2822[start] == 13 )
        start++;
    if ( rfc2822[start] == 10 )
        start++;

    Bodypart * bp = new Bodypart;
    bp->setParent( parent );
    bp->setHeader( h );

    EString body;
    if ( end > start )
        body = rfc2822.mid( start, end-start );
    if ( !body.contains( '=' ) ) {
        // sometimes people send c-t-e: q-p _and_ c-t-e: 7bit or 8bit.
        // if they are equivalent we can accept it.
        uint i = 0;
        bool any = false;
        HeaderField * f = 0;
        while ( (f=h->field(HeaderField::ContentTransferEncoding,i)) != 0 ) {
            if ( ((ContentTransferEncoding*)f)->encoding() == EString::QP )
                any = true;
            i++;
        }
        if ( any && i > 1 )
            h->removeField( HeaderField::ContentTransferEncoding );
    }

    EString::Encoding e = EString::Binary;
    ContentTransferEncoding * cte = h->contentTransferEncoding();
    if ( cte )
        e = cte->encoding();
    if ( !body.isEmpty() ) {
        if ( e == EString::Base64 || e == EString::Uuencode )
            body = body.decoded( e );
        else
            body = body.crlf().decoded( e );
    }

    ContentType * ct = h->contentType();
    if ( !ct ) {
        // no Content-Type: add the header's default one
        switch ( h->defaultType() ) {
        case Header::TextPlain:
            h->add( "Content-Type", "text/plain" );
            break;
        case Header::MessageRfc822:
            h->add( "Content-Type", "message/rfc822" );
            break;
        }
        ct = h->contentType();
    }
    if ( ct->type() == "text" ) {
        bool specified = false; // a usable charset was declared
        bool unknown = false;   // no codec found for the charset name
        Codec * c = 0;

        if ( ct ) {
            EString csn = ct->parameter( "charset" );
            if ( csn.lower() == "default" )
                csn = "";
            if ( !csn.isEmpty() )
                specified = true;
            c = Codec::byName( csn );
            if ( !c )
                unknown = true;
            if ( c && c->name().lower() == "us-ascii" ) {
                // Some MTAs appear to say this in case there is no
                // Content-Type field - without checking whether the
                // body actually is ASCII. If it isn't, we'd better
                // call our charset guesser.
                (void)c->toUnicode( body );
                if ( !c->valid() )
                    specified = false;
                // Not pretty.
            }
        }

        if ( !c )
            c = new AsciiCodec;

        bp->d->hasText = true;
        bp->d->text = c->toUnicode( body.crlf() );

        if ( c->name() == "GB2312" || c->name() == "ISO-2022-JP" ||
             c->name() == "KS_C_5601-1987" ) {
            // undefined code point usage in GB2312 spam is much too
            // common. (GB2312 spam is much too common, but that's
            // another matter.) Gb2312Codec turns all undefined code
            // points into U+FFFD, so here, we can take the unicode
            // form and say it's the canonical form. when a client
            // later reads the message, it gets the text in unicode,
            // including U+FFFD.

            bool bad = !c->valid();

            // the header may contain some unencoded gb2312. we bang
            // it by hand, ignoring errors.
            List<HeaderField>::Iterator hf( h->fields() );
            while ( hf ) {
                if ( !hf->valid() &&
                     hf->type() == HeaderField::Subject ) {
                    // is it right to bang only Subject?
                    c->reset();
                    hf->setValue( c->toUnicode( hf->unparsedValue() ) );
                }
                ++hf;
            }

            // if the body was bad, we prefer the (unicode) in
            // bp->d->text and pretend it arrived as UTF-8:
            if ( bad ) {
                c = new Utf8Codec;
                body = c->fromUnicode( bp->d->text );
            }
        }

        if ( ( !specified && ( !c->wellformed() ||
                               ct->subtype() == "html" ) ) ||
             ( specified && ( !c->valid() ) ) ) {
            Codec * g = 0;
            if ( ct->subtype() == "html" )
                g = guessHtmlCodec( body );
            else
                g = guessTextCodec( body );
            UString guessed;
            if ( g )
                guessed = g->toUnicode( body.crlf() );
            if ( !g ) {
                // if we couldn't guess anything, keep what we had if
                // it's valid or explicitly specified, else use
                // unknown-8bit.
                if ( !specified && !c->valid() ) {
                    c = new Unknown8BitCodec;
                    bp->d->text = c->toUnicode( body.crlf() );
                }
            }
            else {
                // if we could guess something, is our guess better
                // than what we had?
                if ( g->wellformed() && !c->wellformed() ) {
                    c = g;
                    bp->d->text = guessed;
                }
            }
        }

        if ( specified && c->state() == Codec::Invalid ) {
            // the codec was specified, and the specified codec
            // resulted in an error, but did not abort conversion. we
            // respond by forgetting the error, using the conversion
            // result (probably including one or more U+FFFD) and
            // labelling the message as UTF-8.
            c = new Utf8Codec;
            body = c->fromUnicode( bp->d->text );
        }
        else if ( !specified && c->state() == Codec::Invalid ) {
            // the codec was not specified, and we couldn't find
            // anything. we call it unknown-8bit.
            c = new Unknown8BitCodec;
            bp->d->text = c->toUnicode( body );
        }

        // if we ended up using a 16-bit codec and were using q-p, we
        // need to reevaluate without any trailing CRLF
        if ( e == EString::QP && c->name().startsWith( "UTF-16" ) )
            bp->d->text = c->toUnicode( body.stripCRLF() );

        if ( !c->valid() && bp->d->error.isEmpty() ) {
            bp->d->error = "Could not convert body to Unicode";
            if ( specified ) {
                EString cs;
                if ( ct )
                    cs = ct->parameter( "charset" );
                if ( cs.isEmpty() )
                    cs = c->name();
                bp->d->error.append( " from " + cs );
            }
            if ( specified && unknown )
                bp->d->error.append( ": Character set not implemented" );
            else if ( !c->error().isEmpty() )
                bp->d->error.append( ": " + c->error() );
        }

        // make the charset parameter describe the codec finally used
        if ( c->name().lower() != "us-ascii" )
            ct->addParameter( "charset", c->name().lower() );
        else if ( ct )
            ct->removeParameter( "charset" );

        // re-encode from Unicode and make the c-t-e field agree with
        // whether the result needs quoted-printable
        body = c->fromUnicode( bp->d->text );
        bool qp = body.needsQP();

        if ( cte ) {
            if ( !qp ) {
                h->removeField( HeaderField::ContentTransferEncoding );
                cte = 0;
            }
            else if ( cte->encoding() != EString::QP ) {
                cte->setEncoding( EString::QP );
            }
        }
        else if ( qp ) {
            h->add( "Content-Transfer-Encoding", "quoted-printable" );
            cte = h->contentTransferEncoding();
        }
    }
    else {
        bp->d->data = body;
        if ( ct->type() != "multipart" && ct->type() != "message" ) {
            e = EString::Base64;
            // there may be exceptions. cases where some format really
            // needs another content-transfer-encoding:
            if ( ct->type() == "application" &&
                 ct->subtype().startsWith( "pgp-" ) &&
                 !body.needsQP() ) {
                // seems some PGP things need "Version: 1" unencoded
                e = EString::Binary;
            }
            else if ( ct->type() == "application" &&
                      ct->subtype() == "octet-stream" &&
                      body.contains( "BEGIN PGP MESSAGE" ) ) {
                // mutt cannot handle PGP in base64 (what a crock)
                e = EString::Binary;
            }
            // change c-t-e to match the encoding decided above
            if ( e == EString::Binary ) {
                h->removeField( HeaderField::ContentTransferEncoding );
                cte = 0;
            }
            else if ( cte ) {
                cte->setEncoding( e );
            }
            else {
                h->add( "Content-Transfer-Encoding", "base64" );
                cte = h->contentTransferEncoding();
            }
        }
    }

    if ( ct->type() == "multipart" ) {
        parseMultipart( start, end, rfc2822,
                        ct->parameter( "boundary" ),
                        ct->subtype() == "digest",
                        bp->children(), bp, false );
    }
    else if ( ct->type() == "message" && ct->subtype() == "rfc822" ) {
        // There are sometimes blank lines before the message. Do not
        // skip past the end of this part while looking for them.
        while ( start < end &&
                ( rfc2822[start] == 13 || rfc2822[start] == 10 ) )
            start++;
        // Same guard as for body above: end-start is unsigned and must
        // not be computed when start has reached or passed end.
        EString inner;
        if ( end > start )
            inner = rfc2822.mid( start, end-start );
        Message * m = new Message;
        m->setParent( bp );
        m->parse( inner );
        // the embedded message's bodyparts become children of bp
        List<Bodypart>::Iterator it( m->children() );
        while ( it ) {
            bp->children()->append( it );
            it->setParent( bp );
            ++it;
        }
        bp->setMessage( m );
        body = m->rfc822( false );
    }

    bp->d->numBytes = body.length();
    if ( cte )
        body = body.encoded( cte->encoding(), 72 );
    bp->d->numEncodedBytes = body.length();
    if ( bp->d->hasText ||
         ( ct->type() == "message" && ct->subtype() == "rfc822" ) ) {
        // count lines in the encoded form; a final line without a
        // trailing LF still counts
        uint n = 0;
        uint i = 0;
        uint l = body.length();
        while ( i < l ) {
            if ( body[i] == '\n' )
                n++;
            i++;
        }
        if ( l && body[l-1] != '\n' )
            n++;
        bp->setNumEncodedLines( n );
    }

    h->simplify();

    return bp;
}
int main(int argc, char **argv) { if (argc < 3) return 1; av::init(); av::setFFmpegLoggingLevel(AV_LOG_TRACE); string uri (argv[1]); string out (argv[2]); ssize_t audioStream = -1; AudioDecoderContext adec; Stream ast; error_code ec; int count = 0; { // // INPUT // FormatContext ictx; ictx.openInput(uri, ec); if (ec) { cerr << "Can't open input\n"; return 1; } ictx.findStreamInfo(); for (size_t i = 0; i < ictx.streamsCount(); ++i) { auto st = ictx.stream(i); if (st.isAudio()) { audioStream = i; ast = st; break; } } cerr << audioStream << endl; if (ast.isNull()) { cerr << "Audio stream not found\n"; return 1; } if (ast.isValid()) { adec = AudioDecoderContext(ast); //Codec codec = findDecodingCodec(adec.raw()->codec_id); //adec.setCodec(codec); //adec.setRefCountedFrames(true); adec.open(ec); if (ec) { cerr << "Can't open codec\n"; return 1; } } // // OUTPUT // OutputFormat ofmt; FormatContext octx; ofmt = av::guessOutputFormat(out, out); clog << "Output format: " << ofmt.name() << " / " << ofmt.longName() << '\n'; octx.setFormat(ofmt); Codec ocodec = av::findEncodingCodec(ofmt, false); Stream ost = octx.addStream(ocodec); AudioEncoderContext enc (ost); clog << ocodec.name() << " / " << ocodec.longName() << ", audio: " << (ocodec.type()==AVMEDIA_TYPE_AUDIO) << '\n'; auto sampleFmts = ocodec.supportedSampleFormats(); auto sampleRates = ocodec.supportedSamplerates(); auto layouts = ocodec.supportedChannelLayouts(); clog << "Supported sample formats:\n"; for (const auto &fmt : sampleFmts) { clog << " " << av_get_sample_fmt_name(fmt) << '\n'; } clog << "Supported sample rates:\n"; for (const auto &rate : sampleRates) { clog << " " << rate << '\n'; } clog << "Supported sample layouts:\n"; for (const auto &lay : layouts) { char buf[128] = {0}; av_get_channel_layout_string(buf, sizeof(buf), av_get_channel_layout_nb_channels(lay), lay); clog << " " << buf << '\n'; } //return 0; // Settings #if 1 enc.setSampleRate(48000); enc.setSampleFormat(sampleFmts[0]); // Layout 
//enc.setChannelLayout(adec.channelLayout()); enc.setChannelLayout(AV_CH_LAYOUT_STEREO); //enc.setChannelLayout(AV_CH_LAYOUT_MONO); enc.setTimeBase(Rational(1, enc.sampleRate())); enc.setBitRate(adec.bitRate()); #else enc.setSampleRate(adec.sampleRate()); enc.setSampleFormat(adec.sampleFormat()); enc.setChannelLayout(adec.channelLayout()); enc.setTimeBase(adec.timeBase()); enc.setBitRate(adec.bitRate()); #endif octx.openOutput(out, ec); if (ec) { cerr << "Can't open output\n"; return 1; } enc.open(ec); if (ec) { cerr << "Can't open encoder\n"; return 1; } clog << "Encoder frame size: " << enc.frameSize() << '\n'; octx.dump(); octx.writeHeader(); octx.flush(); // // RESAMPLER // AudioResampler resampler(enc.channelLayout(), enc.sampleRate(), enc.sampleFormat(), adec.channelLayout(), adec.sampleRate(), adec.sampleFormat()); // // PROCESS // while (true) { Packet pkt = ictx.readPacket(ec); if (ec) { clog << "Packet reading error: " << ec << ", " << ec.message() << endl; break; } if (pkt.streamIndex() != audioStream) { continue; } clog << "Read packet: isNull=" << (bool)!pkt << ", " << pkt.pts() << "(nopts:" << pkt.pts().isNoPts() << ")" << " / " << pkt.pts().seconds() << " / " << pkt.timeBase() << " / st: " << pkt.streamIndex() << endl; #if 0 if (pkt.pts() == av::NoPts && pkt.timeBase() == Rational()) { clog << "Skip invalid timestamp packet: data=" << (void*)pkt.data() << ", size=" << pkt.size() << ", flags=" << pkt.flags() << " (corrupt:" << (pkt.flags() & AV_PKT_FLAG_CORRUPT) << ";key:" << (pkt.flags() & AV_PKT_FLAG_KEY) << ")" << ", side_data=" << (void*)pkt.raw()->side_data << ", side_data_count=" << pkt.raw()->side_data_elems << endl; //continue; } #endif auto samples = adec.decode(pkt, ec); count++; //if (count > 200) // break; if (ec) { cerr << "Decode error: " << ec << ", " << ec.message() << endl; return 1; } else if (!samples) { cerr << "Empty samples set\n"; //if (!pkt) // decoder flushed here // break; //continue; } clog << " Samples [in]: " << 
samples.samplesCount() << ", ch: " << samples.channelsCount() << ", freq: " << samples.sampleRate() << ", name: " << samples.channelsLayoutString() << ", pts: " << samples.pts().seconds() << ", ref=" << samples.isReferenced() << ":" << samples.refCount() << endl; // Empty samples set should not be pushed to the resampler, but it is valid case for the // end of reading: during samples empty, some cached data can be stored at the resampler // internal buffer, so we should consume it. if (samples) { resampler.push(samples, ec); if (ec) { clog << "Resampler push error: " << ec << ", text: " << ec.message() << endl; continue; } } // Pop resampler data bool getAll = !samples; while (true) { AudioSamples ouSamples(enc.sampleFormat(), enc.frameSize(), enc.channelLayout(), enc.sampleRate()); // Resample: bool hasFrame = resampler.pop(ouSamples, getAll, ec); if (ec) { clog << "Resampling status: " << ec << ", text: " << ec.message() << endl; break; } else if (!hasFrame) { break; } else clog << " Samples [ou]: " << ouSamples.samplesCount() << ", ch: " << ouSamples.channelsCount() << ", freq: " << ouSamples.sampleRate() << ", name: " << ouSamples.channelsLayoutString() << ", pts: " << ouSamples.pts().seconds() << ", ref=" << ouSamples.isReferenced() << ":" << ouSamples.refCount() << endl; // ENCODE ouSamples.setStreamIndex(0); ouSamples.setTimeBase(enc.timeBase()); Packet opkt = enc.encode(ouSamples, ec); if (ec) { cerr << "Encoding error: " << ec << ", " << ec.message() << endl; return 1; } else if (!opkt) { //cerr << "Empty packet\n"; continue; } opkt.setStreamIndex(0); clog << "Write packet: pts=" << opkt.pts() << ", dts=" << opkt.dts() << " / " << opkt.pts().seconds() << " / " << opkt.timeBase() << " / st: " << opkt.streamIndex() << endl; octx.writePacket(opkt, ec); if (ec) { cerr << "Error write packet: " << ec << ", " << ec.message() << endl; return 1; } } // For the first packets samples can be empty: decoder caching if (!pkt && !samples) break; } // // Is resampler 
flushed? // cerr << "Delay: " << resampler.delay() << endl; // // Flush encoder queue // clog << "Flush encoder:\n"; while (true) { AudioSamples null(nullptr); Packet opkt = enc.encode(null, ec); if (ec || !opkt) break; opkt.setStreamIndex(0); clog << "Write packet: pts=" << opkt.pts() << ", dts=" << opkt.dts() << " / " << opkt.pts().seconds() << " / " << opkt.timeBase() << " / st: " << opkt.streamIndex() << endl; octx.writePacket(opkt, ec); if (ec) { cerr << "Error write packet: " << ec << ", " << ec.message() << endl; return 1; } } octx.flush(); octx.writeTrailer(); } }
// Returns a codec that looks suitable for the HTML document \a body,
// or a null pointer if none was found.
//
// The candidates are tried in this order: whatever guessTextCodec()
// suggests, ISO-8859-1, Windows-1252 as a stand-in for mislabelled
// 8859-1/-15, and finally any charset named in a
// <meta http-equiv="content-type"> tag inside the document.
static Codec * guessHtmlCodec( const EString & body )
{
    // Start with the general-purpose guesser.
    Codec * best = guessTextCodec( body );

    // Nothing? HTML's default is 8859-1, so see whether the body
    // converts cleanly with that.
    if ( !best ) {
        Codec * latin1 = new Iso88591Codec;
        (void)latin1->toUnicode( body );
        if ( latin1->valid() )
            best = latin1;
    }

    // Windows-1252 is often mislabelled as 8859-1. Try it when there
    // is no candidate yet, or when an 8859-1/-15 candidate is not
    // wellformed.
    bool tryCp1252 = true;
    if ( best ) {
        tryCp1252 = false;
        if ( !best->wellformed() )
            tryCp1252 = best->name() == "ISO-8859-1" ||
                        best->name() == "ISO-8859-15";
    }
    if ( tryCp1252 ) {
        Codec * cp1252 = new Cp1252Codec;
        (void)cp1252->toUnicode( body );
        if ( cp1252->wellformed() )
            best = cp1252;
    }

    // Some senders put the charset in a <meta> tag rather than in the
    // Content-Type field. Look at every such tag and decide whether
    // its charset beats the candidate found so far.
    EString tag( "<meta http-equiv=\"content-type\" content=\"" );
    EString haystack = body.lower().simplified();
    int at = 0;
    for ( ;; ) {
        at = haystack.find( tag, at );
        if ( at < 0 )
            break;

        // the content attribute runs up to the closing double quote
        at = at + tag.length();
        int close = at;
        while ( close < (int)haystack.length() && haystack[close] != '"' )
            close++;

        HeaderField * field
            = HeaderField::create( "Content-Type",
                                   haystack.mid( at, close-at ) );
        EString charset = ((MimeField*)field)->parameter( "charset" );

        Codec * meta = 0;
        if ( !charset.isEmpty() )
            meta = Codec::byName( charset );

        // convert with both codecs; this also refreshes their state
        // for the checks below
        UString viaMeta;
        if ( meta )
            viaMeta = meta->toUnicode( body );
        UString viaBest;
        if ( best )
            viaBest = best->toUnicode( body );

        bool metaWins = false;
        if ( meta ) {
            if ( !viaMeta.isEmpty() && viaMeta == viaBest )
                metaWins = true;
            else if ( meta->wellformed() &&
                      ( !best || !best->wellformed() ) )
                metaWins = true;
            else if ( meta->valid() &&
                      ( !best ||
                        best->name() == "ISO-8859-1" ||
                        !best->valid() ) )
                metaWins = true;
        }

        // finally, the tag itself must still be readable when the
        // document is decoded with the meta charset
        if ( metaWins &&
             meta->toUnicode( haystack ).ascii().contains( tag ) )
            best = meta;
    }

    return best;
}