template <typename T>
static void
ResampleChannelBuffer(SpeexResamplerState* aResampler, uint32_t aChannel,
                      const T* aInput, uint32_t aInputDuration,
                      nsTArray<float>* aOutput)
{
  if (!aResampler) {
    float* out = aOutput->AppendElements(aInputDuration);
    for (uint32_t i = 0; i < aInputDuration; ++i) {
      out[i] = AudioSampleToFloat(aInput[i]);
    }
    return;
  }

  uint32_t processed = 0;
  while (processed < aInputDuration) {
    uint32_t prevLength = aOutput->Length();
    float* output = aOutput->AppendElements(SPEEX_RESAMPLER_PROCESS_MAX_OUTPUT);
    uint32_t in = aInputDuration - processed;
    uint32_t out = aOutput->Length() - prevLength;
    WebAudioUtils::SpeexResamplerProcess(aResampler, aChannel,
                                         aInput + processed, &in,
                                         output, &out);
    processed += in;
    aOutput->SetLength(prevLength + out);
  }
}
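// For reference, a minimal sketch of the conversion helper every snippet in
// this listing leans on. These two overloads are assumed to match Gecko's
// AudioSampleFormat.h: float input passes through unchanged, and int16_t is
// scaled so that -32768 maps to -1.0f.
inline float AudioSampleToFloat(float aValue)
{
  return aValue;
}
inline float AudioSampleToFloat(int16_t aValue)
{
  return aValue / 32768.0f;
}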
static AlignedAudioBuffer
CopyAndPackAudio(AVFrame* aFrame, uint32_t aNumChannels, uint32_t aNumAFrames)
{
  MOZ_ASSERT(aNumChannels <= MAX_CHANNELS);

  AlignedAudioBuffer audio(aNumChannels * aNumAFrames);
  if (!audio) {
    return audio;
  }

  if (aFrame->format == AV_SAMPLE_FMT_FLT) {
    // Audio data already packed. No need to do anything other than copy it
    // into a buffer we own.
    memcpy(audio.get(), aFrame->data[0],
           aNumChannels * aNumAFrames * sizeof(AudioDataValue));
  } else if (aFrame->format == AV_SAMPLE_FMT_FLTP) {
    // Planar audio data. Pack it into something we can understand.
    AudioDataValue* tmp = audio.get();
    AudioDataValue** data = reinterpret_cast<AudioDataValue**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = data[channel][frame];
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S16) {
    // Audio data already packed. Need to convert from S16 to 32-bit float.
    AudioDataValue* tmp = audio.get();
    int16_t* data = reinterpret_cast<int16_t**>(aFrame->data)[0];
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = AudioSampleToFloat(*data++);
      }
    }
  } else if (aFrame->format == AV_SAMPLE_FMT_S16P) {
    // Planar audio data. Convert it from S16 to 32-bit float and pack it
    // into something we can understand.
    AudioDataValue* tmp = audio.get();
    int16_t** data = reinterpret_cast<int16_t**>(aFrame->data);
    for (uint32_t frame = 0; frame < aNumAFrames; frame++) {
      for (uint32_t channel = 0; channel < aNumChannels; channel++) {
        *tmp++ = AudioSampleToFloat(data[channel][frame]);
      }
    }
  }

  return audio;
}
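// Illustration (editorial, not from the source): for stereo audio with three
// frames, the packed layouts (FLT/S16) store one interleaved array in
// aFrame->data[0]:
//   L0 R0 L1 R1 L2 R2
// while the planar layouts (FLTP/S16P) store one array per channel:
//   aFrame->data[0] = L0 L1 L2,  aFrame->data[1] = R0 R1 R2
// CopyAndPackAudio always produces the interleaved form.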
bool
DirectShowReader::DecodeAudioData()
{
  MOZ_ASSERT(mDecoder->OnDecodeThread(), "Should be on decode thread.");
  HRESULT hr;

  SampleSink* sink = mAudioSinkFilter->GetSampleSink();
  if (sink->AtEOS()) {
    // End of stream.
    return Finish(S_OK);
  }

  // Get the next chunk of audio samples. This blocks until the sample
  // arrives, or an error occurs (like the stream being shut down).
  RefPtr<IMediaSample> sample;
  hr = sink->Extract(sample);
  if (FAILED(hr) || hr == S_FALSE) {
    return Finish(hr);
  }

  int64_t start = 0, end = 0;
  sample->GetMediaTime(&start, &end);
  LOG("DirectShowReader::DecodeAudioData [%4.2lf-%4.2lf]",
      RefTimeToSeconds(start),
      RefTimeToSeconds(end));

  LONG length = sample->GetActualDataLength();
  LONG numSamples = length / mBytesPerSample;
  LONG numFrames = length / mBytesPerSample / mNumChannels;

  BYTE* data = nullptr;
  hr = sample->GetPointer(&data);
  NS_ENSURE_TRUE(SUCCEEDED(hr), Finish(hr));

  nsAutoArrayPtr<AudioDataValue> buffer(new AudioDataValue[numSamples]);
  AudioDataValue* dst = buffer.get();
  if (mBytesPerSample == 1) {
    uint8_t* src = reinterpret_cast<uint8_t*>(data);
    for (int32_t i = 0; i < numSamples; ++i) {
      dst[i] = UnsignedByteToAudioSample(src[i]);
    }
  } else if (mBytesPerSample == 2) {
    int16_t* src = reinterpret_cast<int16_t*>(data);
    for (int32_t i = 0; i < numSamples; ++i) {
      dst[i] = AudioSampleToFloat(src[i]);
    }
  }

  mAudioQueue.Push(new AudioData(mDecoder->GetResource()->Tell(),
                                 RefTimeToUsecs(start),
                                 RefTimeToUsecs(end - start),
                                 numFrames,
                                 buffer.forget(),
                                 mNumChannels));
  return true;
}
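// Sketch of the 8-bit helper used above (hedge: this mirrors the common
// convention that unsigned 8-bit audio is centered on 128, so 0 maps to
// -1.0f and 255 maps to +1.0f; the exact Gecko signature may differ).
inline float UnsignedByteToAudioSample(uint8_t aValue)
{
  return aValue * (2.0f / UINT8_MAX) - 1.0f;
}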
template <typename SrcT, typename DestT>
static void
InterleaveAndConvertBuffer(const SrcT** aSourceChannels,
                           int32_t aLength, float aVolume,
                           int32_t aChannels,
                           DestT* aOutput)
{
  DestT* output = aOutput;
  for (int32_t i = 0; i < aLength; ++i) {
    for (int32_t channel = 0; channel < aChannels; ++channel) {
      float v = AudioSampleToFloat(aSourceChannels[channel][i]) * aVolume;
      *output = FloatToAudioSample<DestT>(v);
      ++output;
    }
  }
}
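// Hypothetical usage sketch (the names below are illustrative, not from the
// source): interleave two int16_t channels into a float stereo buffer at
// unity volume.
//   const int16_t left[3]  = { 0,  16384, -32768 };
//   const int16_t right[3] = { 0, -16384,  32767 };
//   const int16_t* channels[2] = { left, right };
//   float interleaved[6];
//   InterleaveAndConvertBuffer(channels, 3, 1.0f, 2, interleaved);
//   // interleaved: 0, 0, 0.5, -0.5, -1.0, ~0.99997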
template <>
int
SpeexResamplerProcess<int16_t>(SpeexResamplerState* aResampler,
                               uint32_t aChannel,
                               const int16_t* aInput, uint32_t* aIn,
                               float* aOutput, uint32_t* aOut)
{
  NS_ASSERTION(*aOut <= SPEEX_RESAMPLER_PROCESS_MAX_OUTPUT, "Bad aOut");
  int16_t tmp[SPEEX_RESAMPLER_PROCESS_MAX_OUTPUT];
  int result = speex_resampler_process_int(aResampler, aChannel,
                                           aInput, aIn, tmp, aOut);
  if (result == RESAMPLER_ERR_SUCCESS) {
    for (uint32_t i = 0; i < *aOut; ++i) {
      aOutput[i] = AudioSampleToFloat(tmp[i]);
    }
  }
  return result;
}
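// For contrast, a sketch of the float specialization (assumption: float
// input needs no int16_t staging buffer and can go straight through
// speex_resampler_process_float).
template <>
int
SpeexResamplerProcess<float>(SpeexResamplerState* aResampler,
                             uint32_t aChannel,
                             const float* aInput, uint32_t* aIn,
                             float* aOutput, uint32_t* aOut)
{
  return speex_resampler_process_float(aResampler, aChannel,
                                       aInput, aIn, aOutput, aOut);
}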
void
DownmixAndInterleave(const nsTArray<const void*>& aChannelData,
                     AudioSampleFormat aSourceFormat, int32_t aDuration,
                     float aVolume, uint32_t aOutputChannels,
                     AudioDataValue* aOutput)
{
  nsAutoTArray<const void*,GUESS_AUDIO_CHANNELS> channelData;
  nsAutoTArray<float,AUDIO_PROCESSING_FRAMES*GUESS_AUDIO_CHANNELS> downmixConversionBuffer;
  nsAutoTArray<float,AUDIO_PROCESSING_FRAMES*GUESS_AUDIO_CHANNELS> downmixOutputBuffer;

  channelData.SetLength(aChannelData.Length());
  if (aSourceFormat != AUDIO_FORMAT_FLOAT32) {
    NS_ASSERTION(aSourceFormat == AUDIO_FORMAT_S16, "unknown format");
    downmixConversionBuffer.SetLength(aDuration*aChannelData.Length());
    for (uint32_t i = 0; i < aChannelData.Length(); ++i) {
      float* conversionBuf = downmixConversionBuffer.Elements() + (i*aDuration);
      const int16_t* sourceBuf = static_cast<const int16_t*>(aChannelData[i]);
      for (uint32_t j = 0; j < (uint32_t)aDuration; ++j) {
        conversionBuf[j] = AudioSampleToFloat(sourceBuf[j]);
      }
      channelData[i] = conversionBuf;
    }
  } else {
    for (uint32_t i = 0; i < aChannelData.Length(); ++i) {
      channelData[i] = aChannelData[i];
    }
  }

  downmixOutputBuffer.SetLength(aDuration*aOutputChannels);
  nsAutoTArray<float*,GUESS_AUDIO_CHANNELS> outputChannelBuffers;
  nsAutoTArray<const void*,GUESS_AUDIO_CHANNELS> outputChannelData;
  outputChannelBuffers.SetLength(aOutputChannels);
  outputChannelData.SetLength(aOutputChannels);
  for (uint32_t i = 0; i < (uint32_t)aOutputChannels; ++i) {
    outputChannelData[i] = outputChannelBuffers[i] =
      downmixOutputBuffer.Elements() + aDuration*i;
  }
  if (channelData.Length() > aOutputChannels) {
    AudioChannelsDownMix(channelData, outputChannelBuffers.Elements(),
                         aOutputChannels, aDuration);
  }
  InterleaveAndConvertBuffer(outputChannelData.Elements(), AUDIO_FORMAT_FLOAT32,
                             aDuration, aVolume, aOutputChannels, aOutput);
}
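// Layout note (editorial): downmixOutputBuffer is one flat float allocation
// carved into aOutputChannels planar views of aDuration samples each.
// outputChannelBuffers holds the mutable views handed to AudioChannelsDownMix,
// and outputChannelData re-exposes the same pointers as const for the final
// interleaving pass.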
uint32_t operator()(AudioDataValue *aBuffer, uint32_t aSamples)
{
  uint32_t maxSamples = std::min(aSamples, mSamples - mNextSample);
  uint32_t frames = maxSamples / mChannels;
  size_t byteOffset = mNextSample * mBytesPerSample;
  if (mBytesPerSample == 1) {
    for (uint32_t i = 0; i < maxSamples; ++i) {
      uint8_t *sample = mSource + byteOffset;
      aBuffer[i] = UnsignedByteToAudioSample(*sample);
      byteOffset += mBytesPerSample;
    }
  } else if (mBytesPerSample == 2) {
    for (uint32_t i = 0; i < maxSamples; ++i) {
      int16_t *sample = reinterpret_cast<int16_t *>(mSource + byteOffset);
      aBuffer[i] = AudioSampleToFloat(*sample);
      byteOffset += mBytesPerSample;
    }
  }

  mNextSample += maxSamples;
  return frames;
}
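// Worked example (illustrative): with mChannels == 2 and mBytesPerSample == 2,
// a caller requesting aSamples == 6 gets at most 6 interleaved int16_t
// samples converted into aBuffer and a return value of 3 whole stereo frames.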
void
AudioCallbackAdapter::Decoded(const nsTArray<int16_t>& aPCM,
                              uint64_t aTimeStamp,
                              uint32_t aChannels,
                              uint32_t aRate)
{
  MOZ_ASSERT(IsOnGMPThread());

  if (aRate == 0 || aChannels == 0) {
    NS_WARNING("Invalid rate or num channels returned on GMP audio samples");
    mCallback->Error();
    return;
  }

  size_t numFrames = aPCM.Length() / aChannels;
  MOZ_ASSERT((aPCM.Length() % aChannels) == 0);
  nsAutoArrayPtr<AudioDataValue> audioData(new AudioDataValue[aPCM.Length()]);

  for (size_t i = 0; i < aPCM.Length(); ++i) {
    audioData[i] = AudioSampleToFloat(aPCM[i]);
  }

  if (mMustRecaptureAudioPosition) {
    mAudioFrameSum = 0;
    auto timestamp = UsecsToFrames(aTimeStamp, aRate);
    if (!timestamp.isValid()) {
      NS_WARNING("Invalid timestamp");
      mCallback->Error();
      return;
    }
    mAudioFrameOffset = timestamp.value();
    MOZ_ASSERT(mAudioFrameOffset >= 0);
    mMustRecaptureAudioPosition = false;
  }

  auto timestamp = FramesToUsecs(mAudioFrameOffset + mAudioFrameSum, aRate);
  if (!timestamp.isValid()) {
    NS_WARNING("Invalid timestamp on audio samples");
    mCallback->Error();
    return;
  }
  mAudioFrameSum += numFrames;

  auto duration = FramesToUsecs(numFrames, aRate);
  if (!duration.isValid()) {
    NS_WARNING("Invalid duration on audio samples");
    mCallback->Error();
    return;
  }

  nsRefPtr<AudioData> audio(new AudioData(mLastStreamOffset,
                                          timestamp.value(),
                                          duration.value(),
                                          numFrames,
                                          audioData.forget(),
                                          aChannels,
                                          aRate));
#ifdef LOG_SAMPLE_DECODE
  // The original log line referenced a nonexistent `currentLength` and
  // passed CheckedInt64s to %lld; log the frame count instead.
  LOG("Decoded audio sample! timestamp=%lld duration=%lld frames=%u",
      timestamp.value(), duration.value(), uint32_t(numFrames));
#endif

  mCallback->Output(audio);
}
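// Sketch of the time/frame helpers this snippet appears to use (hedge: the
// Gecko versions in VideoUtils.h return CheckedInt64 precisely so the
// .isValid() checks above can catch overflow; USECS_PER_S is 1,000,000).
CheckedInt64 UsecsToFrames(int64_t aUsecs, uint32_t aRate)
{
  return (CheckedInt64(aUsecs) * aRate) / USECS_PER_S;
}
CheckedInt64 FramesToUsecs(int64_t aFrames, uint32_t aRate)
{
  return (CheckedInt64(aFrames) * USECS_PER_S) / aRate;
}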
HRESULT
WMFAudioMFTManager::Output(int64_t aStreamOffset,
                           nsRefPtr<MediaData>& aOutData)
{
  aOutData = nullptr;
  RefPtr<IMFSample> sample;
  HRESULT hr;
  int typeChangeCount = 0;
  while (true) {
    hr = mDecoder->Output(&sample);
    if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT) {
      return hr;
    }
    if (hr == MF_E_TRANSFORM_STREAM_CHANGE) {
      hr = UpdateOutputType();
      NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
      // Catch infinite loops, but some decoders perform at least 2 stream
      // changes on consecutive calls, so be permissive.
      // 100 is arbitrarily > 2.
      NS_ENSURE_TRUE(typeChangeCount < 100, MF_E_TRANSFORM_STREAM_CHANGE);
      ++typeChangeCount;
      continue;
    }
    break;
  }

  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  RefPtr<IMFMediaBuffer> buffer;
  hr = sample->ConvertToContiguousBuffer(byRef(buffer));
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  BYTE* data = nullptr;
  // Note: *data will be owned by the IMFMediaBuffer, we don't need to free it.
  DWORD maxLength = 0, currentLength = 0;
  hr = buffer->Lock(&data, &maxLength, &currentLength);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  // Sometimes when starting decoding, the AAC decoder gives us samples
  // with a negative timestamp. AAC usually has preroll (or encoder delay)
  // encoded into its bitstream, but the amount encoded in the stream is
  // variable, and it is not signalled in-bitstream. There is sometimes
  // signalling in the MP4 container of what the preroll amount is, but it's
  // inconsistent. It looks like WMF's AAC decoder may take this into
  // account, so strip off samples with a negative timestamp to get us
  // to a 0-timestamp start. This seems to maintain A/V sync, so we can run
  // with this until someone complains...

  // We calculate the timestamp and the duration based on the number of audio
  // frames we've already played. We don't trust the timestamp stored on the
  // IMFSample, as sometimes it's wrong, possibly due to buggy encoders?

  // If this sample block comes after a discontinuity (i.e. a gap or seek)
  // reset the frame counters, and capture the timestamp. Future timestamps
  // will be offset from this block's timestamp.
  UINT32 discontinuity = false;
  sample->GetUINT32(MFSampleExtension_Discontinuity, &discontinuity);
  if (mMustRecaptureAudioPosition || discontinuity) {
    // Update the output type, in case this segment has a different
    // rate. This also triggers on the first sample, which can have a
    // different rate than is advertised in the container, and sometimes we
    // don't get a MF_E_TRANSFORM_STREAM_CHANGE when the rate changes.
    hr = UpdateOutputType();
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    mAudioFrameSum = 0;
    LONGLONG timestampHns = 0;
    hr = sample->GetSampleTime(&timestampHns);
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    mAudioTimeOffset = media::TimeUnit::FromMicroseconds(timestampHns / 10);
    mMustRecaptureAudioPosition = false;
  }

  // We can assume PCM 16 output.
  int32_t numSamples = currentLength / 2;
  int32_t numFrames = numSamples / mAudioChannels;
  MOZ_ASSERT(numFrames >= 0);
  MOZ_ASSERT(numSamples >= 0);
  if (numFrames == 0) {
    // All data from this chunk stripped, loop back and try to output the next
    // frame, if possible.
    return S_OK;
  }

  nsAutoArrayPtr<AudioDataValue> audioData(new AudioDataValue[numSamples]);

  int16_t* pcm = (int16_t*)data;
  for (int32_t i = 0; i < numSamples; ++i) {
    audioData[i] = AudioSampleToFloat(pcm[i]);
  }

  buffer->Unlock();

  media::TimeUnit timestamp =
    mAudioTimeOffset + FramesToTimeUnit(mAudioFrameSum, mAudioRate);
  NS_ENSURE_TRUE(timestamp.IsValid(), E_FAIL);

  mAudioFrameSum += numFrames;

  media::TimeUnit duration = FramesToTimeUnit(numFrames, mAudioRate);
  NS_ENSURE_TRUE(duration.IsValid(), E_FAIL);

  aOutData = new AudioData(aStreamOffset,
                           timestamp.ToMicroseconds(),
                           duration.ToMicroseconds(),
                           numFrames,
                           audioData.forget(),
                           mAudioChannels,
                           mAudioRate);

#ifdef LOG_SAMPLE_DECODE
  LOG("Decoded audio sample! timestamp=%lld duration=%lld currentLength=%u",
      timestamp.ToMicroseconds(), duration.ToMicroseconds(), currentLength);
#endif

  return S_OK;
}
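// Worked example (editorial): an 8192-byte PCM16 chunk yields
// 8192 / 2 = 4096 samples; with mAudioChannels == 2 that is 2048 frames,
// and 2048 frames at 48000 Hz come to 2048 * 1000000 / 48000, roughly
// 42667 microseconds of audio.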
template <>
inline float
SignedShortToAudioSample<float>(int16_t aValue)
{
  return AudioSampleToFloat(aValue);
}
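// The template's other specialization, sketched under the assumption that
// the int16_t flavor is a pass-through (as in Gecko's AudioSampleFormat.h):
template <>
inline int16_t
SignedShortToAudioSample<int16_t>(int16_t aValue)
{
  return aValue;
}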
HRESULT
WMFAudioMFTManager::Output(int64_t aStreamOffset,
                           nsAutoPtr<MediaData>& aOutData)
{
  aOutData = nullptr;
  RefPtr<IMFSample> sample;
  HRESULT hr;
  while (true) {
    hr = mDecoder->Output(&sample);
    if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT) {
      return hr;
    }
    if (hr == MF_E_TRANSFORM_STREAM_CHANGE) {
      hr = UpdateOutputType();
      NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
      continue;
    }
    break;
  }

  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  RefPtr<IMFMediaBuffer> buffer;
  hr = sample->ConvertToContiguousBuffer(byRef(buffer));
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  BYTE* data = nullptr;
  // Note: *data will be owned by the IMFMediaBuffer, we don't need to free it.
  DWORD maxLength = 0, currentLength = 0;
  hr = buffer->Lock(&data, &maxLength, &currentLength);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  // Sometimes when starting decoding, the AAC decoder gives us samples
  // with a negative timestamp. AAC usually has preroll (or encoder delay)
  // encoded into its bitstream, but the amount encoded in the stream is
  // variable, and it is not signalled in-bitstream. There is sometimes
  // signalling in the MP4 container of what the preroll amount is, but it's
  // inconsistent. It looks like WMF's AAC decoder may take this into
  // account, so strip off samples with a negative timestamp to get us
  // to a 0-timestamp start. This seems to maintain A/V sync, so we can run
  // with this until someone complains...

  // We calculate the timestamp and the duration based on the number of audio
  // frames we've already played. We don't trust the timestamp stored on the
  // IMFSample, as sometimes it's wrong, possibly due to buggy encoders?

  // If this sample block comes after a discontinuity (i.e. a gap or seek)
  // reset the frame counters, and capture the timestamp. Future timestamps
  // will be offset from this block's timestamp.
  UINT32 discontinuity = false;
  int32_t numFramesToStrip = 0;
  sample->GetUINT32(MFSampleExtension_Discontinuity, &discontinuity);
  if (mMustRecaptureAudioPosition || discontinuity) {
    // Update the output type, in case this segment has a different
    // rate. This also triggers on the first sample, which can have a
    // different rate than is advertised in the container, and sometimes we
    // don't get a MF_E_TRANSFORM_STREAM_CHANGE when the rate changes.
    hr = UpdateOutputType();
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    mAudioFrameSum = 0;
    LONGLONG timestampHns = 0;
    hr = sample->GetSampleTime(&timestampHns);
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    hr = HNsToFrames(timestampHns, mAudioRate, &mAudioFrameOffset);
    NS_ENSURE_TRUE(SUCCEEDED(hr), hr);
    if (mAudioFrameOffset < 0) {
      // First sample has a negative timestamp. Strip off the samples until
      // we reach positive territory.
      numFramesToStrip = -mAudioFrameOffset;
      mAudioFrameOffset = 0;
    }
    mMustRecaptureAudioPosition = false;
  }
  MOZ_ASSERT(numFramesToStrip >= 0);
  int32_t numSamples = currentLength / mAudioBytesPerSample;
  int32_t numFrames = numSamples / mAudioChannels;
  int32_t offset = std::min<int32_t>(numFramesToStrip, numFrames);
  numFrames -= offset;
  numSamples -= offset * mAudioChannels;
  MOZ_ASSERT(numFrames >= 0);
  MOZ_ASSERT(numSamples >= 0);
  if (numFrames == 0) {
    // All data from this chunk stripped, loop back and try to output the next
    // frame, if possible.
    return S_OK;
  }

  nsAutoArrayPtr<AudioDataValue> audioData(new AudioDataValue[numSamples]);

  // Just assume PCM output for now...
  MOZ_ASSERT(mAudioBytesPerSample == 2);
  int16_t* pcm = ((int16_t*)data) + (offset * mAudioChannels);
  MOZ_ASSERT(pcm >= (int16_t*)data);
  MOZ_ASSERT(pcm <= (int16_t*)(data + currentLength));
  MOZ_ASSERT(pcm + numSamples <= (int16_t*)(data + currentLength));
  for (int32_t i = 0; i < numSamples; ++i) {
    audioData[i] = AudioSampleToFloat(pcm[i]);
  }

  buffer->Unlock();

  int64_t timestamp;
  hr = FramesToUsecs(mAudioFrameOffset + mAudioFrameSum, mAudioRate, &timestamp);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  mAudioFrameSum += numFrames;

  int64_t duration;
  hr = FramesToUsecs(numFrames, mAudioRate, &duration);
  NS_ENSURE_TRUE(SUCCEEDED(hr), hr);

  aOutData = new AudioData(aStreamOffset,
                           timestamp,
                           duration,
                           numFrames,
                           audioData.forget(),
                           mAudioChannels,
                           mAudioRate);

#ifdef LOG_SAMPLE_DECODE
  LOG("Decoded audio sample! timestamp=%lld duration=%lld currentLength=%u",
      timestamp, duration, currentLength);
#endif

  return S_OK;
}
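// Sketch of the 100ns-tick-to-frames conversion used above (hedge: the
// signature matches the call site, but the body is an editorial guess at the
// intended math, frames = hns * rate / 10^7, with checked arithmetic):
static HRESULT
HNsToFrames(int64_t aHNs, uint32_t aRate, int64_t* aOutFrames)
{
  const int64_t HNS_PER_S = 10000000; // 100ns ticks per second
  CheckedInt64 frames = (CheckedInt64(aHNs) * aRate) / HNS_PER_S;
  NS_ENSURE_TRUE(frames.isValid(), E_FAIL);
  *aOutFrames = frames.value();
  return S_OK;
}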
void
AudioNodeExternalInputStream::TrackMapEntry::ResampleInputData(AudioSegment* aSegment)
{
  AudioSegment::ChunkIterator ci(*aSegment);
  while (!ci.IsEnded()) {
    const AudioChunk& chunk = *ci;
    nsAutoTArray<const void*,2> channels;
    if (chunk.GetDuration() > UINT32_MAX) {
      // This will cause us to OOM or overflow below. So let's just bail.
      NS_ERROR("Chunk duration out of bounds");
      return;
    }
    uint32_t duration = uint32_t(chunk.GetDuration());

    if (chunk.IsNull()) {
      nsAutoTArray<AudioDataValue,1024> silence;
      silence.SetLength(duration);
      PodZero(silence.Elements(), silence.Length());
      channels.SetLength(mResamplerChannelCount);
      for (uint32_t i = 0; i < channels.Length(); ++i) {
        channels[i] = silence.Elements();
      }
      ResampleChannels(channels, duration, AUDIO_OUTPUT_FORMAT, 0.0f);
    } else if (chunk.mChannelData.Length() == mResamplerChannelCount) {
      // Common case, since mResamplerChannelCount is set to the first chunk's
      // number of channels.
      channels.AppendElements(chunk.mChannelData);
      ResampleChannels(channels, duration, chunk.mBufferFormat, chunk.mVolume);
    } else {
      // Uncommon case. Since downmixing requires channels to be floats,
      // convert everything to floats now.
      uint32_t upChannels = GetAudioChannelsSuperset(chunk.mChannelData.Length(),
                                                     mResamplerChannelCount);
      nsTArray<float> buffer;
      if (chunk.mBufferFormat == AUDIO_FORMAT_FLOAT32) {
        channels.AppendElements(chunk.mChannelData);
      } else {
        NS_ASSERTION(chunk.mBufferFormat == AUDIO_FORMAT_S16, "Unknown format");
        if (duration > UINT32_MAX/chunk.mChannelData.Length()) {
          NS_ERROR("Chunk duration out of bounds");
          return;
        }
        buffer.SetLength(chunk.mChannelData.Length()*duration);
        for (uint32_t i = 0; i < chunk.mChannelData.Length(); ++i) {
          const int16_t* samples = static_cast<const int16_t*>(chunk.mChannelData[i]);
          float* converted = &buffer[i*duration];
          for (uint32_t j = 0; j < duration; ++j) {
            converted[j] = AudioSampleToFloat(samples[j]);
          }
          channels.AppendElement(converted);
        }
      }
      nsTArray<float> zeroes;
      if (channels.Length() < upChannels) {
        zeroes.SetLength(duration);
        PodZero(zeroes.Elements(), zeroes.Length());
        AudioChannelsUpMix(&channels, upChannels, zeroes.Elements());
      }
      if (channels.Length() == mResamplerChannelCount) {
        ResampleChannels(channels, duration, AUDIO_FORMAT_FLOAT32, chunk.mVolume);
      } else {
        nsTArray<float> output;
        if (duration > UINT32_MAX/mResamplerChannelCount) {
          NS_ERROR("Chunk duration out of bounds");
          return;
        }
        output.SetLength(duration*mResamplerChannelCount);
        nsAutoTArray<float*,2> outputPtrs;
        nsAutoTArray<const void*,2> outputPtrsConst;
        for (uint32_t i = 0; i < mResamplerChannelCount; ++i) {
          outputPtrs.AppendElement(output.Elements() + i*duration);
          outputPtrsConst.AppendElement(outputPtrs[i]);
        }
        AudioChannelsDownMix(channels, outputPtrs.Elements(),
                             outputPtrs.Length(), duration);
        ResampleChannels(outputPtrsConst, duration, AUDIO_FORMAT_FLOAT32,
                         chunk.mVolume);
      }
    }
    ci.Next();
  }
}
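// Flow summary (editorial): every chunk ends up as mResamplerChannelCount
// planar channels before ResampleChannels runs. Null chunks become shared
// silence, chunks whose channel count already matches pass straight through,
// and mismatched chunks are converted to float, up-mixed with a shared zero
// channel if too narrow, or down-mixed into scratch buffers if too wide.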