// Multiplies one audio block by a per-sample scale block:
//   aOutput[k] = aInput[k] * aScale[k]   for k in [0, WEBAUDIO_BLOCK_SIZE).
// All three buffers are accessed with aligned SSE loads/stores and are
// therefore asserted to be 16-byte aligned. Each pass of the loop handles a
// full group of 16 floats.
void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
                                        float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);

  for (unsigned offset = 0; offset < WEBAUDIO_BLOCK_SIZE; offset += 16) {
    const float* gains = aScale + offset;
    const float* samples = aInput + offset;
    float* dest = aOutput + offset;

    // Fetch every operand of this group before anything is written back.
    const __m128 gainA = _mm_load_ps(gains);
    const __m128 gainB = _mm_load_ps(gains + 4);
    const __m128 gainC = _mm_load_ps(gains + 8);
    const __m128 gainD = _mm_load_ps(gains + 12);

    const __m128 sampleA = _mm_load_ps(samples);
    const __m128 sampleB = _mm_load_ps(samples + 4);
    const __m128 sampleC = _mm_load_ps(samples + 8);
    const __m128 sampleD = _mm_load_ps(samples + 12);

    const __m128 productA = _mm_mul_ps(sampleA, gainA);
    const __m128 productB = _mm_mul_ps(sampleB, gainB);
    const __m128 productC = _mm_mul_ps(sampleC, gainC);
    const __m128 productD = _mm_mul_ps(sampleD, gainD);

    _mm_store_ps(dest, productA);
    _mm_store_ps(dest + 4, productB);
    _mm_store_ps(dest + 8, productC);
    _mm_store_ps(dest + 12, productD);
  }
}
// Element-wise product of two buffers of interleaved complex numbers.
// Each buffer stores (real, imag) float pairs, so the loop walks 2 * aSize
// floats. For every pair:
//   out.re = in.re * scale.re - in.im * scale.im
//   out.im = in.re * scale.im + in.im * scale.re
// All buffers are accessed with aligned SSE loads/stores; eight complex
// values (16 floats) are handled per loop pass.
void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
                               float* aOutput, uint32_t aSize) {
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aScale);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  for (unsigned pos = 0; pos < aSize * 2; pos += 16) {
    // Left operand: de-interleave into separate real / imaginary vectors.
    // Even lanes carry the real parts, odd lanes the imaginary parts.
    const __m128 lhs0 = _mm_load_ps(&aInput[pos]);
    const __m128 lhs1 = _mm_load_ps(&aInput[pos + 4]);
    const __m128 lhs2 = _mm_load_ps(&aInput[pos + 8]);
    const __m128 lhs3 = _mm_load_ps(&aInput[pos + 12]);

    const __m128 lhsReLo = _mm_shuffle_ps(lhs0, lhs1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 lhsImLo = _mm_shuffle_ps(lhs0, lhs1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 lhsReHi = _mm_shuffle_ps(lhs2, lhs3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 lhsImHi = _mm_shuffle_ps(lhs2, lhs3, _MM_SHUFFLE(3, 1, 3, 1));

    // Right operand, split the same way.
    const __m128 rhs0 = _mm_load_ps(&aScale[pos]);
    const __m128 rhs1 = _mm_load_ps(&aScale[pos + 4]);
    const __m128 rhs2 = _mm_load_ps(&aScale[pos + 8]);
    const __m128 rhs3 = _mm_load_ps(&aScale[pos + 12]);

    const __m128 rhsReLo = _mm_shuffle_ps(rhs0, rhs1, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 rhsImLo = _mm_shuffle_ps(rhs0, rhs1, _MM_SHUFFLE(3, 1, 3, 1));
    const __m128 rhsReHi = _mm_shuffle_ps(rhs2, rhs3, _MM_SHUFFLE(2, 0, 2, 0));
    const __m128 rhsImHi = _mm_shuffle_ps(rhs2, rhs3, _MM_SHUFFLE(3, 1, 3, 1));

    // Complex product for the first four and the last four values.
    const __m128 prodReLo = _mm_sub_ps(_mm_mul_ps(lhsReLo, rhsReLo),
                                       _mm_mul_ps(lhsImLo, rhsImLo));
    const __m128 prodImLo = _mm_add_ps(_mm_mul_ps(lhsReLo, rhsImLo),
                                       _mm_mul_ps(lhsImLo, rhsReLo));
    const __m128 prodReHi = _mm_sub_ps(_mm_mul_ps(lhsReHi, rhsReHi),
                                       _mm_mul_ps(lhsImHi, rhsImHi));
    const __m128 prodImHi = _mm_add_ps(_mm_mul_ps(lhsReHi, rhsImHi),
                                       _mm_mul_ps(lhsImHi, rhsReHi));

    // Re-interleave (real, imag) pairs before writing them out.
    const __m128 packed0 = _mm_unpacklo_ps(prodReLo, prodImLo);
    const __m128 packed1 = _mm_unpackhi_ps(prodReLo, prodImLo);
    const __m128 packed2 = _mm_unpacklo_ps(prodReHi, prodImHi);
    const __m128 packed3 = _mm_unpackhi_ps(prodReHi, prodImHi);

    _mm_store_ps(&aOutput[pos], packed0);
    _mm_store_ps(&aOutput[pos + 4], packed1);
    _mm_store_ps(&aOutput[pos + 8], packed2);
    _mm_store_ps(&aOutput[pos + 12], packed3);
  }
}
// Scales a buffer in place by a per-sample gain buffer:
//   aBlock[k] = aBlock[k] * aScale[k]   for k in [0, aSize).
// aBlock: samples to scale; overwritten with the result.
// aScale: one gain value per sample; only read.
// aSize:  number of floats to process, checked with ASSERT_MULTIPLE16; the
//         loop consumes a full group of 16 floats per pass.
// Both buffers are accessed with aligned SSE instructions, so both must be
// 16-byte aligned.
void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
  __m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
      vin2, vin3;

  ASSERT_ALIGNED16(aBlock);
  // aScale is read with _mm_load_ps below, which has the same 16-byte
  // alignment requirement as the accesses to aBlock.
  ASSERT_ALIGNED16(aScale);
  ASSERT_MULTIPLE16(aSize);

  for (unsigned i = 0; i < aSize; i += 16) {
    vin0 = _mm_load_ps(&aBlock[i]);
    vin1 = _mm_load_ps(&aBlock[i + 4]);
    vin2 = _mm_load_ps(&aBlock[i + 8]);
    vin3 = _mm_load_ps(&aBlock[i + 12]);
    vgain0 = _mm_load_ps(&aScale[i]);
    vgain1 = _mm_load_ps(&aScale[i + 4]);
    vgain2 = _mm_load_ps(&aScale[i + 8]);
    vgain3 = _mm_load_ps(&aScale[i + 12]);
    vout0 = _mm_mul_ps(vin0, vgain0);
    vout1 = _mm_mul_ps(vin1, vgain1);
    vout2 = _mm_mul_ps(vin2, vgain2);
    vout3 = _mm_mul_ps(vin3, vgain3);
    _mm_store_ps(&aBlock[i], vout0);
    _mm_store_ps(&aBlock[i + 4], vout1);
    _mm_store_ps(&aBlock[i + 8], vout2);
    _mm_store_ps(&aBlock[i + 12], vout3);
  }
}
// Builds in aTmpChunk the combined input for port aPortIndex from the last
// output blocks of the AudioNodeStreams connected to that port.
//  - No usable input (or a single zero-channel block): aTmpChunk is set null.
//  - Exactly one input whose channel count already equals the computed
//    output channel count: that block is assigned to aTmpChunk directly.
//  - Otherwise channels are allocated and each input block is passed to
//    AccumulateInputChunk in connection order.
void
AudioNodeStream::ObtainInputBlock(AudioBlock& aTmpChunk,
                                  uint32_t aPortIndex)
{
  AutoTArray<const AudioBlock*,250> connectedBlocks;
  uint32_t channelCount = 1;

  const uint32_t portCount = mInputs.Length();
  for (uint32_t portIdx = 0; portIdx < portCount; ++portIdx) {
    // Skip inputs that are wired to some other port.
    if (mInputs[portIdx]->InputNumber() != aPortIndex) {
      continue;
    }

    MediaStream* sourceStream = mInputs[portIdx]->GetSource();
    AudioNodeStream* sourceNode = static_cast<AudioNodeStream*>(sourceStream);
    MOZ_ASSERT(sourceNode == sourceStream->AsAudioNodeStream());
    if (sourceNode->IsAudioParamStream()) {
      continue;
    }

    const AudioBlock* block =
      &sourceNode->mLastChunks[mInputs[portIdx]->OutputNumber()];
    MOZ_ASSERT(block);
    // Null blocks and blocks without channel data contribute nothing.
    if (block->IsNull() || block->mChannelData.IsEmpty()) {
      continue;
    }

    connectedBlocks.AppendElement(block);
    channelCount =
      GetAudioChannelsSuperset(channelCount, block->ChannelCount());
  }

  channelCount = ComputedNumberOfChannels(channelCount);

  const uint32_t blockCount = connectedBlocks.Length();
  if (blockCount == 0) {
    aTmpChunk.SetNull(WEBAUDIO_BLOCK_SIZE);
    return;
  }

  if (blockCount == 1) {
    const AudioBlock& onlyBlock = *connectedBlocks[0];
    if (onlyBlock.ChannelCount() == 0) {
      aTmpChunk.SetNull(WEBAUDIO_BLOCK_SIZE);
      return;
    }
    // A lone input with the right channel count can be used as-is.
    if (onlyBlock.ChannelCount() == channelCount) {
      aTmpChunk = onlyBlock;
      return;
    }
  }

  if (channelCount == 0) {
    aTmpChunk.SetNull(WEBAUDIO_BLOCK_SIZE);
    return;
  }

  aTmpChunk.AllocateChannels(channelCount);
  DownmixBufferType downmixBuffer;
  ASSERT_ALIGNED16(downmixBuffer.Elements());

  for (uint32_t blockIdx = 0; blockIdx < blockCount; ++blockIdx) {
    AccumulateInputChunk(blockIdx, *connectedBlocks[blockIdx], &aTmpChunk,
                         &downmixBuffer);
  }
}
// Accumulates a scaled copy of aInput into aOutput:
//   aOutput[k] = aOutput[k] + aInput[k] * aScale   for k in [0, aSize).
// Both buffers are accessed with aligned SSE loads/stores and are asserted
// to be 16-byte aligned; aSize is checked with ASSERT_MULTIPLE16 and the
// loop handles a full group of 16 floats per pass.
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
                                 float* aOutput, uint32_t aSize) {
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  // Broadcast the scalar gain into all four lanes once, outside the loop.
  const __m128 gain = _mm_load1_ps(&aScale);

  for (unsigned offset = 0; offset < aSize; offset += 16) {
    const float* src = aInput + offset;
    float* dst = aOutput + offset;

    const __m128 srcA = _mm_load_ps(src);
    const __m128 srcB = _mm_load_ps(src + 4);
    const __m128 srcC = _mm_load_ps(src + 8);
    const __m128 srcD = _mm_load_ps(src + 12);

    const __m128 scaledA = _mm_mul_ps(srcA, gain);
    const __m128 scaledB = _mm_mul_ps(srcB, gain);
    const __m128 scaledC = _mm_mul_ps(srcC, gain);
    const __m128 scaledD = _mm_mul_ps(srcD, gain);

    // Current contents of the destination, read before any store.
    const __m128 dstA = _mm_load_ps(dst);
    const __m128 dstB = _mm_load_ps(dst + 4);
    const __m128 dstC = _mm_load_ps(dst + 8);
    const __m128 dstD = _mm_load_ps(dst + 12);

    const __m128 sumA = _mm_add_ps(dstA, scaledA);
    const __m128 sumB = _mm_add_ps(dstB, scaledB);
    const __m128 sumC = _mm_add_ps(dstC, scaledC);
    const __m128 sumD = _mm_add_ps(dstD, scaledD);

    _mm_store_ps(dst, sumA);
    _mm_store_ps(dst + 4, sumB);
    _mm_store_ps(dst + 8, sumC);
    _mm_store_ps(dst + 12, sumD);
  }
}
// Copies one audio block while applying a constant gain:
//   aOutput[k] = aInput[k] * aScale   for k in [0, WEBAUDIO_BLOCK_SIZE).
// Both buffers are accessed with aligned SSE loads/stores and are asserted
// to be 16-byte aligned. Each loop pass handles a full group of 16 floats.
void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
                                        float* aOutput) {
  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);

  // Same gain in every lane.
  const __m128 gain = _mm_load1_ps(&aScale);

  for (unsigned offset = 0; offset < WEBAUDIO_BLOCK_SIZE; offset += 16) {
    const float* src = aInput + offset;
    float* dst = aOutput + offset;

    // All four loads are issued before the first store.
    const __m128 srcA = _mm_load_ps(src);
    const __m128 srcB = _mm_load_ps(src + 4);
    const __m128 srcC = _mm_load_ps(src + 8);
    const __m128 srcD = _mm_load_ps(src + 12);

    const __m128 scaledA = _mm_mul_ps(srcA, gain);
    const __m128 scaledB = _mm_mul_ps(srcB, gain);
    const __m128 scaledC = _mm_mul_ps(srcC, gain);
    const __m128 scaledD = _mm_mul_ps(srcD, gain);

    _mm_store_ps(dst, scaledA);
    _mm_store_ps(dst + 4, scaledB);
    _mm_store_ps(dst + 8, scaledC);
    _mm_store_ps(dst + 12, scaledD);
  }
}
// Returns the sum of aInput[k] * aInput[k] for k in [0, aLength).
// aInput:  samples to square and sum; accessed with aligned SSE loads, so it
//          must be 16-byte aligned.
// aLength: number of floats, checked with ASSERT_MULTIPLE16; the loop
//          consumes a full group of 16 floats per pass.
// Four independent vector accumulators are kept and combined at the end, so
// the summation order differs from a plain left-to-right scalar loop.
float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
  unsigned i;
  __m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
  float out[4];

  ASSERT_ALIGNED16(aInput);
  ASSERT_MULTIPLE16(aLength);

  acc0 = _mm_setzero_ps();
  acc1 = _mm_setzero_ps();
  acc2 = _mm_setzero_ps();
  acc3 = _mm_setzero_ps();

  for (i = 0; i < aLength; i += 16) {
    in0 = _mm_load_ps(&aInput[i]);
    in1 = _mm_load_ps(&aInput[i + 4]);
    in2 = _mm_load_ps(&aInput[i + 8]);
    in3 = _mm_load_ps(&aInput[i + 12]);

    in0 = _mm_mul_ps(in0, in0);
    in1 = _mm_mul_ps(in1, in1);
    in2 = _mm_mul_ps(in2, in2);
    in3 = _mm_mul_ps(in3, in3);

    acc0 = _mm_add_ps(acc0, in0);
    acc1 = _mm_add_ps(acc1, in1);
    acc2 = _mm_add_ps(acc2, in2);
    acc3 = _mm_add_ps(acc3, in3);
  }

  // Fold the four accumulators into one vector.
  acc0 = _mm_add_ps(acc0, acc1);
  acc0 = _mm_add_ps(acc0, acc2);
  acc0 = _mm_add_ps(acc0, acc3);

  // `out` is a plain stack array with no guaranteed 16-byte alignment, so it
  // must be written with the unaligned store; the stored values are the same.
  _mm_storeu_ps(out, acc0);

  // Horizontal sum of the four lanes.
  return out[0] + out[1] + out[2] + out[3];
}
// Produces this stream's single output block, mLastChunks[0], for the graph
// time range [aFrom, aTo).
//
// For every track of the single input's source stream that the input port
// passes through, a temporary AudioSegment is built for the range (null data
// where the input is blocked or the track has ended). Each segment is then
// converted to an AudioBlock and every non-null block is handed to
// AccumulateInputChunk targeting mLastChunks[0]. If nothing is accumulated,
// or the stream is disabled / has no input / is in pass-through mode, the
// output block is set to null.
//
// aFlags is not read by this implementation.
void
AudioNodeExternalInputStream::ProcessInput(GraphTime aFrom, GraphTime aTo,
                                           uint32_t aFlags)
{
  // According to spec, number of outputs is always 1.
  MOZ_ASSERT(mLastChunks.Length() == 1);

  // GC stuff can result in our input stream being destroyed before this stream.
  // Handle that.
  if (!IsEnabled() || mInputs.IsEmpty() || mPassThrough) {
    mLastChunks[0].SetNull(WEBAUDIO_BLOCK_SIZE);
    return;
  }

  MOZ_ASSERT(mInputs.Length() == 1);

  MediaStream* source = mInputs[0]->GetSource();
  // One gathered segment per contributing track.
  AutoTArray<AudioSegment,1> audioSegments;
  // Channel count folded over every chunk of every gathered segment; stays 0
  // when no track contributes.
  uint32_t inputChannels = 0;
  for (StreamTracks::TrackIter tracks(source->mTracks);
       !tracks.IsEnded(); tracks.Next()) {
    const StreamTracks::Track& inputTrack = *tracks;
    // Skip tracks the input port does not forward.
    if (!mInputs[0]->PassTrackThrough(tracks->GetID())) {
      continue;
    }

    // Video is unexpected here: assert in debug builds, skip otherwise.
    if (inputTrack.GetSegment()->GetType() == MediaSegment::VIDEO) {
      MOZ_ASSERT(false, "AudioNodeExternalInputStream shouldn't have video tracks");
      continue;
    }

    const AudioSegment& inputSegment =
        *static_cast<AudioSegment*>(inputTrack.GetSegment());
    if (inputSegment.IsNull()) {
      continue;
    }

    // Gather this track's data for [aFrom, aTo), one input interval at a
    // time, into a fresh segment appended to audioSegments.
    AudioSegment& segment = *audioSegments.AppendElement();
    GraphTime next;
    for (GraphTime t = aFrom; t < aTo; t = next) {
      MediaInputPort::InputInterval interval = mInputs[0]->GetNextInputInterval(t);
      // Never look beyond the end of the range being processed.
      interval.mEnd = std::min(interval.mEnd, aTo);
      // An empty interval ends the gathering for this track.
      if (interval.mStart >= interval.mEnd)
        break;
      next = interval.mEnd;

      // We know this stream does not block during the processing interval ---
      // we're not finished, we don't underrun, and we're not suspended.
      StreamTime outputStart = GraphTimeToStreamTime(interval.mStart);
      StreamTime outputEnd = GraphTimeToStreamTime(interval.mEnd);
      StreamTime ticks = outputEnd - outputStart;

      if (interval.mInputIsBlocked) {
        // Blocked input: fill the whole interval with null data.
        segment.AppendNullData(ticks);
      } else {
        // The input stream is not blocked in this interval, so no need to call
        // GraphTimeToStreamTimeWithBlocking.
        // Both ends are clamped to the duration of the track's segment.
        StreamTime inputStart =
          std::min(inputSegment.GetDuration(),
                   source->GraphTimeToStreamTime(interval.mStart));
        StreamTime inputEnd =
          std::min(inputSegment.GetDuration(),
                   source->GraphTimeToStreamTime(interval.mEnd));

        segment.AppendSlice(inputSegment, inputStart, inputEnd);
        // Pad if we're looking past the end of the track
        segment.AppendNullData(ticks - (inputEnd - inputStart));
      }
    }

    // Fold the channel count of every gathered chunk into inputChannels.
    for (AudioSegment::ChunkIterator iter(segment); !iter.IsEnded(); iter.Next()) {
      inputChannels = GetAudioChannelsSuperset(inputChannels, iter->ChannelCount());
    }
  }

  // Number of non-null blocks accumulated into the output so far.
  uint32_t accumulateIndex = 0;
  if (inputChannels) {
    DownmixBufferType downmixBuffer;
    ASSERT_ALIGNED16(downmixBuffer.Elements());
    for (uint32_t i = 0; i < audioSegments.Length(); ++i) {
      AudioBlock tmpChunk;
      ConvertSegmentToAudioBlock(&audioSegments[i], &tmpChunk, inputChannels);
      if (!tmpChunk.IsNull()) {
        // The output channels are allocated only once a first non-null
        // block is available.
        if (accumulateIndex == 0) {
          mLastChunks[0].AllocateChannels(inputChannels);
        }
        AccumulateInputChunk(accumulateIndex, tmpChunk, &mLastChunks[0], &downmixBuffer);
        accumulateIndex++;
      }
    }
  }
  // Nothing was accumulated: the output block is set to null.
  if (accumulateIndex == 0) {
    mLastChunks[0].SetNull(WEBAUDIO_BLOCK_SIZE);
  }
}
// Pans a stereo block into a stereo block over WEBAUDIO_BLOCK_SIZE samples.
//   aIsOnTheLeft:   aOutputL = aInputL + aInputR * aGainL
//                   aOutputR = aInputR * aGainR
//   otherwise:      aOutputL = aInputL * aGainL
//                   aOutputR = aInputR + aInputL * aGainR
// All four buffers are accessed with aligned SSE loads/stores and are
// asserted to be 16-byte aligned. Eight samples per channel are handled in
// each loop pass, and both inputs are loaded before any output is stored.
void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                     float aGainL, float aGainR,
                                     bool aIsOnTheLeft,
                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED16(aInputL);
  ASSERT_ALIGNED16(aInputR);
  ASSERT_ALIGNED16(aOutputL);
  ASSERT_ALIGNED16(aOutputR);

  // Broadcast each scalar gain into all four lanes.
  const __m128 gainLeft = _mm_load1_ps(&aGainL);
  const __m128 gainRight = _mm_load1_ps(&aGainR);

  if (!aIsOnTheLeft) {
    for (unsigned pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 8) {
      const __m128 leftLo = _mm_load_ps(&aInputL[pos]);
      const __m128 rightLo = _mm_load_ps(&aInputR[pos]);
      const __m128 leftHi = _mm_load_ps(&aInputL[pos + 4]);
      const __m128 rightHi = _mm_load_ps(&aInputR[pos + 4]);

      // Left output: the left input attenuated by gainL.
      const __m128 outLeftLo = _mm_mul_ps(leftLo, gainLeft);
      const __m128 outLeftHi = _mm_mul_ps(leftHi, gainLeft);
      _mm_store_ps(&aOutputL[pos], outLeftLo);
      _mm_store_ps(&aOutputL[pos + 4], outLeftHi);

      // Right output: the right input plus the left input scaled by gainR.
      const __m128 bleedLo = _mm_mul_ps(leftLo, gainRight);
      const __m128 bleedHi = _mm_mul_ps(leftHi, gainRight);
      const __m128 outRightLo = _mm_add_ps(bleedLo, rightLo);
      const __m128 outRightHi = _mm_add_ps(bleedHi, rightHi);
      _mm_store_ps(&aOutputR[pos], outRightLo);
      _mm_store_ps(&aOutputR[pos + 4], outRightHi);
    }
    return;
  }

  for (unsigned pos = 0; pos < WEBAUDIO_BLOCK_SIZE; pos += 8) {
    const __m128 leftLo = _mm_load_ps(&aInputL[pos]);
    const __m128 rightLo = _mm_load_ps(&aInputR[pos]);
    const __m128 leftHi = _mm_load_ps(&aInputL[pos + 4]);
    const __m128 rightHi = _mm_load_ps(&aInputR[pos + 4]);

    // Left output: the left input plus the right input scaled by gainL.
    const __m128 bleedLo = _mm_mul_ps(rightLo, gainLeft);
    const __m128 bleedHi = _mm_mul_ps(rightHi, gainLeft);
    const __m128 outLeftLo = _mm_add_ps(bleedLo, leftLo);
    const __m128 outLeftHi = _mm_add_ps(bleedHi, leftHi);
    _mm_store_ps(&aOutputL[pos], outLeftLo);
    _mm_store_ps(&aOutputL[pos + 4], outLeftHi);

    // Right output: the right input attenuated by gainR.
    const __m128 outRightLo = _mm_mul_ps(rightLo, gainRight);
    const __m128 outRightHi = _mm_mul_ps(rightHi, gainRight);
    _mm_store_ps(&aOutputR[pos], outRightLo);
    _mm_store_ps(&aOutputR[pos + 4], outRightHi);
  }
}