Exemplo n.º 1
0
StreamMinibatchPtr SequencePacker::PackStreamMinibatch(const std::vector<SequenceDataPtr>& sequences, size_t streamId)
{
    // Create sequence info for each sequences that we have got from the transformer.

    std::vector<MBLayout::SequenceInfo> inputSequences;
    for (size_t index = 0; index < sequences.size(); ++index)
    {
        MBLayout::SequenceInfo info;

        // In each minibatch sequence ids should be unique.
        // They have to match between different input streams in the same minibatch.
        // We are using sequence index in the set of received sequences.
        // TODO: should we use m_key as sequence id and pass it with data?
        info.seqId = index;

        info.tBegin = 0;
        info.tEnd = sequences[index]->m_numberOfSamples;
        inputSequences.push_back(info);
    }

    std::vector<std::pair<size_t, size_t>> placement;
    std::vector<size_t> rowAllocations;

    // Creating the minibatch layout.
    MBLayoutPtr layout = std::make_shared<MBLayout>();
    layout->InitAsPackedSequences(inputSequences, placement, rowAllocations);

    // Allocating necessary data buffer for the stream.
    size_t sampleSize = GetSampleSize(m_inputStreams[streamId]);
    size_t totalNumberOfSamplesInBytes = layout->GetNumCols() * sampleSize;
    if (m_streamBufferSizes[streamId] < totalNumberOfSamplesInBytes)
    {
        m_streamBuffers[streamId] = AllocateBuffer(layout->GetNumCols(), sampleSize);
        m_streamBufferSizes[streamId] = totalNumberOfSamplesInBytes;
    }

    // Packing the actual data.
    StorageType storageType = m_inputStreams[streamId]->m_storageType;
    size_t elementSize = GetSizeByType(m_inputStreams[streamId]->m_elementType);
    const auto& packedSequences = layout->GetAllSequences();
    char* streamBuffer = m_streamBuffers[streamId].get();
    for (const auto& sequence : packedSequences)
    {
        if (sequence.seqId == GAP_SEQUENCE_ID)
            continue;
        const auto& data = sequences[sequence.seqId];

        // Packing the sequence
        for (size_t sampleIndex = 0; sampleIndex < sequence.GetNumTimeSteps(); ++sampleIndex)
        {
            char* destination = streamBuffer + layout->GetColumnIndex(sequence, sampleIndex) * sampleSize;
            if (storageType == StorageType::dense)
            {
                PackDenseSample(destination, data, sampleIndex, elementSize, sampleSize);
            }
            else // sparse
            {
                assert(storageType == StorageType::sparse_csc);
                PackSparseSample(destination, data, sampleIndex, elementSize, sampleSize);
            }
        }
    }

    // Ok, minibatch is ready, give it out.
    StreamMinibatchPtr result = std::make_shared<StreamMinibatch>();
    result->m_data = m_streamBuffers[streamId].get();
    result->m_layout = layout;
    return result;
}
Exemplo n.º 2
0
MBLayoutPtr SequencePacker::PackDenseStream(const StreamBatch& batch, size_t streamIndex)
{
    assert(m_outputStreamDescriptions[streamIndex]->m_storageType == StorageType::dense);
    const auto& stream = m_inputStreamDescriptions[streamIndex];
    auto& buffer = m_streamBuffers[m_currentBufferIndex][streamIndex];
    size_t sampleSize = GetSampleSize(m_outputStreamDescriptions[streamIndex]);
    auto pMBLayout = CreateMBLayout(batch);
    size_t requiredSize = pMBLayout->GetNumCols() * sampleSize;
    if (buffer.m_size < requiredSize)
    {
        buffer.Resize(requiredSize);
    }

    auto elementSize = GetSizeByType(stream->m_elementType);

    const auto& sequenceInfos = pMBLayout->GetAllSequences();

    // Iterate over sequences in the layout, copy samples from the
    // source sequences into the buffer (at appropriate offsets).
    for (int i = 0; i < sequenceInfos.size(); ++i)
    {
        const auto& sequenceInfo = sequenceInfos[i];
        // skip gaps
        if (sequenceInfo.seqId == GAP_SEQUENCE_ID)
        {
            continue;
        }

        const auto& sequence = batch[sequenceInfo.seqId];
        size_t numSamples = sequence->m_numberOfSamples;
        assert(numSamples == sequenceInfo.GetNumTimeSteps());

        char* bufferPtr = buffer.m_data.get();
        // Iterate over all samples in the sequence, keep track of the sample offset (which is especially
        // important for sparse input, where offset == number of preceding nnz elements).
        for (size_t sampleIndex = 0, sampleOffset = 0; sampleIndex < numSamples; ++sampleIndex)
        {
            // Compute the offset into the destination buffer, using the layout information 
            // to get the column index corresponding to the given sample.
            auto destinationOffset = pMBLayout->GetColumnIndex(sequenceInfo, sampleIndex) * sampleSize;
            // verify that there's enough space left in the buffer to fit a full sample.
            assert(destinationOffset <= buffer.m_size - sampleSize);
            auto* destination = bufferPtr + destinationOffset;
            if (stream->m_storageType == StorageType::dense)
            {
                // verify that the offset (an invariant for dense).
                assert(sampleOffset == sampleIndex * sampleSize);
                PackDenseSample(destination, sequence, sampleOffset, sampleSize);
                sampleOffset += sampleSize;
            }
            else if (stream->m_storageType == StorageType::sparse_csc)
            {
                // TODO: make type casts members of the SparseSequenceData
                SparseSequenceDataPtr sparseSequence = static_pointer_cast<SparseSequenceData>(sequence);
                // make sure that the sequence meta-data is correct.
                assert(numSamples == sparseSequence->m_nnzCounts.size());
                PackSparseSampleAsDense(destination, sparseSequence, sampleIndex, sampleOffset, sampleSize, elementSize);
                // move the offset by nnz count of the sample.
                sampleOffset += sparseSequence->m_nnzCounts[sampleIndex];
                // verify that the offset is within the bounds (less or equal 
                // to the total nnz count of the sequence).
                assert(sampleOffset <= sparseSequence->m_totalNnzCount);
            }
            else
            {
                RuntimeError("Storage type %d is not supported.", (int)stream->m_storageType);
            }
        }
    }

    return pMBLayout;
}