// The whole CompositeDataReader is meant as a stopgap to allow deserializers/transformers composition until SGD talkes // directly to the new Reader API. // For more information please see its header file. // This method composes together packers + randomizer + a set of transformers and deserializers. CompositeDataReader::CompositeDataReader(const ConfigParameters& config, MemoryProviderPtr provider) : m_layout(make_shared<MBLayout>()), m_corpus(std::make_shared<CorpusDescriptor>()), m_provider(provider) { wstring action = config(L"action", L""); bool isActionWrite = AreEqualIgnoreCase(action, L"write"); // Identifying packing mode. bool frameMode = config(L"frameMode", false); bool truncated = config(L"truncated", false); if (frameMode && truncated) { LogicError("frameMode and truncated BPTT are mutually exclusive."); } if (isActionWrite) // For writing we always use sequence mode. { m_packingMode = PackingMode::sequence; } else if (frameMode) { m_packingMode = PackingMode::sample; } else if (truncated) { m_packingMode = PackingMode::truncated; m_truncationLength = config(L"truncationLength", 0); if (m_truncationLength == 0) { InvalidArgument("Truncation length cannot be 0."); } } else { m_packingMode = PackingMode::sequence; } m_precision = config("precision", "float"); // Creating deserializers. // TODO: Currently the primary deserializer defines the corpus. The logic will be moved to CorpusDescriptor class. CreateDeserializers(config); if (m_deserializers.empty()) { InvalidArgument("Could not find deserializers in the reader config."); } IDataDeserializerPtr deserializer = m_deserializers.front(); if (m_deserializers.size() > 1) { // Bundling deserializers together. // Option whether we need to check data between different deserializers. 
bool cleanse = config(L"checkData", true); deserializer = std::make_shared<Bundler>(config, deserializer, m_deserializers, cleanse); } int verbosity = config(L"verbosity", 0); // Pick up the randomizer, always picking up no randomization for the write mode. bool randomize = isActionWrite ? false : config(L"randomize", false); // By default do not use omp threads for deserialization of sequences. // It makes sense to put it to true for cases when deserialization is CPU intensive, // i.e. decompression of images. bool multiThreadedDeserialization = config(L"multiThreadedDeserialization", false); if (randomize) { // By default randomizing the whole data set. size_t randomizationWindow = config(L"randomizationWindow", requestDataSize); // By default using STL random number generator. bool useLegacyRandomization = config(L"useLegacyRandomization", false); m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization); } else { m_sequenceEnumerator = std::make_shared<NoRandomizer>(deserializer, multiThreadedDeserialization); } // In case when there are transforms, applying them to the data. m_sequenceEnumerator = m_transforms.empty() ? m_sequenceEnumerator : std::make_shared<TransformController>(m_transforms, m_sequenceEnumerator); // Create output stream descriptions - where to get those? from config? what if it is not the same as network expects? // TODO: Currently only dense output streams. // TODO: Check here. We should already support repacking sparse into dense in the shim/matrix. for (const auto& streamDescription : m_sequenceEnumerator->GetStreamDescriptions()) { StreamDescriptionPtr stream = std::make_shared<StreamDescription>(*streamDescription); stream->m_storageType = StorageType::dense; m_streams.push_back(stream); m_nameToStreamId.insert(std::make_pair(streamDescription->m_name, streamDescription->m_id)); } }
// The whole CompositeDataReader is meant as a stopgap to allow deserializers/transformers composition until SGD talkes // directly to the new Reader API. // For more information please see its header file. // This method composes together packers + randomizer + a set of transformers and deserializers. CompositeDataReader::CompositeDataReader(const ConfigParameters& config) : m_corpus(std::make_shared<CorpusDescriptor>()) { wstring action = config(L"action", L""); bool isActionWrite = AreEqualIgnoreCase(action, L"write"); // Identifying packing mode. bool frameMode = config(L"frameMode", false); bool truncated = config(L"truncated", false); if (frameMode && truncated) { LogicError("frameMode and truncated BPTT are mutually exclusive."); } if (isActionWrite) // For writing we always use sequence mode. { m_packingMode = PackingMode::sequence; } else if (frameMode) { m_packingMode = PackingMode::sample; } else if (truncated) { m_packingMode = PackingMode::truncated; m_truncationLength = config(L"truncationLength", 0); if (m_truncationLength == 0) { InvalidArgument("Truncation length cannot be 0."); } } else { m_packingMode = PackingMode::sequence; } m_precision = config("precision", "float"); // Creating deserializers. // TODO: Currently the primary deserializer defines the corpus. The logic will be moved to CorpusDescriptor class. CreateDeserializers(config); if (m_deserializers.empty()) { InvalidArgument("Could not find deserializers in the reader config."); } IDataDeserializerPtr deserializer = m_deserializers.front(); if (m_deserializers.size() > 1) { // Bundling deserializers together. // Option whether we need to check data between different deserializers. bool cleanse = config(L"checkData", true); deserializer = std::make_shared<Bundler>(config, deserializer, m_deserializers, cleanse); } int verbosity = config(L"verbosity", 0); // Pick up the randomizer, always picking up no randomization for the write mode. bool randomize = isActionWrite ? 
false : config(L"randomize", false); // By default do not use omp threads for deserialization of sequences. // It makes sense to put it to true for cases when deserialization is CPU intensive, // i.e. decompression of images. bool multiThreadedDeserialization = config(L"multiThreadedDeserialization", false); if (randomize) { // By default randomizing the whole data set. size_t randomizationWindow = config(L"randomizationWindow", requestDataSize); // By default using STL random number generator. bool useLegacyRandomization = config(L"useLegacyRandomization", false); m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, true /* should Prefetch */, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization); } else { m_sequenceEnumerator = std::make_shared<NoRandomizer>(deserializer, multiThreadedDeserialization); } // In case when there are transforms, applying them to the data. m_sequenceEnumerator = m_transforms.empty() ? m_sequenceEnumerator : std::make_shared<TransformController>(m_transforms, m_sequenceEnumerator); // TODO: Creating output stream descriptions - this should come from the network so that we can check // that input matches what the network expects (including tensor shape, etc.). for (const auto& streamDescription : m_sequenceEnumerator->GetStreamDescriptions()) { StreamDescriptionPtr stream = std::make_shared<StreamDescription>(*streamDescription); if (m_packingMode == PackingMode::truncated) { // TODO: Currently BPTT does not support sparse format as output. // We always require dense. 
stream->m_storageType = StorageType::dense; } m_streams.push_back(stream); } switch (m_packingMode) { case PackingMode::sample: m_packer = std::make_shared<FramePacker>( m_sequenceEnumerator, m_streams); break; case PackingMode::sequence: m_packer = std::make_shared<SequencePacker>( m_sequenceEnumerator, m_streams); break; case PackingMode::truncated: { m_packer = std::make_shared<TruncatedBPTTPacker>( m_sequenceEnumerator, m_streams); break; } default: LogicError("Unsupported type of packer '%d'.", (int)m_packingMode); } }
// Constructs the HTK/MLF reader: picks a packing mode from the configuration, builds the
// deserializers, bundles them, selects the randomizer, and creates the matching packer.
HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider, const ConfigParameters& readerConfig)
    : m_seed(0), m_provider(provider)
{
    // TODO: deserializers and transformers will be dynamically loaded
    // from external libraries based on the configuration/brain script.
    const bool inFrameMode = readerConfig(L"frameMode", true);
    const bool useTruncatedBptt = readerConfig(L"truncated", false);
    if (inFrameMode && useTruncatedBptt)
    {
        LogicError("frameMode and truncated BPTT are mutually exclusive.");
    }

    if (inFrameMode)
    {
        m_packingMode = PackingMode::sample;
    }
    else if (useTruncatedBptt)
    {
        m_packingMode = PackingMode::truncated;
    }
    else
    {
        m_packingMode = PackingMode::sequence;
    }

    // nbruttsineachrecurrentiter is old reader configuration, truncationLength is the new one.
    // If truncation length is specified we estimate
    // the number of parallel sequences we have to pack as max(1, (mbsize/truncationLength))
    // If nbruttsineachrecurrentiter is specified we assume that the truncation size is mbSize
    // and the real minibatch size in mbSize * nbruttsineachrecurrentiter[epochIndex]
    m_truncationLength = readerConfig(L"truncationLength", 0);
    m_numParallelSequencesForAllEpochs =
        readerConfig(L"nbruttsineachrecurrentiter", ConfigParameters::Array(intargvector(vector<int> { 1 })));

    ConfigHelper config(readerConfig);
    const size_t randomizationWindow = config.GetRandomizationWindow();

    auto deserializers = CreateDeserializers(readerConfig);
    if (deserializers.empty())
    {
        LogicError("Please specify at least a single input stream.");
    }

    auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);
    const int verbosity = readerConfig(L"verbosity", 2);

    // TODO: this should be bool. Change when config per deserializer is allowed.
    const std::wstring readMethod = config.GetRandomizer();
    if (AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
    {
        m_randomizer = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
    }
    else if (AreEqualIgnoreCase(readMethod, std::wstring(L"none")))
    {
        m_randomizer = std::make_shared<NoRandomizer>(bundler);
    }
    else
    {
        RuntimeError("readMethod must be 'blockRandomize' or 'none'.");
    }

    m_randomizer->Initialize(nullptr, readerConfig);

    // Expose every deserializer stream as a dense output stream,
    // re-numbering ids so they are unique across deserializers.
    for (const auto& currentDeserializer : deserializers)
    {
        for (const auto& description : currentDeserializer->GetStreamDescriptions())
        {
            auto stream = std::make_shared<StreamDescription>(*description);
            stream->m_storageType = StorageType::dense;
            stream->m_id = m_streams.size();
            m_streams.push_back(stream);
        }
    }

    // TODO: should we unify sample and sequence mode packers into a single one.
    // TODO: functionally they are the same, the only difference is how we handle
    // TODO: MBlayout and what is the perf hit for iterating/copying sequences.
    // TODO: Should do more perf tests before unifying these two.
    // TODO: As the next step the packers will be moved out of the readers into the
    // TODO: core CNTK. They are format agnostic and can be used with any type of
    // TODO: deserializers.
    if (m_packingMode == PackingMode::sample)
    {
        m_packer = std::make_shared<FramePacker>(m_provider, m_randomizer, m_streams);
    }
    else if (m_packingMode == PackingMode::sequence)
    {
        m_packer = std::make_shared<SequencePacker>(m_provider, m_randomizer, m_streams);
    }
    else if (m_packingMode == PackingMode::truncated)
    {
        m_packer = std::make_shared<TruncatedBPTTPacker>(m_provider, m_randomizer, m_streams);
    }
    else
    {
        LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
    }
}