// Create deserializers based on the specified configuration. // deserializers = [ // [ type = "ImageDataDeserializer" module = "ImageReader" ...] // [ type = "CNTKTextFormatDeserializer" module = "CNTKTextFormatReader" ...] bool CompositeDataReader::CreateDeserializers(const ConfigParameters& readerConfig) { argvector<ConfigValue> deserializerConfigs = readerConfig(L"deserializers", ConfigParameters::Array(argvector<ConfigValue>(vector<ConfigValue> {}))); assert(m_deserializers.empty()); auto traceLevel = readerConfig.Find("traceLevel"); bool composable = true; bool primary = true; // Currently, the first deserializer becomes primary - it drives chunking. for (size_t i = 0; i < deserializerConfigs.size(); ++i) { // TODO: Should go away in the future. Framing can be done on top of deserializers. ConfigParameters p = deserializerConfigs[i]; p.Insert("frameMode", m_packingMode == PackingMode::sample ? "true" : "false"); p.Insert("precision", m_precision); if (!traceLevel.empty()) { p.Insert("traceLevel", traceLevel); } composable &= p(L"composable", true); DataDeserializerPtr d = CreateDeserializer(p, primary); primary = false; m_deserializers.push_back(d); } return composable; }
// TODO: Not safe from the ABI perspective. Will be uglified to make the interface ABI. // A factory method for creating text deserializers. extern "C" DATAREADER_API bool CreateDeserializer(IDataDeserializer** deserializer, const std::wstring& type, const ConfigParameters& deserializerConfig, CorpusDescriptorPtr corpus, bool) { string precision = deserializerConfig.Find("precision", "float"); if (!AreEqualIgnoreCase(precision, "float") && !AreEqualIgnoreCase(precision, "double")) { InvalidArgument("Unsupported precision '%s'", precision.c_str()); } // TODO: Remove type from the parser. Current implementation does not support streams of different types. if (type == L"CNTKTextFormatDeserializer") { if (precision == "float") *deserializer = new TextParser<float>(corpus, TextConfigHelper(deserializerConfig)); else // double *deserializer = new TextParser<double>(corpus, TextConfigHelper(deserializerConfig)); } else InvalidArgument("Unknown deserializer type '%ls'", type.c_str()); // Deserializer created. return true; }
// Reads the image reader configuration and builds the two stream descriptions (features, labels).
// Configuration values consumed:
//   - exactly one section containing "width"    -> feature stream ("width", "height", "channels",
//                                                   optional "mbFormat", default "nchw");
//   - exactly one section containing "labelDim" -> label stream;
//   - "file"          -> m_mapPath;
//   - "randomize"     -> "auto" (default) or "none";
//   - "precision"     -> "float" (default) or "double", applied to both streams;
//   - "numCPUThreads" -> m_cpuThreadCount (default 0).
// Calls RuntimeError on an unsupported stream count, sample format, randomize value or precision.
ImageConfigHelper::ImageConfigHelper(const ConfigParameters& config)
    : m_dataFormat(CHW)
{
    std::vector<std::string> featureNames = GetSectionsWithParameter(config, "width");
    std::vector<std::string> labelNames = GetSectionsWithParameter(config, "labelDim");

    // REVIEW alexeyk: currently support only one feature and label section.
    if (featureNames.size() != 1 || labelNames.size() != 1)
    {
        RuntimeError(
            "ImageReader currently supports a single feature and label stream. '%d' features , '%d' labels found.",
            static_cast<int>(featureNames.size()),
            static_cast<int>(labelNames.size()));
    }

    ConfigParameters featureSection = config(featureNames[0]);
    size_t w = featureSection("width");
    size_t h = featureSection("height");
    size_t c = featureSection("channels");

    std::string mbFmt = featureSection("mbFormat", "nchw");
    if (AreEqualIgnoreCase(mbFmt, "nhwc") || AreEqualIgnoreCase(mbFmt, "legacy"))
    {
        m_dataFormat = HWC;
    }
    else if (!AreEqualIgnoreCase(mbFmt, "nchw") && !AreEqualIgnoreCase(mbFmt, "cudnn"))
    {
        // Only reject formats that are neither "nchw" nor "cudnn". The previous condition
        // (!nchw || cudnn) rejected "cudnn" as well, which made its second term pointless;
        // "cudnn" keeps the default CHW layout, mirroring how "legacy" selects HWC above.
        RuntimeError("ImageReader does not support the sample format '%s', only 'nchw' and 'nhwc' are supported.", mbFmt.c_str());
    }

    // Stream 0: image features, laid out according to m_dataFormat.
    auto features = std::make_shared<StreamDescription>();
    features->m_id = 0;
    features->m_name = msra::strfun::utf16(featureSection.ConfigName());
    features->m_sampleLayout = std::make_shared<TensorShape>(ImageDimensions(w, h, c).AsTensorShape(m_dataFormat));
    m_streams.push_back(features);

    // Stream 1: labels, a vector of dimension "labelDim".
    ConfigParameters label = config(labelNames[0]);
    size_t labelDimension = label("labelDim");

    auto labelSection = std::make_shared<StreamDescription>();
    labelSection->m_id = 1;
    labelSection->m_name = msra::strfun::utf16(label.ConfigName());
    labelSection->m_sampleLayout = std::make_shared<TensorShape>(labelDimension);
    m_streams.push_back(labelSection);

    m_mapPath = config(L"file");

    std::string rand = config(L"randomize", "auto");
    if (AreEqualIgnoreCase(rand, "auto"))
    {
        m_randomize = true;
    }
    else if (AreEqualIgnoreCase(rand, "none"))
    {
        m_randomize = false;
    }
    else
    {
        RuntimeError("'randomize' parameter must be set to 'auto' or 'none'");
    }

    // Identify precision
    string precision = config.Find("precision", "float");
    if (AreEqualIgnoreCase(precision, "float"))
    {
        features->m_elementType = ElementType::tfloat;
        labelSection->m_elementType = ElementType::tfloat;
    }
    else if (AreEqualIgnoreCase(precision, "double"))
    {
        features->m_elementType = ElementType::tdouble;
        labelSection->m_elementType = ElementType::tdouble;
    }
    else
    {
        RuntimeError("Not supported precision '%s'. Expected 'double' or 'float'.", precision.c_str());
    }

    m_cpuThreadCount = config(L"numCPUThreads", 0);
}
// Parses the CNTKTextFormatReader configuration.
// Requires a non-empty "input" section; each of its sub-sections describes one stream and must
// define "dim" and "format" ("dense" or "sparse"), with an optional non-empty "alias" that
// defaults to the section name. Aliases must be unique across streams.
// Also reads "precision" ("float" by default), "file", "randomize" ("none", "auto" or a window value),
// "skipSequenceIds", "maxErrors", "traceLevel", "chunkSizeInBytes" and "numChunksToCache".
TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
{
    if (!config.ExistsCurrent(L"input"))
    {
        RuntimeError("CNTKTextFormatReader configuration does not contain \"input\" section.");
    }

    const ConfigParameters& inputSections = config(L"input");
    if (inputSections.empty())
    {
        RuntimeError("CNTKTextFormatReader configuration contains an empty \"input\" section.");
    }

    // The element type is shared by all streams.
    string precision = config.Find("precision", "float");
    if (AreEqualIgnoreCase(precision, "float"))
    {
        m_elementType = ElementType::tfloat;
    }
    else if (AreEqualIgnoreCase(precision, "double"))
    {
        m_elementType = ElementType::tdouble;
    }
    else
    {
        RuntimeError("Not supported precision '%s'. Expected 'double' or 'float'.", precision.c_str());
    }

    StreamId nextId = 0;
    map<string, wstring> inputNameByAlias; // used to detect aliases claimed by more than one input
    for (const pair<string, ConfigParameters>& section : inputSections)
    {
        ConfigParameters streamConfig = section.second;
        wstring name = msra::strfun::utf16(section.first);

        if (!(streamConfig.ExistsCurrent(L"dim") && streamConfig.ExistsCurrent(L"format")))
        {
            RuntimeError("Input section for input '%ls' does not specify all the required parameters, "
                         "\"dim\" and \"format\".", name.c_str());
        }

        StreamDescriptor stream;
        stream.m_id = nextId++;
        stream.m_name = name;
        stream.m_sampleDimension = streamConfig(L"dim");

        string format = streamConfig(L"format");
        if (AreEqualIgnoreCase(format, "dense"))
        {
            stream.m_storageType = StorageType::dense;
        }
        else if (AreEqualIgnoreCase(format, "sparse"))
        {
            stream.m_storageType = StorageType::sparse_csc;
            // Sparse indices are stored as IndexType, so the dimension has to fit into it.
            if (stream.m_sampleDimension > numeric_limits<IndexType>::max())
            {
                RuntimeError("Sample dimension (%" PRIu64 ") for sparse input '%ls'"
                             " exceeds the maximum allowed value (%" PRIu64 ").\n",
                             stream.m_sampleDimension, name.c_str(), (size_t)numeric_limits<IndexType>::max());
            }
        }
        else
        {
            RuntimeError("'format' parameter must be set either to 'dense' or 'sparse'.");
        }

        // alias is optional; without one the section name is used.
        if (!streamConfig.ExistsCurrent(L"alias"))
        {
            stream.m_alias = section.first;
        }
        else
        {
            stream.m_alias = streamConfig(L"alias");
            if (stream.m_alias.empty())
            {
                RuntimeError("Alias value for input '%ls' is empty.", name.c_str());
            }
        }

        auto existing = inputNameByAlias.find(stream.m_alias);
        if (existing != inputNameByAlias.end())
        {
            RuntimeError("Alias %s is already mapped to input %ls.", stream.m_alias.c_str(), existing->second.c_str());
        }
        else
        {
            inputNameByAlias[stream.m_alias] = stream.m_name;
        }

        stream.m_elementType = m_elementType;
        m_streams.push_back(stream);
    }

    m_filepath = msra::strfun::utf16(config(L"file"));

    if (!config.Exists(L"randomize"))
    {
        m_randomizationWindow = randomizeAuto;
    }
    else
    {
        wstring randomizeString = config.CanBeString(L"randomize") ? config(L"randomize") : wstring();
        if (_wcsicmp(randomizeString.c_str(), L"none") == 0)
        {
            m_randomizationWindow = randomizeNone;
        }
        else if (_wcsicmp(randomizeString.c_str(), L"auto") == 0)
        {
            m_randomizationWindow = randomizeAuto;
        }
        else
        {
            // Neither keyword: take the configured value itself as the window.
            m_randomizationWindow = config(L"randomize");
        }
    }

    m_skipSequenceIds = config(L"skipSequenceIds", false);
    m_maxErrors = config(L"maxErrors", 0);
    m_traceLevel = config(L"traceLevel", 0);
    m_chunkSizeBytes = config(L"chunkSizeInBytes", 32 * 1024 * 1024); // 32 MB by default
    m_chunkCacheSize = config(L"numChunksToCache", 32);               // 32 * 32 MB = 1 GB of memory in total
}
// Parses the CNTKTextFormatReader configuration.
// Requires a non-empty "input" section; each of its sub-sections describes one stream and must
// define "dim" and "format" ("dense" or "sparse"), with optional "definesMBSize" (false by default)
// and an optional non-empty "alias" that defaults to the section name. Aliases must be unique.
// Also reads "precision" ("float" by default), "file", "skipSequenceIds", "maxErrors", "traceLevel",
// "chunkSizeInBytes", "keepDataInMemory", "frameMode", "sampleBasedRandomizationWindow" and the
// randomization window (via GetRandomizationWindowFromConfig).
TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
{
    if (!config.ExistsCurrent(L"input"))
    {
        RuntimeError("CNTKTextFormatReader configuration does not contain \"input\" section.");
    }

    const ConfigParameters& inputSections = config(L"input");
    if (inputSections.empty())
    {
        RuntimeError("CNTKTextFormatReader configuration contains an empty \"input\" section.");
    }

    // The element type is shared by all streams.
    string precision = config.Find("precision", "float");
    if (AreEqualIgnoreCase(precision, "float"))
    {
        m_elementType = DataType::Float;
    }
    else if (AreEqualIgnoreCase(precision, "double"))
    {
        m_elementType = DataType::Double;
    }
    else
    {
        RuntimeError("Not supported precision '%s'. Expected 'double' or 'float'.", precision.c_str());
    }

    StreamId nextId = 0;
    map<string, wstring> inputNameByAlias; // used to detect aliases claimed by more than one input
    for (const pair<string, ConfigParameters>& section : inputSections)
    {
        ConfigParameters streamConfig = section.second;
        wstring name = msra::strfun::utf16(section.first);

        if (!(streamConfig.ExistsCurrent(L"dim") && streamConfig.ExistsCurrent(L"format")))
        {
            RuntimeError("Input section for input '%ls' does not specify all the required parameters, "
                         "\"dim\" and \"format\".", name.c_str());
        }

        StreamDescriptor stream;
        stream.m_id = nextId++;
        stream.m_name = name;
        stream.m_sampleDimension = streamConfig(L"dim");
        stream.m_definesMbSize = streamConfig(L"definesMBSize", false);

        string format = streamConfig(L"format");
        if (AreEqualIgnoreCase(format, "dense"))
        {
            stream.m_storageFormat = StorageFormat::Dense;
        }
        else if (AreEqualIgnoreCase(format, "sparse"))
        {
            stream.m_storageFormat = StorageFormat::SparseCSC;
            // Sparse indices are stored as IndexType, so the dimension has to fit into it.
            if (stream.m_sampleDimension > numeric_limits<IndexType>::max())
            {
                RuntimeError("Sample dimension (%" PRIu64 ") for sparse input '%ls'"
                             " exceeds the maximum allowed value (%" PRIu64 ").\n",
                             stream.m_sampleDimension, name.c_str(), (size_t)numeric_limits<IndexType>::max());
            }
        }
        else
        {
            RuntimeError("'format' parameter must be set either to 'dense' or 'sparse'.");
        }

        // alias is optional; without one the section name is used.
        if (!streamConfig.ExistsCurrent(L"alias"))
        {
            stream.m_alias = section.first;
        }
        else
        {
            stream.m_alias = streamConfig(L"alias");
            if (stream.m_alias.empty())
            {
                RuntimeError("Alias value for input '%ls' is empty.", name.c_str());
            }
        }

        auto existing = inputNameByAlias.find(stream.m_alias);
        if (existing != inputNameByAlias.end())
        {
            RuntimeError("Alias %s is already mapped to input %ls.", stream.m_alias.c_str(), existing->second.c_str());
        }
        else
        {
            inputNameByAlias[stream.m_alias] = stream.m_name;
        }

        stream.m_elementType = m_elementType;
        m_streams.push_back(stream);
    }

    m_filepath = msra::strfun::utf16(config(L"file"));
    m_skipSequenceIds = config(L"skipSequenceIds", false);
    m_maxErrors = config(L"maxErrors", 0);
    m_traceLevel = config(L"traceLevel", 1);
    m_chunkSizeBytes = config(L"chunkSizeInBytes", g_32MB); // 32 MB by default
    m_keepDataInMemory = config(L"keepDataInMemory", false);
    m_frameMode = config(L"frameMode", false);

    m_randomizationWindow = GetRandomizationWindowFromConfig(config);
    m_sampleBasedRandomizationWindow = config(L"sampleBasedRandomizationWindow", false);

    // An automatic window that is not measured in samples is converted to a number of chunks.
    const bool windowInChunks = !m_sampleBasedRandomizationWindow;
    if (windowInChunks && m_randomizationWindow == randomizeAuto)
    {
        m_randomizationWindow = g_4GB / m_chunkSizeBytes; // ~ 4 GB (on disk) worth of chunks
    }
}