void LearnableParameter<ElemType>::InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
{
    // Fill this parameter with random values, either uniform in [-0.05*scale, 0.05*scale]
    // or Gaussian with stddev 0.2*scale/sqrt(#cols).
    // If initOnCPUOnly is set, the values are generated on the CPU (for reproducibility
    // across devices) and then transferred back to m_deviceId.
    // Note: the random seed offset is set via the "randomSeedOffset" parameter in config.
    if (initOnCPUOnly)
        Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
#if 1 // this more complex version is needed to repro test cases generated with an older version
    auto& randomTarget = GetSampleLayout().GetRank() > 2 ? Value() : ValueAsMatrix();
#else
    auto& randomTarget = Value();
#endif
    if (uniformInit)
    {
        // TODO: move these hidden extra factors out from here and into NDL, and make them visible in BS
        const ElemType halfRange = 0.05f * initValueScale;
        randomTarget.SetUniformRandomValue(-halfRange, halfRange, randomSeed);
    }
    else
    {
        // Gaussian init scaled by the fan-in (number of matrix columns).
        const size_t fanIn = randomTarget.GetNumCols();
        const ElemType stdDev = 0.2f * initValueScale / sqrt(ElemType(fanIn));
        randomTarget.SetGaussianRandomValue(0, stdDev, randomSeed);
    }
    if (initOnCPUOnly)
        Value().TransferToDeviceIfNotThere(m_deviceId, true);
}
// Initialize this parameter from a flat array read from file, inferring any
// unspecified tensor dimensions from the given matrix dimensions.
// 'array' holds numRows*numCols values; numRows/numCols describe the 2D matrix
// as read from the file (column-major).
// Note: The mapping of dimensions of the input matrix to tensor dimensions are somewhat confusing.
// The file contains a 2D matrix (one row per text line) that is saved into our column-major representation.
// That representation is then reshaped into a column-major tensor.
void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols)
{
    // infer tensor dimensions from input file if not set
    if (GetSampleLayout().GetNumElements() == 0) // at least one dimension is 0
    {
        auto dims = GetSampleLayout().GetDims();
        // infer rank: ensure at least one dimension; add a second (unknown) one
        // when the matrix has more than one column
        if (dims.size() == 0)
            dims.push_back(0);
        if (dims.size() == 1 && numCols != 1)
            dims.push_back(0);
        // infer #rows
        if (dims[0] == 0) // infer row dimension as input matrix row dimension
            dims[0] = numRows; // (if already set, then mismatch will be caught in VerifyDataSize() below)
        // infer #cols: product of all dimensions but the first must match matrix #cols;
        // if there is a single 0 position, we infer it
        size_t zeroDim = 0; // 0 means not found (valid as a sentinel since the scan starts at k=1)
        size_t prod = 1;    // product of all known dimensions past the first
        for (size_t k = 1; k < dims.size(); k++)
        {
            auto dim = dims[k];
            if (dim != 0)
                prod *= dim;
            else if (zeroDim == 0)
                zeroDim = k; // remember the (single) unknown dimension
            else
                InvalidArgument("%ls %ls operation's specified shape [%s] cannot be inferred: Too many unknown dimensions.",
                                NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str());
        }
        if (zeroDim != 0) // we found a zero
        {
            dims[zeroDim] = numCols / prod;
            // the re-multiplication detects when numCols is not divisible by prod
            if (prod * dims[zeroDim] != numCols)
                InvalidArgument("%ls %ls operation's specified shape [%s] cannot be inferred: Tensor shape cannot hold a [%d x %d] matrix.",
                                NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), (int)numRows, (int)numCols);
        }
        SetDims(TensorShape(dims), false);
    }
    // BUGBUG: We should allow to read an arbitrary tensor from a single-column file.
    // Currently, this would cause a matrix/tensor dimension mismatch. --TODO: Is this comment up-to-date?
    Value().SetValue(numRows, numCols, m_deviceId, const_cast<ElemType*>(array.data()), matrixFlagNormal);
    // TODO: Get rid of that const_cast, as soon as after Ryan's Matrix-lib refactoring separated out SetValue() from external vs. from deep copy
    VerifyDataSize(Value()); // sanity check
}
// form the actual tensor that describes the full object TensorShape ComputationNodeBase::GetTensorShape(size_t rank) const { // If we have an MB layout then add the necessary sequence and time axes. If we have none, then absorb the column dimension. TensorShape tensorShape = GetSampleLayout(); // TODO: Do we need to expect this tensor to have arbitrary strides? In case it came out of a Slice, Reshape, or Transpose op in-place? if (HasMBLayout()) { size_t i = (rank != SIZE_MAX) ? rank : tensorShape.GetRank(); tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumParallelSequences()); tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumTimeSteps()); } return tensorShape; }
// determine the sample tensor dimension to use for operations based on output and all inputs // 'Sample tensor' means we only consider single samples. If we have an MBLayout, that is the sample layout of a single matrix column. // TODO: Turn rank into a member variable, and call this method once in validation (currently called for every single ForwardProp/BackpropTo()). size_t ComputationNodeBase::DetermineElementwiseTensorRank() const { // determine largest tensor dimension amongst the sample shapes of output and the selected inputs size_t maxRank = GetSampleLayout().GetRank(); for (size_t i = 0; i < GetNumInputs(); i++) { size_t rank = Input(i)->GetSampleLayout().GetRank(); if (maxRank < rank) maxRank = rank; } return maxRank; }
// helper function for validation // In complex cases of convolution, dimensions are quite difficult for a user to know/derive. // This is a feature that allows a node to help resizing its input node to the expected value // iff that input must be a learnable parameter. void ComputationNodeBase::ValidateInferBinaryInputDims() { // limited inference of children dimensions // if dimension not specified we assume two operands' dimensions should be the same // NOTE: The assert is set to check if >= 2 since this is called from nodes which have more than two children. // The number of children is formally verified elsewhere, so this will not break consistency. assert(m_inputs.size() >= 2); for (size_t index = 0; index < 2; index++) { auto in = Input( index); auto other = Input(1 - index); // borrow any unset dimension on one input from the other input in->ValidateInferInputDimsFrom(other->GetSampleLayout()); } }
// Write the minibatch (Value() or Gradient()) to 'f' as formatted text, sequence by sequence.
// 'fr' narrows the time range; onlyUpToRow/onlyUpToT truncate output for debugging.
// sequencePrologue/sampleSeparator may contain "%x" (replaced by the tensor shape) and
// "%d" (replaced by the sequence id). valueFormatString's last char selects the value
// format: 'f' = real number, 'u' = integer index, 's' = label string via labelMapping.
// Fixes over previous revision: printf-style format specifiers now match their argument
// types ("%llu" for unsigned long long, explicit (int) casts for size_t values printed
// with "%d") — the old "%ld"/%d usages were undefined behavior and wrong on LLP64 targets.
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const FrameRange& fr,
                                                             size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, bool isSparse,
                                                             const vector<string>& labelMapping, const string& sequenceSeparator, const string& sequencePrologue,
                                                             const string& sequenceEpilogue, const string& elementSeparator, const string& sampleSeparator,
                                                             string valueFormatString, bool outputGradient) const
{
    // get minibatch matrix -> matData, matRows, matStride
    const Matrix<ElemType>& outputValues = outputGradient ? Gradient() : Value();
    let matRows = outputValues.GetNumRows();
    let matStride = matRows; // how to get from one column to the next
    unique_ptr<ElemType[]> matDataPtr(outputValues.CopyToArray());
    ElemType* matData = matDataPtr.get();
    let sampleLayout = GetSampleLayout(); // this is currently only used for sparse; dense tensors are linearized

    // process all sequences one by one
    MBLayoutPtr pMBLayout = GetMBLayout();
    if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
    {
        pMBLayout = make_shared<MBLayout>();
        pMBLayout->Init(1, outputValues.GetNumCols()); // treat this as if we have one single sequence consisting of the columns
        pMBLayout->AddSequence(0, 0, 0, outputValues.GetNumCols());
    }
    let& sequences = pMBLayout->GetAllSequences();
    let width = pMBLayout->GetNumTimeSteps();

    // pre-format the sample shape as "d1 d2 ... " for the "%x" substitution below
    TensorShape tensorShape = GetSampleLayout();
    stringstream str;
    let dims = tensorShape.GetDims();
    for (auto dim : dims)
        str << dim << ' ';
    let shape = str.str(); // BUGBUG: change to string(tensorShape) to make sure we always use the same format

    // check once which placeholders actually occur, to avoid the substitution cost per sequence
    bool sequencePrologueHasShape = sequencePrologue.find("%x") != sequencePrologue.npos;
    bool sampleSeparatorHasShape  = sampleSeparator.find("%x") != sampleSeparator.npos;
    bool sequencePrologueHasSeqId = sequencePrologue.find("%d") != sequencePrologue.npos;
    bool sampleSeparatorHasSeqId  = sampleSeparator.find("%d") != sampleSeparator.npos;

    for (size_t s = 0; s < sequences.size(); s++)
    {
        const auto& seqInfo = sequences[s];
        if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print
            continue;
        let tBegin = seqInfo.tBegin >= 0 ? seqInfo.tBegin : 0;
        let tEnd = seqInfo.tEnd <= width ? seqInfo.tEnd : width;
        // [tBegin,tEnd) is where the sequence resides.
        // fr is also referencing where a sequence resides.

        // narrow to FrameRange if needed
        auto t0 = fr.IsAllFrames() ? tBegin : fr.m_timeOffset + (ptrdiff_t)fr.timeIdxInSeq;
        auto t1 = fr.IsAllFrames() ? tEnd : fr.m_timeOffset + (ptrdiff_t)fr.timeIdxInSeq + (ptrdiff_t)fr.m_timeRange;
        if (t0 < tBegin)
            t0 = tBegin;
        if (t1 > tEnd)
            t1 = tEnd;
        // [t0,t1) is the range we want to print
        if (t0 > (ptrdiff_t)t1) // empty range?
            continue; // skip this sequence

        // get sequence matrix -> seqData, seqRows, seqCols, seqStride
        let seqData = matData + pMBLayout->GetColumnIndex(seqInfo, t0 - tBegin) * matStride;
        auto seqRows = matRows;
        let seqCols = t1 - t0;
        let seqStride = pMBLayout->GetNumParallelSequences() * matStride;

        auto seqProl = sequencePrologue;
        auto sampleSep = sampleSeparator;

        if (sequencePrologueHasShape || sampleSeparatorHasShape)
        {
            // FIX: was "%s%ld" with an unsigned long long argument (UB; wrong on LLP64)
            auto sh = msra::strfun::_strprintf<char>("%s%llu", shape.c_str(), (unsigned long long)seqInfo.GetNumTimeSteps());
            if (sequencePrologueHasShape)
                seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%x", sh);
            if (sampleSeparatorHasShape)
                sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%x", sh);
        }

        if (sequencePrologueHasSeqId || sampleSeparatorHasSeqId)
        {
            // FIX: was "%ld" with an unsigned long long argument (UB; wrong on LLP64)
            auto sh = msra::strfun::_strprintf<char>("%llu", (unsigned long long)seqInfo.seqId);
            if (sequencePrologueHasSeqId)
                seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%d", sh);
            if (sampleSeparatorHasSeqId)
                sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%d", sh);
        }

        if (s > 0)
            fprintfOrDie(f, "%s", sequenceSeparator.c_str());
        fprintfOrDie(f, "%s", seqProl.c_str());

        // output it according to our format specification
        auto formatChar = valueFormatString.back();
        if (isCategoryLabel) // if is category then find the max value and output its index (possibly mapped to a string)
        {
            if (formatChar == 's') // verify label dimension
            {
                if (outputValues.GetNumRows() != labelMapping.size() &&
                    sampleLayout[0] != labelMapping.size()) // if we match the first dim then use that
                {
                    static size_t warnings = 0;
                    if (warnings++ < 5)
                        fprintf(stderr, "write: Row dimension %d does not match number of entries %d in labelMappingFile, not using mapping\n", (int)seqRows, (int)labelMapping.size());
                    valueFormatString.back() = 'u'; // this is a fallback
                    formatChar = valueFormatString.back();
                }
            }
            // update the matrix in-place from one-hot (or max) to index
            // find the max in each column
            for (size_t j = 0; j < seqCols; j++) // loop over all time steps of the sequence
            {
                double maxLoc = -1; // -1 = no row seen yet
                double maxVal = 0;
                for (size_t i = 0; i < seqRows; i++) // loop over rows
                {
                    let val = seqData[i + j * seqStride];
                    if (maxLoc < 0 || val >= maxVal)
                    {
                        maxLoc = (double)i;
                        maxVal = val;
                    }
                }
                seqData[0 + j * seqStride] = (ElemType)maxLoc; // overwrite first element in-place
            }
            seqRows = 1; // ignore remaining dimensions
        }

        // function to print a value according to the chosen format
        auto print = [&](double dval)
        {
            if (formatChar == 'f') // print as real number
            {
                if (dval == 0)
                    dval = fabs(dval); // clear the sign of a negative 0, which are produced inconsistently between CPU and GPU
                fprintfOrDie(f, valueFormatString.c_str(), dval);
            }
            else if (formatChar == 'u') // print category as integer index
            {
                fprintfOrDie(f, valueFormatString.c_str(), (unsigned int)dval);
            }
            else if (formatChar == 's') // print category as a label string
            {
                size_t uval = (size_t)dval;
                if (!labelMapping.empty())
                    uval %= labelMapping.size();
                assert(uval < labelMapping.size());
                const char* sval = labelMapping[uval].c_str();
                fprintfOrDie(f, valueFormatString.c_str(), sval);
            }
        };

        // bounds for printing
        let iend = transpose ? seqRows : seqCols; // true dimension of the data to print
        let jend = transpose ? seqCols : seqRows;
        let istop = transpose ? onlyUpToRow : onlyUpToT; // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices)
        let jstop = transpose ? onlyUpToT : onlyUpToRow;
        let istride = transpose ? 1 : seqStride;
        let jstride = transpose ? seqStride : 1;

        if (isSparse)
        {
            // sparse linearizes the entire matrix into a single vector, and prints that one with coordinates
            // TODO: This can be done more nicely. We should keep the block structure.
            size_t numPrinted = 0;
            for (size_t i = 0; i < iend; i++) // loop over elements --we just flatten them all out
            {
                for (size_t j = 0; j < jend; j++) // loop over rows
                {
                    double dval = seqData[i * istride + j * jstride];
                    if (dval == 0) // only print non-0 values
                        continue;
                    if (numPrinted++ > 0)
                        fprintfOrDie(f, "%s", transpose ? sampleSeparator.c_str() : elementSeparator.c_str());
                    if (dval != 1.0 || formatChar != 'f') // hack: we assume that we are either one-hot or never precisely hitting 1.0
                        print(dval);
                    size_t row = transpose ? i : j;
                    size_t col = transpose ? j : i;
                    // decompose the flat row index into per-axis tensor coordinates
                    for (size_t k = 0; k < sampleLayout.size(); k++)
                    {
                        // FIX: size_t printed with %d — cast explicitly
                        fprintfOrDie(f, "%c%d", k == 0 ? '[' : ',', (int)(row % sampleLayout[k]));
                        if (sampleLayout[k] == labelMapping.size()) // annotate index with label if dimensions match (which may misfire once in a while)
                            fprintfOrDie(f, "=%s", labelMapping[row % sampleLayout[k]].c_str());
                        row /= sampleLayout[k];
                    }
                    if (seqInfo.GetNumTimeSteps() > 1)
                        fprintfOrDie(f, ";%d", (int)col); // FIX: size_t printed with %d — cast explicitly
                    fprintfOrDie(f, "]");
                }
            }
        }
        else
        {
            for (size_t j = 0; j < jend; j++) // loop over output rows --BUGBUG: row index is 'i'!! Rename these!!
            {
                if (j > 0)
                    fprintfOrDie(f, "%s", sampleSep.c_str());
                if (j == jstop && jstop < jend - 1) // if jstop == jend-1 we may as well just print the value instead of '...'
                {
                    fprintfOrDie(f, "...+%d", (int)(jend - jstop)); // 'nuff said
                    break;
                }
                // inject sample tensor index if we are printing row-wise and it's a tensor
                if (!transpose && sampleLayout.size() > 1 && !isCategoryLabel) // each row is a different sample dimension
                {
                    for (size_t k = 0; k < sampleLayout.size(); k++)
                        // FIX: was (int)(j / stride) % dim, which promotes back to size_t before %d;
                        // apply the modulo first and cast the final coordinate
                        fprintfOrDie(f, "%c%d", k == 0 ? '[' : ',', (int)((j / sampleLayout.GetStrides()[k]) % sampleLayout[k]));
                    fprintfOrDie(f, "]\t");
                }
                // print a row of values
                for (size_t i = 0; i < iend; i++) // loop over elements
                {
                    if (i > 0)
                        fprintfOrDie(f, "%s", elementSeparator.c_str());
                    if (i == istop && istop < iend - 1)
                    {
                        fprintfOrDie(f, "...+%d", (int)(iend - istop));
                        break;
                    }
                    double dval = seqData[i * istride + j * jstride];
                    print(dval);
                }
            }
        }
        fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
    } // end loop over sequences
    fflushOrDie(f);
}