// Writes a trace of input 0 (its value, or its gradient if 'logGradientInstead') to stderr.
//  - On the first minibatch (m_numMBsRun == 1) the processed prologue string is emitted.
//  - The trace itself is emitted for the first m_logFirst minibatches and, if m_logFrequency
//    is non-zero, whenever (m_numMBsRun - 1) is a multiple of m_logFrequency.
/*virtual*/ void TraceNode<ElemType>::Log(const FrameRange& fr, bool logGradientInstead) const
{
    // prologue is printed once, on the first minibatch
    if (m_numMBsRun == 1)
    {
        const auto prologueText = m_formattingOptions.Processed(NodeName(), m_formattingOptions.prologue, m_numMBsRun);
        fprintf(stderr, "%s", prologueText.c_str());
    }

    // decide whether this minibatch is to be logged at all
    const bool withinFirstN = m_numMBsRun <= m_logFirst;
    const bool onFrequency  = m_logFrequency && (m_numMBsRun - 1) % m_logFrequency == 0;
    if (!withinFirstN && !onFrequency)
        return;

    // conversion character: real values ('f'), label strings ('s') or label indices ('u')
    char valueKind = 'f';
    if (m_formattingOptions.isCategoryLabel)
        valueKind = m_formattingOptions.labelMappingFile.empty() ? 'u' : 's';
    auto valueFormatString = "%" + m_formattingOptions.precisionFormat + valueKind; // fprintf() format for the individual values

    // expand the user-specified separators/prologues for this node and minibatch
    const auto seqSeparatorText = m_formattingOptions.Processed(NodeName(), m_formattingOptions.sequenceSeparator, m_numMBsRun);
    const auto seqPrologueText  = m_formattingOptions.Processed(NodeName(), m_formattingOptions.sequencePrologue,  m_numMBsRun);
    const auto seqEpilogueText  = m_formattingOptions.Processed(NodeName(), m_formattingOptions.sequenceEpilogue,  m_numMBsRun);
    const auto elemSeparatorText = m_formattingOptions.Processed(NodeName(), m_formattingOptions.elementSeparator, m_numMBsRun);
    const auto sampleSeparatorText = m_formattingOptions.Processed(NodeName(), m_formattingOptions.sampleSeparator, m_numMBsRun);

    // header line; the leading dashes make it stand out from the actual content
    let span = fr.GetTimeRange();
    fprintf(stderr, "------- Trace[");
    if (!fr.IsAllFrames())
    {
        // a single time step is shown as "t", multiple steps as "first..last"; nothing is shown otherwise
        if (span.second == span.first + 1)
            fprintf(stderr, "%d", (int)span.first);
        else if (span.second > span.first + 1)
            fprintf(stderr, "%d..%d", (int)span.first, (int)span.second - 1);
    }
    const char* gradientTag = logGradientInstead ? "(gradient) " : "";
    fprintf(stderr, "] %ls %s--> %s\n", m_message.c_str(), gradientTag, InputRef(0).FormatOperationPrototype("").c_str());

    // the content itself
    InputRef(0).WriteMinibatchWithFormatting(stderr, fr, m_onlyUpToRow, m_onlyUpToT,
                                             m_formattingOptions.transpose, m_formattingOptions.isCategoryLabel, m_formattingOptions.isSparse,
                                             m_labelMapping,
                                             seqSeparatorText, seqPrologueText, seqEpilogueText, elemSeparatorText, sampleSeparatorText,
                                             valueFormatString, logGradientInstead);
}
// Back-propagates this node's gradient into its (only) input.
// How the gradient flows depends on the reduction operation, so each op gets its own branch.
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex;

    // tensor views of the gradients involved
    size_t rank = DetermineElementwiseTensorRank();
    auto gradFromTop = GradientTensorFor(rank, fr);             // source: gradient of this node
    auto gradToInput = InputRef(0).GradientTensorFor(rank, fr); // destination: gradient of the input

    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
    {
        // "Sum":  the gradient is broadcast to all inputs
        // "Mean": like "Sum", additionally scaled by 1/#dims (via m_scale)
        gradToInput.AddCopyOf(gradFromTop, m_scale);
        break;
    }

    case ElementWiseOperator::opLogSum:
    {
        auto inputValue  = InputRef(inputIndex).ValueTensorFor(rank, fr);
        auto outputValue = ValueTensorFor(rank, fr.AllowBroadcast());
        // With f(x, y, z) = log(exp x + exp y + exp z) the derivative is
        //   df / dx = exp(x) / exp(f) = exp(x - f)
        gradToInput.AddElementwiseProductWithExpOfDiffOf(gradFromTop, inputValue, outputValue);
        break;
    }

    case ElementWiseOperator::opMin:
    case ElementWiseOperator::opMax:
    {
        auto inputValue  = InputRef(inputIndex).ValueTensorFor(rank, fr);
        auto outputValue = ValueTensorFor(rank, fr.AllowBroadcast());

        // POTENTIAL PROBLEM:
        // ReduceMin/Max have an edge wherever the extreme value is attained by several inputs
        // at the same time; at such points the gradient is not defined and there is no correct
        // value to propagate. Two scenarios in which this may occur:
        //
        //  * Scenario 1: The input comes from nodes such as ReLU that may operate in the regime
        //                where they clip to a constant. The poor input gradient does not matter
        //                then, since the derivative of those input nodes is zero anyway.
        //
        //  * Scenario 2: The input comes directly from training data. Bad gradients do not matter
        //                here either, since no gradient is propagated into training data.
        //
        // Lacking a better solution, and since an impact is unlikely, the current approach is kept.
        // Note that Clip, Min, Max and ReLU share the same kind of problem.
        gradToInput.AddCopyIfEqualOf(inputValue, outputValue, gradFromTop);
        break;
    }

    // more coming
    }
}
// same as GetTensorSliceFor() except that 'fr' refers to a single column, and result will not have seq/time axes // This is needed by TimesNode when the left argument has to be broken up into individual matrices/GEMM calls. // To enable its first argument to have an MBLayout, it needs to un-pad if we have an MBLayout but only refer to a single sequence and time step. TensorShape ComputationNodeBase::GetOneSampleTensorSliceFor(size_t rank, const FrameRange& fr) const { TensorShape result = GetTensorSliceFor(rank, fr); // undo the adding of (seq, time) axes that was done by GetTensorShape() if (!fr.IsOneColumnWrt(GetMBLayout())) LogicError("GetOneSampleTensorSliceFor: Requires 'fr' to refer to a single sample."); if (HasMBLayout()) result.TrimRankInPlace(rank); // Note: This function will verify once again that the extra dimensions have been reduced to [1 x 1] return result; }
// Creates the XML output document and writes it out, unless that has already been done
// (tracked by itsXMLfileCreated). The document is populated with the version string, the
// first/last frame of 'fr' with their timecodes, optional source metadata, and the detection parameters.
void Logger::createXMLDocument(string versionString, FrameRange fr, string timecodefirst, string timecodelast, DetectionParameters params)
{
    // nothing to do if the document exists already
    if (itsXMLfileCreated)
        return;

    itsXMLParser->creatDOMDocument(versionString, fr.getFirst(), fr.getLast(), timecodefirst, timecodelast);

    // source metadata is optional; it is only added when a non-empty value was specified
    const bool metadataSpecified = itsMetadataSource.getVal().length() > 0;
    if (metadataSpecified)
        itsXMLParser->addSourceMetaData(itsMetadataSource.getVal());

    // detection parameters are always added
    itsXMLParser->addDetectionParameters(params);

    itsXMLParser->writeDocument(itsSaveXMLEventSetName.getVal().c_str());
    itsXMLfileCreated = true;
}
void JitWriter::startBlock(const FrameState* frame) { newBlock(frame->abc_pc, frame); MethodSignaturep signature = method_->getMethodSignature(); const Type** types = new (abc_->alloc0()) const Type*[signature->frame_size()]; FrameRange<const FrameValue> from = range(&frame->value(0), frame, signature); FrameRange<const Type*> t = range(types, frame, signature); for (; !from.empty(); from.popFront(), t.popFront()) t.front() = jit_mgr_->lattice()->makeType(from.front()); current_block_->start_types = types; }
// Back-propagates this node's gradient into its (only) input.
// How the gradient flows depends on the reduction operation, so each op gets its own branch.
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
    assert(inputIndex == 0), inputIndex;

    // tensor views of the gradients involved
    size_t rank = DetermineElementwiseTensorRank();
    auto gradFromTop = GradientTensorFor(rank, fr);           // source: gradient of this node
    auto gradToInput = Input(0)->GradientTensorFor(rank, fr); // destination: gradient of the input

    switch (m_reductionOp)
    {
    case ElementWiseOperator::opSum:
    {
        // "Sum": the gradient is broadcast to all inputs
        gradToInput.AddCopyOf(gradFromTop);
        break;
    }

    case ElementWiseOperator::opMax:
    case ElementWiseOperator::opMin:
    {
        auto inputValue  = Input(inputIndex)->ValueTensorFor(rank, fr);
        auto outputValue = ValueTensorFor(rank, fr.AllowBroadcast());

        // POTENTIAL PROBLEM:
        // ReduceMin/Max have an edge wherever the extreme value is attained by several inputs
        // at the same time; at such points the gradient is not defined and there is no correct
        // value to propagate. Two scenarios in which this may occur:
        //
        //  * Scenario 1: The input comes from nodes such as ReLU that may operate in the regime
        //                where they clip to a constant. The poor input gradient does not matter
        //                then, since the derivative of those input nodes is zero anyway.
        //
        //  * Scenario 2: The input comes directly from training data. Bad gradients do not matter
        //                here either, since no gradient is propagated into training data.
        //
        // Lacking a better solution, and since an impact is unlikely, the current approach is kept.
        // Note that Clip, Min, Max and ReLU share the same kind of problem.
        gradToInput.AddCopyIfEqualOf(inputValue, outputValue, gradFromTop);
        break;
    }

    // more coming

    // "LogPlus": softmax
    //   f(x) = log(sum_i exp x_i), hence the gradient is:
    //   df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
    //   targetGradient = gradientFromTop .* Exp (inputValue - outputValue)   --TODO: verify
    //   i.e. compute the difference of input and output, then Exp in-place. That would need temp memory,
    //   so this needs its own ternary opcode AddScaledExpOfDiff().
    }
}
void Logger::saveVisualEventSetToXML(std::list<MbariVisualEvent::VisualEvent *> &eventList, int eventframe, string eventframetimecode, FrameRange fr) { if (!itsXMLfileCreated) LFATAL("Error: Create an XML document first with createXMLDocument()"); else { itsXMLParser->add(itsSaveBoringEvents.getVal(), eventList, eventframe, eventframetimecode, itsScaleW, itsScaleH); } if (fr.getLast() == eventframe) { if (!itsXMLParser->isXMLValid(itsSaveXMLEventSetName.getVal().c_str())) LFATAL("Error: There is something wrong with the XML auto generated"); else { itsXMLParser->writeDocument(itsSaveXMLEventSetName.getVal().c_str()); LINFO("The XML output is valid"); } } }
// Writes the content of this node's value (or gradient) matrix to 'f' as formatted text, one sequence at a time.
//
//  f                 - output stream; write failures are handled by fprintfOrDie()/fflushOrDie()
//  fr                - frame range to print; unless fr.IsAllFrames(), each sequence is narrowed to the time range denoted by 'fr'
//  onlyUpToRow,
//  onlyUpToT         - (dense output only) stop printing after this many rows/time steps and emit "...+N" instead
//  transpose         - selects whether time steps or sample elements form the outer loop of the printout
//  isCategoryLabel   - if true, each column is reduced to the index of its maximum element, and only that index is printed
//  isSparse          - if true, only non-zero elements are printed, each annotated with its coordinates
//  labelMapping      - label strings, used when the format character is 's' and for annotating sparse indices
//  sequenceSeparator - printed between sequences
//  sequencePrologue,
//  sequenceEpilogue  - printed before/after each sequence; in the prologue "%x" is replaced by the shape and "%d" by the sequence id
//  elementSeparator  - printed between elements
//  sampleSeparator   - printed between samples; "%x" and "%d" are replaced as for the prologue
//  valueFormatString - printf-style format for a single value; its last character selects the kind of output:
//                      'f' (real number), 'u' (category index) or 's' (category label string).
//                      Passed by value since it is locally changed to 'u' if the label mapping does not match the dimensions.
//  outputGradient    - print Gradient() instead of Value()
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const FrameRange& fr,
                                                             size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, bool isSparse,
                                                             const vector<string>& labelMapping, const string& sequenceSeparator,
                                                             const string& sequencePrologue, const string& sequenceEpilogue,
                                                             const string& elementSeparator, const string& sampleSeparator,
                                                             string valueFormatString,
                                                             bool outputGradient) const
{
    // get minibatch matrix -> matData, matRows, matStride
    const Matrix<ElemType>& outputValues = outputGradient ? Gradient() : Value();
    let matRows   = outputValues.GetNumRows();
    let matStride = matRows; // how to get from one column to the next
    unique_ptr<ElemType[]> matDataPtr(outputValues.CopyToArray());
    ElemType* matData = matDataPtr.get();
    let sampleLayout = GetSampleLayout(); // this is currently only used for sparse; dense tensors are linearized

    // process all sequences one by one
    MBLayoutPtr pMBLayout = GetMBLayout();
    if (!pMBLayout) // no MBLayout: We are printing aggregates (or LearnableParameters?)
    {
        pMBLayout = make_shared<MBLayout>();
        pMBLayout->Init(1, outputValues.GetNumCols()); // treat this as if we have one single sequence consisting of the columns
        pMBLayout->AddSequence(0, 0, 0, outputValues.GetNumCols());
    }
    let& sequences = pMBLayout->GetAllSequences();
    let  width     = pMBLayout->GetNumTimeSteps();

    // textual form of the sample shape, used for substituting "%x"
    TensorShape tensorShape = GetSampleLayout();
    stringstream str;
    let dims = tensorShape.GetDims();
    for (auto dim : dims)
        str << dim << ' ';
    let shape = str.str(); // BUGBUG: change to string(tensorShape) to make sure we always use the same format

    // determine up-front which placeholders need substituting, so that the per-sequence work is skipped if none is present
    bool sequencePrologueHasShape = sequencePrologue.find("%x") != sequencePrologue.npos;
    bool sampleSeparatorHasShape  = sampleSeparator.find("%x")  != sampleSeparator.npos;
    bool sequencePrologueHasSeqId = sequencePrologue.find("%d") != sequencePrologue.npos;
    bool sampleSeparatorHasSeqId  = sampleSeparator.find("%d")  != sampleSeparator.npos;

    for (size_t s = 0; s < sequences.size(); s++)
    {
        const auto& seqInfo = sequences[s];
        if (seqInfo.seqId == GAP_SEQUENCE_ID) // nothing in gaps to print
            continue;
        let tBegin = seqInfo.tBegin >= 0     ? seqInfo.tBegin : 0;
        let tEnd   = seqInfo.tEnd   <= width ? seqInfo.tEnd   : width;
        // [tBegin,tEnd) is where the sequence resides.
        // fr is also referencing where a sequence resides.

        // narrow to FrameRange if needed
        auto t0 = fr.IsAllFrames() ? tBegin : fr.m_timeOffset + (ptrdiff_t)fr.timeIdxInSeq;
        auto t1 = fr.IsAllFrames() ? tEnd   : fr.m_timeOffset + (ptrdiff_t)fr.timeIdxInSeq + (ptrdiff_t)fr.m_timeRange;
        if (t0 < tBegin)
            t0 = tBegin;
        if (t1 > tEnd)
            t1 = tEnd;
        // [t0,t1) is the range we want to print
        if (t0 > (ptrdiff_t)t1)
            continue; // skip this sequence

        // get sequence matrix -> seqData, seqRows, seqCols, seqStride
        let  seqData   = matData + pMBLayout->GetColumnIndex(seqInfo, t0 - tBegin) * matStride;
        auto seqRows   = matRows;
        let  seqCols   = t1 - t0;
        let  seqStride = pMBLayout->GetNumParallelSequences() * matStride;

        // substitute the placeholders in the prologue and sample separator for this sequence
        auto seqProl   = sequencePrologue;
        auto sampleSep = sampleSeparator;

        if (sequencePrologueHasShape || sampleSeparatorHasShape)
        {
            // "%x" -> sample shape followed by the number of time steps
            // Note: the conversion specifier must match the 'unsigned long long' argument, hence %llu.
            auto sh = msra::strfun::_strprintf<char>("%s%llu", shape.c_str(), (unsigned long long)seqInfo.GetNumTimeSteps());
            if (sequencePrologueHasShape)
                seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%x", sh);
            if (sampleSeparatorHasShape)
                sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%x", sh);
        }

        if (sequencePrologueHasSeqId || sampleSeparatorHasSeqId)
        {
            // "%d" -> sequence id
            auto sh = msra::strfun::_strprintf<char>("%llu", (unsigned long long)seqInfo.seqId);
            if (sequencePrologueHasSeqId)
                seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%d", sh);
            if (sampleSeparatorHasSeqId)
                sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%d", sh);
        }

        if (s > 0)
            fprintfOrDie(f, "%s", sequenceSeparator.c_str());
        fprintfOrDie(f, "%s", seqProl.c_str());

        // output it according to our format specification
        auto formatChar = valueFormatString.back();
        if (isCategoryLabel) // if is category then find the max value and output its index (possibly mapped to a string)
        {
            if (formatChar == 's') // verify label dimension
            {
                if (outputValues.GetNumRows() != labelMapping.size() &&
                    sampleLayout[0] != labelMapping.size()) // if we match the first dim then use that
                {
                    static size_t warnings = 0;
                    if (warnings++ < 5)
                        fprintf(stderr, "write: Row dimension %d does not match number of entries %d in labelMappingFile, not using mapping\n", (int)seqRows, (int)labelMapping.size());
                    valueFormatString.back() = 'u'; // this is a fallback
                    formatChar = valueFormatString.back();
                }
            }
            // update the matrix in-place from one-hot (or max) to index
            // find the max in each column
            for (size_t j = 0; j < seqCols; j++) // loop over all time steps of the sequence
            {
                double maxLoc = -1;
                double maxVal = 0;
                for (size_t i = 0; i < seqRows; i++) // loop over rows
                {
                    let val = seqData[i + j * seqStride];
                    if (maxLoc < 0 || val >= maxVal)
                    {
                        maxLoc = (double)i;
                        maxVal = val;
                    }
                }
                seqData[0 + j * seqStride] = (ElemType)maxLoc; // overwrite first element in-place
            }
            seqRows = 1; // ignore remaining dimensions
        }

        // function to print a value
        auto print = [&](double dval)
        {
            if (formatChar == 'f') // print as real number
            {
                if (dval == 0)
                    dval = fabs(dval); // clear the sign of a negative 0, which are produced inconsistently between CPU and GPU
                fprintfOrDie(f, valueFormatString.c_str(), dval);
            }
            else if (formatChar == 'u') // print category as integer index
            {
                fprintfOrDie(f, valueFormatString.c_str(), (unsigned int)dval);
            }
            else if (formatChar == 's') // print category as a label string
            {
                size_t uval = (size_t)dval;
                if (!labelMapping.empty())
                    uval %= labelMapping.size();
                assert(uval < labelMapping.size());
                const char * sval = labelMapping[uval].c_str();
                fprintfOrDie(f, valueFormatString.c_str(), sval);
            }
        };

        // bounds for printing
        let iend    = transpose ? seqRows     : seqCols;     // true dimension of the data to print
        let jend    = transpose ? seqCols     : seqRows;
        let istop   = transpose ? onlyUpToRow : onlyUpToT;   // we stop at these dimensions (for debugging, one often needs only the first few values of those huge matrices)
        let jstop   = transpose ? onlyUpToT   : onlyUpToRow;
        let istride = transpose ? 1           : seqStride;
        let jstride = transpose ? seqStride   : 1;
        if (isSparse)
        {
            // sparse linearizes the entire matrix into a single vector, and prints that one with coordinates
            // TODO: This can be done more nicely. We should keep the block structure.
            size_t numPrinted = 0;
            for (size_t i = 0; i < iend; i++) // loop over elements --we just flatten them all out
            {
                for (size_t j = 0; j < jend; j++) // loop over rows
                {
                    double dval = seqData[i * istride + j * jstride];
                    if (dval == 0) // only print non-0 values
                        continue;
                    // NOTE(review): this uses the un-substituted 'sampleSeparator' rather than 'sampleSep' -- confirm whether that is intended
                    if (numPrinted++ > 0)
                        fprintfOrDie(f, "%s", transpose ? sampleSeparator.c_str() : elementSeparator.c_str());
                    if (dval != 1.0 || formatChar != 'f') // hack: we assume that we are either one-hot or never precisely hitting 1.0
                        print(dval);
                    size_t row = transpose ? i : j;
                    size_t col = transpose ? j : i;
                    // decompose the linear row index into tensor coordinates
                    for (size_t k = 0; k < sampleLayout.size(); k++)
                    {
                        // the values are size_t; convert explicitly to match the %d specifier
                        fprintfOrDie(f, "%c%d", k == 0 ? '[' : ',', (int)(row % sampleLayout[k]));
                        if (sampleLayout[k] == labelMapping.size()) // annotate index with label if dimensions match (which may misfire once in a while)
                            fprintfOrDie(f, "=%s", labelMapping[row % sampleLayout[k]].c_str());
                        row /= sampleLayout[k];
                    }
                    if (seqInfo.GetNumTimeSteps() > 1)
                        fprintfOrDie(f, ";%d", (int)col);
                    fprintfOrDie(f, "]");
                }
            }
        }
        else
        {
            for (size_t j = 0; j < jend; j++) // loop over output rows     --BUGBUG: row index is 'i'!! Rename these!!
            {
                if (j > 0)
                    fprintfOrDie(f, "%s", sampleSep.c_str());
                if (j == jstop && jstop < jend - 1) // if jstop == jend-1 we may as well just print the value instead of '...'
                {
                    fprintfOrDie(f, "...+%d", (int)(jend - jstop)); // 'nuff said
                    break;
                }
                // inject sample tensor index if we are printing row-wise and it's a tensor
                if (!transpose && sampleLayout.size() > 1 && !isCategoryLabel) // each row is a different sample dimension
                {
                    // the cast must apply to the final coordinate (after the modulo) so that an 'int' is passed for %d
                    for (size_t k = 0; k < sampleLayout.size(); k++)
                        fprintfOrDie(f, "%c%d", k == 0 ? '[' : ',', (int)((j / sampleLayout.GetStrides()[k]) % sampleLayout[k]));
                    fprintfOrDie(f, "]\t");
                }
                // print a row of values
                for (size_t i = 0; i < iend; i++) // loop over elements
                {
                    if (i > 0)
                        fprintfOrDie(f, "%s", elementSeparator.c_str());
                    if (i == istop && istop < iend - 1)
                    {
                        fprintfOrDie(f, "...+%d", (int)(iend - istop));
                        break;
                    }
                    double dval = seqData[i * istride + j * jstride];
                    print(dval);
                }
            }
        }
        fprintfOrDie(f, "%s", sequenceEpilogue.c_str());
    } // end loop over sequences
    fflushOrDie(f);
}