// Runs forward propagation of activations on the input line. // See NetworkCpp for a detailed discussion of the arguments. void Parallel::Forward(bool debug, const NetworkIO& input, const TransposedArray* input_transpose, NetworkScratch* scratch, NetworkIO* output) { bool parallel_debug = false; // If this parallel is a replicator of convolvers, or holds a 1-d LSTM pair, // or a 2-d LSTM quad, do debug locally, and don't pass the flag on. if (debug && type_ != NT_PARALLEL) { parallel_debug = true; debug = false; } int stack_size = stack_.size(); if (type_ == NT_PAR_2D_LSTM) { // Special case, run parallel in parallel. GenericVector<NetworkScratch::IO> results; results.init_to_size(stack_size, NetworkScratch::IO()); for (int i = 0; i < stack_size; ++i) { results[i].Resize(input, stack_[i]->NumOutputs(), scratch); } #ifdef _OPENMP #pragma omp parallel for num_threads(stack_size) #endif for (int i = 0; i < stack_size; ++i) { stack_[i]->Forward(debug, input, nullptr, scratch, results[i]); } // Now pack all the results (serially) into the output. int out_offset = 0; output->Resize(*results[0], NumOutputs()); for (int i = 0; i < stack_size; ++i) { out_offset = output->CopyPacking(*results[i], out_offset); } } else { // Revolving intermediate result. NetworkScratch::IO result(input, scratch); // Source for divided replicated. NetworkScratch::IO source_part; TransposedArray* src_transpose = nullptr; if (IsTraining() && type_ == NT_REPLICATED) { // Make a transposed copy of the input. input.Transpose(&transposed_input_); src_transpose = &transposed_input_; } // Run each network, putting the outputs into result. 
int out_offset = 0; for (int i = 0; i < stack_size; ++i) { stack_[i]->Forward(debug, input, src_transpose, scratch, result); // All networks must have the same output width if (i == 0) { output->Resize(*result, NumOutputs()); } else { ASSERT_HOST(result->Width() == output->Width()); } out_offset = output->CopyPacking(*result, out_offset); } } if (parallel_debug) { DisplayForward(*output); } }
// Runs forward propagation of activations on the input line.
// See NetworkCpp for a detailed discussion of the arguments.
// Walks the input one timestep at a time (in the order given by the input's
// stride map), computes the gate activations, updates the cell state and
// writes the resulting activations to output. When IsTraining() is true the
// gate values and states are also stored per timestep in node_values_ and
// state_. input_transpose is not referenced in this function body.
void LSTM::Forward(bool debug, const NetworkIO& input,
                   const TransposedArray* input_transpose,
                   NetworkScratch* scratch, NetworkIO* output) {
  input_map_ = input.stride_map();
  input_width_ = input.Width();
  // Size the output according to the LSTM variant: float output when a
  // softmax is attached, ResizeXTo1 for NT_LSTM_SUMMARY, otherwise a plain
  // resize matching the input. All use no_ as the number of outputs.
  if (softmax_ != NULL)
    output->ResizeFloat(input, no_);
  else if (type_ == NT_LSTM_SUMMARY)
    output->ResizeXTo1(input, no_);
  else
    output->Resize(input, no_);
  ResizeForward(input);
  // Temporary storage of forward computation for each gate.
  NetworkScratch::FloatVec temp_lines[WT_COUNT];
  for (int i = 0; i < WT_COUNT; ++i) temp_lines[i].Init(ns_, scratch);
  // Single timestep buffers for the current/recurrent output and state.
  // Both start out zeroed.
  NetworkScratch::FloatVec curr_state, curr_output;
  curr_state.Init(ns_, scratch);
  ZeroVector<double>(ns_, curr_state);
  curr_output.Init(ns_, scratch);
  ZeroVector<double>(ns_, curr_output);
  // Rotating buffers of width buf_width allow storage of the state and output
  // for the other dimension, used only when working in true 2D mode. The width
  // is enough to hold an entire strip of the major direction.
  int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1;
  GenericVector<NetworkScratch::FloatVec> states, outputs;
  if (Is2D()) {
    states.init_to_size(buf_width, NetworkScratch::FloatVec());
    outputs.init_to_size(buf_width, NetworkScratch::FloatVec());
    for (int i = 0; i < buf_width; ++i) {
      states[i].Init(ns_, scratch);
      ZeroVector<double>(ns_, states[i]);
      outputs[i].Init(ns_, scratch);
      ZeroVector<double>(ns_, outputs[i]);
    }
  }
  // Used only if a softmax LSTM.
  NetworkScratch::FloatVec softmax_output;
  NetworkScratch::IO int_output;
  if (softmax_ != NULL) {
    softmax_output.Init(no_, scratch);
    ZeroVector<double>(no_, softmax_output);
    // int_output is a single-timestep integer buffer, allocated only when the
    // input is in int mode; it carries curr_output into the softmax below.
    int rounded_softmax_inputs = gate_weights_[CI].RoundInputs(ns_);
    if (input.int_mode())
      int_output.Resize2d(true, 1, rounded_softmax_inputs, scratch);
    softmax_->SetupForward(input, NULL);
  }
  // Float copy of the current timestep of source_, read only in float mode.
  NetworkScratch::FloatVec curr_input;
  curr_input.Init(na_, scratch);
  StrideMap::Index src_index(input_map_);
  // Used only by NT_LSTM_SUMMARY.
  StrideMap::Index dest_index(output->stride_map());
  do {
    int t = src_index.t();
    // True if there is a valid old state for the 2nd dimension.
    // That is: 2D mode, and stepping back by one in FD_HEIGHT succeeds.
    bool valid_2d = Is2D();
    if (valid_2d) {
      StrideMap::Index dim_index(src_index);
      if (!dim_index.AddOffset(-1, FD_HEIGHT)) valid_2d = false;
    }
    // Index of the 2-D revolving buffers (outputs, states).
    int mod_t = Modulo(t, buf_width);  // Current timestep.
    // Setup the padded input in source.
    // Layout written at timestep t of source_:
    //   [0, ni_)                        copy of the input at t,
    //   [ni_, ni_ + nf_)                softmax_output (softmax LSTM only),
    //   [ni_ + nf_, ni_ + nf_ + ns_)    curr_output from the previous step,
    //   next ns_ values                 outputs[mod_t] (2D mode only).
    source_.CopyTimeStepGeneral(t, 0, ni_, input, t, 0);
    if (softmax_ != NULL) {
      source_.WriteTimeStepPart(t, ni_, nf_, softmax_output);
    }
    source_.WriteTimeStepPart(t, ni_ + nf_, ns_, curr_output);
    if (Is2D())
      source_.WriteTimeStepPart(t, ni_ + nf_ + ns_, ns_, outputs[mod_t]);
    if (!source_.int_mode()) source_.ReadTimeStep(t, curr_input);
    // Matrix multiply the inputs with the source.
    // NOTE(review): PARALLEL_IF_OPENMP / SECTION_IF_OPENMP /
    // END_PARALLEL_IF_OPENMP are macros defined outside this chunk; presumably
    // they expand to OpenMP parallel sections when OpenMP is enabled - confirm.
    // Each delimited section below writes a different element of temp_lines;
    // the 1-D and 2-D forget gates share one section.
    PARALLEL_IF_OPENMP(GFS)
    // It looks inefficient to create the threads on each t iteration, but the
    // alternative of putting the parallel outside the t loop, a single around
    // the t-loop and then tasks in place of the sections is a *lot* slower.
    // Cell inputs.
    if (source_.int_mode())
      gate_weights_[CI].MatrixDotVector(source_.i(t), temp_lines[CI]);
    else
      gate_weights_[CI].MatrixDotVector(curr_input, temp_lines[CI]);
    FuncInplace<GFunc>(ns_, temp_lines[CI]);

    SECTION_IF_OPENMP
    // Input Gates.
    if (source_.int_mode())
      gate_weights_[GI].MatrixDotVector(source_.i(t), temp_lines[GI]);
    else
      gate_weights_[GI].MatrixDotVector(curr_input, temp_lines[GI]);
    FuncInplace<FFunc>(ns_, temp_lines[GI]);

    SECTION_IF_OPENMP
    // 1-D forget gates.
    if (source_.int_mode())
      gate_weights_[GF1].MatrixDotVector(source_.i(t), temp_lines[GF1]);
    else
      gate_weights_[GF1].MatrixDotVector(curr_input, temp_lines[GF1]);
    FuncInplace<FFunc>(ns_, temp_lines[GF1]);

    // 2-D forget gates.
    if (Is2D()) {
      if (source_.int_mode())
        gate_weights_[GFS].MatrixDotVector(source_.i(t), temp_lines[GFS]);
      else
        gate_weights_[GFS].MatrixDotVector(curr_input, temp_lines[GFS]);
      FuncInplace<FFunc>(ns_, temp_lines[GFS]);
    }

    SECTION_IF_OPENMP
    // Output gates.
    if (source_.int_mode())
      gate_weights_[GO].MatrixDotVector(source_.i(t), temp_lines[GO]);
    else
      gate_weights_[GO].MatrixDotVector(curr_input, temp_lines[GO]);
    FuncInplace<FFunc>(ns_, temp_lines[GO]);
    END_PARALLEL_IF_OPENMP

    // Apply forget gate to state.
    MultiplyVectorsInPlace(ns_, temp_lines[GF1], curr_state);
    if (Is2D()) {
      // Max-pool the forget gates (in 2-d) instead of blindly adding.
      // which_fg_[t] records, per state element, which forget gate was used:
      // 1 for the 1-D gate (GF1), 2 for the 2-D gate (GFS).
      inT8* which_fg_col = which_fg_[t];
      memset(which_fg_col, 1, ns_ * sizeof(which_fg_col[0]));
      if (valid_2d) {
        // Where the 2-D gate is the larger one, the state is replaced by the
        // 2-D gate applied to the state saved in the revolving buffer.
        const double* stepped_state = states[mod_t];
        for (int i = 0; i < ns_; ++i) {
          if (temp_lines[GF1][i] < temp_lines[GFS][i]) {
            curr_state[i] = temp_lines[GFS][i] * stepped_state[i];
            which_fg_col[i] = 2;
          }
        }
      }
    }
    // NOTE(review): presumably adds CI * GI elementwise into curr_state; the
    // helper is not visible here - confirm.
    MultiplyAccumulate(ns_, temp_lines[CI], temp_lines[GI], curr_state);
    // Clip curr_state to a sane range.
    ClipVector<double>(ns_, -kStateClip, kStateClip, curr_state);
    if (IsTraining()) {
      // Save the gate node values.
      node_values_[CI].WriteTimeStep(t, temp_lines[CI]);
      node_values_[GI].WriteTimeStep(t, temp_lines[GI]);
      node_values_[GF1].WriteTimeStep(t, temp_lines[GF1]);
      node_values_[GO].WriteTimeStep(t, temp_lines[GO]);
      if (Is2D()) node_values_[GFS].WriteTimeStep(t, temp_lines[GFS]);
    }
    // Compute the new output from the clipped state and the output gate.
    FuncMultiply<HFunc>(curr_state, temp_lines[GO], ns_, curr_output);
    if (IsTraining()) state_.WriteTimeStep(t, curr_state);
    if (softmax_ != NULL) {
      // Softmax LSTM: run curr_output through the softmax, via the integer
      // buffer in int mode, and emit the softmax result as this step's output.
      if (input.int_mode()) {
        int_output->WriteTimeStepPart(0, 0, ns_, curr_output);
        softmax_->ForwardTimeStep(NULL, int_output->i(0), t, softmax_output);
      } else {
        softmax_->ForwardTimeStep(curr_output, NULL, t, softmax_output);
      }
      output->WriteTimeStep(t, softmax_output);
      // The encoding is applied after the output has been written, so it only
      // affects softmax_output as written into source_ on the next timestep.
      if (type_ == NT_LSTM_SOFTMAX_ENCODED) {
        CodeInBinary(no_, nf_, softmax_output);
      }
    } else if (type_ == NT_LSTM_SUMMARY) {
      // Output only at the end of a row.
      if (src_index.IsLast(FD_WIDTH)) {
        output->WriteTimeStep(dest_index.t(), curr_output);
        dest_index.Increment();
      }
    } else {
      output->WriteTimeStep(t, curr_output);
    }
    // Save states for use by the 2nd dimension only if needed.
    if (Is2D()) {
      CopyVector(ns_, curr_state, states[mod_t]);
      CopyVector(ns_, curr_output, outputs[mod_t]);
    }
    // Always zero the states at the end of every row, but only for the major
    // direction. The 2-D state remains intact.
    if (src_index.IsLast(FD_WIDTH)) {
      ZeroVector<double>(ns_, curr_state);
      ZeroVector<double>(ns_, curr_output);
    }
  } while (src_index.Increment());
  // Optional dump of the source, state and output, compiled in only when
  // DEBUG_DETAIL is greater than zero.
#if DEBUG_DETAIL > 0
  tprintf("Source:%s\n", name_.string());
  source_.Print(10);
  tprintf("State:%s\n", name_.string());
  state_.Print(10);
  tprintf("Output:%s\n", name_.string());
  output->Print(10);
#endif
  if (debug) DisplayForward(*output);
}
// Recognizes the image_data, returning the labels, // scores, and corresponding pairs of start, end x-coords in coords. bool LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, bool debug, bool re_invert, float* scale_factor, NetworkIO* inputs, NetworkIO* outputs) { // Maximum width of image to train on. const int kMaxImageWidth = 2560; // This ensures consistent recognition results. SetRandomSeed(); int min_width = network_->XScaleFactor(); Pix* pix = Input::PrepareLSTMInputs(image_data, network_, min_width, &randomizer_, scale_factor); if (pix == NULL) { tprintf("Line cannot be recognized!!\n"); return false; } if (network_->IsTraining() && pixGetWidth(pix) > kMaxImageWidth) { tprintf("Image too large to learn!! Size = %dx%d\n", pixGetWidth(pix), pixGetHeight(pix)); pixDestroy(&pix); return false; } // Reduction factor from image to coords. *scale_factor = min_width / *scale_factor; inputs->set_int_mode(IsIntMode()); SetRandomSeed(); Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs); network_->Forward(debug, *inputs, NULL, &scratch_space_, outputs); // Check for auto inversion. float pos_min, pos_mean, pos_sd; OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd); if (invert && pos_min < 0.5) { // Run again inverted and see if it is any better. NetworkIO inv_inputs, inv_outputs; inv_inputs.set_int_mode(IsIntMode()); SetRandomSeed(); pixInvert(pix, pix); Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, &inv_inputs); network_->Forward(debug, inv_inputs, NULL, &scratch_space_, &inv_outputs); float inv_min, inv_mean, inv_sd; OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd); if (inv_min > pos_min && inv_mean > pos_mean && inv_sd < pos_sd) { // Inverted did better. Use inverted data. 
if (debug) { tprintf("Inverting image: old min=%g, mean=%g, sd=%g, inv %g,%g,%g\n", pos_min, pos_mean, pos_sd, inv_min, inv_mean, inv_sd); } *outputs = inv_outputs; *inputs = inv_inputs; } else if (re_invert) { // Inverting was not an improvement, so undo and run again, so the // outputs match the best forward result. SetRandomSeed(); network_->Forward(debug, *inputs, NULL, &scratch_space_, outputs); } } pixDestroy(&pix); if (debug) { GenericVector<int> labels, coords; LabelsFromOutputs(*outputs, &labels, &coords); DisplayForward(*inputs, labels, coords, "LSTMForward", &debug_win_); DebugActivationPath(*outputs, labels, coords); } return true; }