Code example #1
File: fnnlm_base.cpp  Project: hao-fang/NLMRepo
void FNeuralNetLMBase::BatchSGDTrain(FNNLMDataReader &train_data, FNNLMDataReader &validation_data,
                                     const string &outbase, bool nce_ppl) {
  const size_t eos_widx = word_vocab_.eos_idx();
  const size_t unk_widx = word_vocab_.unk_idx();
  const vector<size_t> eos_fidx = { factor_vocab_.eos_idx() };

  vector<pair<size_t, vector<size_t>>> sentence;

  double last_logp = -numeric_limits<double>::max();
  double curr_logp = -numeric_limits<double>::max();
  bool halve_alpha = false;
  // Initialize the learning rate from the algorithm options.
  float curr_learning_rate = algopts_.init_learning_rate_;

  size_t sents_processed = 0;
  int iteration = 0;

  clock_t start_time = clock();
  clock_t end_time = start_time;
  while (true) {
    cout << "******************************* ITERATION " << iteration++ << " *******************************" << endl;

    train_data.StartEpoch();

    ResetActivations();

    cout << "learning_rate = " << curr_learning_rate << endl;

    int bpos = 0;
    double logp = 0.0;
    nce_obj_ = 0;
    size_t ivcount = 0;
    size_t oovcount = 0;
    // NOTE: the vector "sentence" does not include </s> at the end!
    while (train_data.GetSentence(sentence)) {
      assert(!sentence.empty());

      if (independent_) {
        ResetActivations();
      }
      ForwardPropagate(eos_widx, eos_fidx);
      for (vector<pair<size_t, vector<size_t>>>::const_iterator it = sentence.begin(); it != sentence.end(); ++it) {
        // Backpropagate through every token, OOVs included, since <unk> is in the
        // vocabulary; when unk_ is false, OOVs are only counted, not scored.
        if (!unk_ && it->first == unk_widx) {
          oovcount++;
        } else {
          logp += GetLogProb(it->first, !nce_);
          ivcount++;
        }
        BackPropagate(it->first, it->second);
        if (++bpos == algopts_.batch_size_) {
          // A full minibatch has accumulated: apply the pending weight update.
          FastUpdateWeightsMajor(curr_learning_rate);
          bpos = 0;
        }
        ForwardPropagate(it->first, it->second);
      }
      logp += GetLogProb(eos_widx, !nce_);  // score </s>: unnormalized under NCE
      ivcount++;
      BackPropagate(eos_widx, eos_fidx);

      sents_processed++;
      if ((sents_processed % 500) == 0)
        cout << "." << flush;
    }
    // Flush gradients accumulated since the last full minibatch of this epoch.
    FastUpdateWeightsMajor(curr_learning_rate);
    bpos = 0;
    FastUpdateWeightsMinor();

    cout << "\nnumber of IV words (including </s>) in training: " << ivcount << endl;
    cout << "number of OOV words in training: " << oovcount << endl;
    if (!nce()) {
      cout << "training entropy (base 2): " << -logp / log(2) / ivcount << endl;
      cout << "model perplexity on training: " << exp(-logp / ivcount) << endl;
      cout << "log-likelihood (base e) on training is: " << logp << endl;
    } else {
      cout << "NCE objective value on training is: " << nce_obj_ << endl;
      cout << "un-normalized training entropy (base 2): " << -logp / log(2) / ivcount << endl;
      cout << "un-normalized model perplexity on training: " << exp(-logp / ivcount) << endl;
      cout << "un-normalized log-likelihood (base e) on training is: " << logp << endl;
    }
    cout << "epoch finished" << endl << flush;

    if (!outbase.empty()) {
      if (debug_ > 0) {
        WriteLM(outbase + ".ITER_" + to_string(iteration - 1));
      }
    }

    cout << "----------VALIDATION----------" << endl;
    curr_logp = EvalLM(validation_data, nce_ppl);  // assign, not redeclare: a fresh declaration here would shadow curr_logp from the enclosing scope
    cout << "log-likelihood (base e) on validation is: " << curr_logp << endl;

    clock_t last_end_time = end_time;
    end_time = clock();
    cout << "time elapsed "
        << static_cast<double>(end_time - last_end_time) / CLOCKS_PER_SEC << " secs for this iteration out of "
        << static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC << " secs in total." << endl;

    if (curr_logp < last_logp) {
      cout << "validation log-likelihood decreased; restoring previous parameters" << endl;
      RestoreLastParams();
    } else {
      CacheCurrentParams();
    }

    // Too little relative improvement: halve the learning rate from now on,
    // and stop after a second failure.
    if (curr_logp * algopts_.min_improvement_ <= last_logp) {
      if (!halve_alpha) {
        halve_alpha = true;
      } else {
        if (!outbase.empty()) {
          WriteLM(outbase);
        }
        break;
      }
    }

    if (halve_alpha) {
      curr_learning_rate /= 2;
    }

    last_logp = curr_logp;
  }
}
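
The stopping logic at the bottom of BatchSGDTrain implements a two-stage schedule: while validation log-likelihood improves by at least the relative margin min_improvement_, the learning rate stays fixed; after the first epoch that fails the test the rate is halved every epoch, and a second failure ends training. Below is a minimal, self-contained C++ sketch of just that schedule; the validation numbers and the min_improvement value are made up for illustration and are not output of NLMRepo.

#include <iostream>
#include <limits>
#include <vector>

int main() {
  // Hypothetical per-epoch validation log-likelihoods (base e), for illustration only.
  const std::vector<double> val_logp = {-9000.0, -8500.0, -8300.0, -8290.0, -8288.0};

  const double min_improvement = 1.003;  // stand-in for algopts_.min_improvement_
  double last_logp = -std::numeric_limits<double>::max();
  double learning_rate = 0.1;            // stand-in for algopts_.init_learning_rate_
  bool halve_alpha = false;

  for (double curr_logp : val_logp) {
    std::cout << "lr=" << learning_rate << "  val logp=" << curr_logp << '\n';
    // Log-likelihoods are negative, so multiplying curr_logp by a factor > 1
    // pushes it further down; the test asks whether this epoch beat the last
    // one by at least that relative margin.
    if (curr_logp * min_improvement <= last_logp) {
      if (!halve_alpha) {
        halve_alpha = true;   // first failure: start halving the learning rate
      } else {
        std::cout << "second failure while halving: stop training\n";
        break;                // second failure: treat as converged
      }
    }
    if (halve_alpha) learning_rate /= 2;
    last_logp = curr_logp;
  }
  return 0;
}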
Code example #2
File: newalg.c  Project: seanderson/wordrep
int main(int argc, char **argv) {
    int i, k = 0; // loop counters
    if(argc == 1) { //printing instructions
        printf("\n");
        printf("Forward propagation of sentences in a file delimited by \\n\n\n");
        printf("Parameters:\n");
        printf("\tValue for the vocabulary size that resulted from training (first number in the output file of word2vec):\n");
        printf("\t\t-vocab_size <int>\n");
        printf("\tValue for the layer size used in training (second number in the output file of word2vec):\n");
        printf("\t\t-layer_size <int>\n");
        printf("\tValue for the window size:\n");
        printf("\t\t-window <int>\n\n");
        return 0;
    }
    // Read command-line arguments.
    if ((i = ArgPos((char *)"-layer_size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-vocab_size", argc, argv)) > 0) vocab_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);

    // allocating memory to store the network elements
    syn0 = (real *)calloc(layer1_size*vocab_size,sizeof(real));
    syn1 = (real *)calloc(layer1_size*vocab_size,sizeof(real));
    neu1 = (real *)calloc(layer1_size,sizeof(real));

    index_buff = (char *)calloc(MAX_INDEX_BUFF_SIZE,sizeof(char));
    // reading the network from file
    read_syn0();
    read_syn1();

    expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); // allocate the sigmoid lookup table
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // e^x for x in [-MAX_EXP, MAX_EXP), as in word2vec
        expTable[i] = expTable[i] / (expTable[i] + 1);                   // sigmoid: e^x / (e^x + 1)
    }
    //building the vocabulary and the vocabulary hash from the files it was stored in
    BuildVocabFromFile();
    BuildVocabHashFromFile();

    int length = 0; // word count of the current sentence
    int syno_length = 0; // number of synonyms/replacements for the current line
    long long * sen; // sentence with words represented as vocabulary indices
    long long * sen_temp; // working copy of the sentence where replacements are substituted
    sen_temp = (long long *)calloc(MAX_SENTENCE_LENGTH,sizeof(long long)); //allocating memory for sen_temp
    long long * synonym; //replacement word (in vocabulary index form)
    long double prob = 0; //probability variable
    long long ptr = 0, ptr_temp = 0; //pointer used to go through the sentences file
    long long syno_ptr = 0, syno_ptr_temp = 0; //pointer used to go through the synonyms/replacements file


    FILE *sentfile = fopen("sentences","r");
    FILE *indices = fopen("indices","r");
    FILE *synfile = fopen("synonyms","r");
    FILE *fo = fopen("wordprobs","w");
    if (!sentfile || !indices || !synfile || !fo) { // fail fast if any file is missing
        fprintf(stderr,"error: could not open one of \"sentences\", \"indices\", \"synonyms\", \"wordprobs\"\n");
        return 1;
    }
    int lines = 0;
    char line[MAX_SENTENCE_LENGTH]; // buffer to store current sentence
    char synline[MAX_SENTENCE_LENGTH]; // buffer to store synonyms


    lines = Lines(sentfile); // number of lines in the sentences file, used as the outer-loop bound;
    // this works because "sentences", "synonyms" and "indices" all have the same number of "\n"-delimited lines
    rewind(sentfile);
    rewind(synfile);

    for(i = 0; i<lines; i++) { //outer loop iterating through "sentences", "synonyms" and "indices" line by line

        // read sentence
        ptr = ftell(sentfile); // store beginning of line
        if (readLine(sentfile,line) < 0) break;
        length = LineWordCount(line);
        //printf("sent words %d\n",length);

        // read word replacements
        syno_ptr = ftell(synfile); // store beginning of line
        if (readLine(synfile,synline) < 0) break;
        syno_length = LineWordCount(synline);
        printf("synline %s\n",synline);

        fseek(sentfile,ptr,SEEK_SET); // move the pointer back to the beginning of the line
        sen = FileToSen(length,sentfile); //sen is an array of longs with the words of the sentence in a vocabulary index format

        fseek(synfile,syno_ptr,SEEK_SET);
        synonym = FileToSen(syno_length,synfile); //synonym is an array of longs with the replacements/synonyms from the "synonyms" file in vocabulary index format

        fseek(sentfile,1,SEEK_CUR); // skip the trailing newline
        fseek(synfile,1,SEEK_CUR);  // skip the trailing newline

        ReadIndexFromFile(indices); //reads the index and puts it in the char array "index_buff"
        target_index = GetIndex(); //returns a numerical value from what is in the char array "index_buff"
        for(k=0; k<syno_length; k++) { //repeat forward propagation for each replacement on this line
            memcpy(sen_temp,sen,length*sizeof(long long)); //copy only the length words of the sentence; copying MAX_SENTENCE_LENGTH elements could read past the end of sen
            sen_temp[target_index] = synonym[k]; //substitute the replacement word at the target position
            prob = ForwardPropagate(length,sen_temp); //doing forward propagation to get the probability
            //prob = prob * 100000; // optionally scale the probability by 100000 or take its negative log here

            fprintf(fo,"%s %Lf\n",vocab[synonym[k]].word,prob); // write the replacement word and its probability
        }
    }

    fclose(fo);
    fclose(sentfile);
    fclose(synfile);
    fclose(indices);

    return 0;
}
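
The expTable setup in main() follows word2vec: instead of calling exp() for every output word during forward propagation, the sigmoid e^x / (e^x + 1) is precomputed at EXP_TABLE_SIZE evenly spaced points covering [-MAX_EXP, MAX_EXP) and read back by linear index. Below is a self-contained C++ sketch of the table plus the matching lookup; the constants 1000 and 6 are word2vec's defaults (whether newalg.c uses the same values is an assumption), and sigmoid_lookup is an illustrative helper, not a function from this project.

#include <cmath>
#include <cstdio>
#include <vector>

constexpr int EXP_TABLE_SIZE = 1000; // word2vec default (assumed here)
constexpr int MAX_EXP = 6;           // word2vec default (assumed here)

int main() {
    // Precompute sigma(x) = e^x / (e^x + 1) at EXP_TABLE_SIZE points over [-MAX_EXP, MAX_EXP).
    std::vector<float> expTable(EXP_TABLE_SIZE);
    for (int i = 0; i < EXP_TABLE_SIZE; i++) {
        float e = std::exp((i / (float)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
        expTable[i] = e / (e + 1);
    }

    // Lookup: map x in (-MAX_EXP, MAX_EXP) to a table index, mirroring word2vec's indexing.
    auto sigmoid_lookup = [&](float x) {
        if (x <= -MAX_EXP) return 0.0f; // saturate below the table
        if (x >= MAX_EXP) return 1.0f;  // saturate above the table
        return expTable[(int)((x + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
    };

    std::printf("sigma(0) ~ %f (exact 0.500000)\n", sigmoid_lookup(0.0f));
    std::printf("sigma(2) ~ %f (exact %f)\n", sigmoid_lookup(2.0f), 1.0 / (1.0 + std::exp(-2.0)));
    return 0;
}

The lookup trades a small quantization error (the printed values differ from the exact sigmoid in the third decimal) for avoiding an exp() call per output word.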
Code example #3
File: fnnlm_base.cpp  Project: hao-fang/NLMRepo
double FNeuralNetLMBase::EvalLM(FNNLMDataReader &data, bool nce_ppl) {
  CheckParams();

  if (nce_ppl && !nce_) {
    cerr << "nce_ppl == true but nce_ is false!" << endl;
    exit(EXIT_FAILURE);
  }

  const size_t eos_widx = word_vocab_.eos_idx();
  const size_t unk_widx = word_vocab_.unk_idx();
  const vector<size_t> eos_fidx = { factor_vocab_.eos_idx() };

  vector<pair<size_t, vector<size_t>>> sentence;
  double total_logp = 0.0;
  size_t sents_processed = 0;
  size_t ivcount = 0;
  size_t oovcount = 0;

  data.StartEpoch();

  ResetActivations();

  while (data.GetSentence(sentence)) {
    assert(!sentence.empty());

    if (independent_) {
      ResetActivations();
    }
    double curr_logp = 0.0;
    ForwardPropagate(eos_widx, eos_fidx);
    for (vector<pair<size_t, vector<size_t>>>::const_iterator it = sentence.begin(); it != sentence.end(); ++it) {
      if (!unk_ && it->first == unk_widx) {
        oovcount++;
      } else {
        curr_logp += GetLogProb(it->first, !nce_ppl);
        ivcount++;
      }
      ForwardPropagate(it->first, it->second);
    }
    curr_logp += GetLogProb(eos_widx, !nce_ppl);
    ivcount++;

    total_logp += curr_logp;
    sents_processed++;
    if ((sents_processed % 200) == 0) {
      cout << "." << flush;
    }

    if (debug_ > 1) {
      if (nce_ppl) {
        cerr << "unnormalized log-likelihood (base e) on " << sents_processed << "-th sentence is: " << curr_logp << endl;
      } else {
        cerr << "log-likelihood (base e) on " << sents_processed << "-th sentence is: " << curr_logp << endl;
      }
    }
  }

  if (ivcount == 0) {
    cerr << "zero IV words!" << endl;
    exit(EXIT_FAILURE);
  }

  cout << "\nnumber of IV words (including </s>): " << ivcount << endl;
  cout << "number of OOV words: " << oovcount << endl;
  cout << "entropy (base 2): " << -total_logp / log(2) / ivcount << endl;
  if (nce_ppl) {
    cout << "unnormalized model perplexity: " << exp(-total_logp / ivcount) << endl;
  } else {
    cout << "model perplexity: " << exp(-total_logp / ivcount) << endl;
  }

  return total_logp;
}
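
All of EvalLM's summary statistics derive from the summed base-e log-likelihood: over N scored tokens with total log-probability logp, entropy in bits per token is -logp / (N ln 2) and perplexity is exp(-logp / N), which equals 2^entropy. A tiny self-contained check of those identities follows; the totals are made-up numbers, not real model output.

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical totals: base-e log-likelihood and scored (IV plus </s>) token count.
  double total_logp = -23026.0;
  double ivcount = 5000.0;

  double entropy_bits = -total_logp / std::log(2.0) / ivcount; // bits per token
  double perplexity = std::exp(-total_logp / ivcount);         // geometric-mean inverse probability

  std::printf("entropy (base 2): %f\n", entropy_bits);             // ~6.64 bits
  std::printf("model perplexity: %f\n", perplexity);               // ~100.0
  std::printf("2^entropy:        %f\n", std::pow(2.0, entropy_bits)); // equals the perplexity
  return 0;
}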