Пример #1
0
/*
 * Streams train_file word by word into output_file, gluing a word to its
 * predecessor with '_' when the pair scores above `threshold`.
 *
 * Steps:
 *   1. LearnVocabFromTrainFile() is called first (it is expected to fill
 *      `vocab` and `train_words`; that happens outside this function).
 *   2. For every word read, the vocabulary is searched for the word itself
 *      and for "<previous>_<word>".
 *   3. score = (pab - min_count) / pa / pb * train_words, where pa, pb and
 *      pab are the `cn` fields of the previous word, the current word and
 *      the bigram entry. The score is forced to 0 when any of the lookups
 *      failed or when pa / pb are below min_count.
 *   4. A "</s>" token is written as a newline and otherwise skipped.
 *
 * On failure to open either file an error is printed to stderr and the
 * function returns without processing anything.
 */
void TrainModel() {
  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  real score;
  FILE *fo, *fin;
  printf("Starting training using file %s\n", train_file);
  LearnVocabFromTrainFile();
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    fprintf(stderr, "ERROR: cannot open training file %s\n", train_file);
    return;
  }
  fo = fopen(output_file, "wb");
  if (fo == NULL) {
    fprintf(stderr, "ERROR: cannot open output file %s\n", output_file);
    fclose(fin);
    return;
  }
  word[0] = 0;
  while (1) {
    /* Remember the previous token before it is overwritten. */
    strcpy(last_word, word);
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      fprintf(fo, "\n");
      continue;
    }
    cn++;
    if ((debug_mode > 1) && (cn % 100000 == 0)) {
      /* Character 13 is '\r': the progress line is redrawn in place. */
      printf("Words written: %lldK%c", cn / 1000, 13);
      fflush(stdout);
    }
    oov = 0;
    i = SearchVocab(word);
    if (i == -1) oov = 1; else pb = vocab[i].cn;
    /* The previous word was not in the vocabulary either. */
    if (li == -1) oov = 1;
    li = i;
    /*
     * Bounded formatting: at most MAX_STRING - 1 characters are kept, which
     * is the same string the vocabulary lookup saw before (the key used to
     * be cut at index MAX_STRING - 1 after an unbounded sprintf).
     */
    snprintf(bigram_word, MAX_STRING, "%s_%s", last_word, word);
    i = SearchVocab(bigram_word);
    if (i == -1) oov = 1; else pab = vocab[i].cn;
    if (pa < min_count) oov = 1;
    if (pb < min_count) oov = 1;
    if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
    if (score > threshold) {
      fprintf(fo, "_%s", word);
      /* Zeroing pb makes the next pair fail the pa < min_count test
         whenever min_count is positive, so a freshly joined word does not
         chain into a third one. */
      pb = 0;
    } else fprintf(fo, " %s", word);
    pa = pb;
  }
  fclose(fo);
  fclose(fin);
}
Пример #2
0
/*
 * Top-level driver: restores or initialises the network, then either
 * evaluates it, samples from it, or trains it until validation entropy
 * stops improving.
 *
 * Flow, as implemented below:
 *   - Returns immediately when model_file is an empty string.
 *   - If both model_file and model_file_nnet can be opened, LoadNnet() is
 *     called; otherwise the vocabulary is learned, the net is initialised
 *     and saved.
 *   - If test_file is set: evaluate on it, print the entropy, return.
 *   - If gen > 0: Sample(gen, 0) once and return. If gen < 0: call
 *     Sample(-gen, 1) forever (this branch never returns).
 *   - Otherwise: train. Iteration 0 only evaluates on valid_file; each later
 *     iteration runs num_threads TrainModelThread workers first. After each
 *     evaluation diff = old_entropy / entropy decides what happens:
 *       diff < stop (or entropy is NaN/Inf): the first time, halving of
 *           alpha / maxent_alpha is switched on for later iterations;
 *           afterwards each occurrence counts one retry towards max_retry.
 *       diff < reject_threshold (or NaN/Inf): the net is freed and reloaded
 *           via LoadNnet().
 *       diff > 1.0: the net is saved and becomes the new baseline.
 *
 * Errors opening valid_file or allocating the thread handle array are
 * reported on stderr and end the function early.
 */
void TrainModel() {
  long a;
  pthread_t *pt;
  int iter = 0;

  /* Nothing to do without a model path; bail out before acquiring anything. */
  if (model_file[0] == 0) return;

  /* Probe for an existing model. The handles are only used as an existence
     check, so both are closed again regardless of the outcome. */
  FILE *t1 = fopen(model_file, "rb");
  FILE *t2 = fopen(model_file_nnet, "rb");
  int have_model = (t1 != NULL && t2 != NULL);
  if (t1 != NULL) fclose(t1);
  if (t2 != NULL) fclose(t2);

  if (have_model) {
    fprintf(stderr, "Restoring nnet from existing files %s, %s\n", model_file, model_file_nnet);
    LoadNnet();
  } else {
    LearnVocabFromTrainFile();
    if (maxent_hash_size) {
      /* Scale by 1e6, then round down to a multiple of vocab_size. */
      maxent_hash_size *= 1000000;
      maxent_hash_size -= maxent_hash_size % vocab_size;
    }
    InitNet();
    SaveNnet();
  }

  if (test_file[0] != 0) {
    counter = 0;
    real sumlogprob = EvaluateModel(test_file, 1);
    /* Dividing by log10(2) converts a base-10 log sum to bits. */
    fprintf(stderr, "Test entropy %f\n", sumlogprob/log10(2)/(real)counter);
    return;
  }

  if (gen > 0) {
    Sample(gen, 0);
    return;
  }
  if (gen < 0) {
    /* Endless sampling; control never leaves this loop. */
    while (1) {
      Sample(-gen, 1);
    }
  }

  fprintf(stderr, "Starting training using file %s\n", train_file);

  FILE *fi = fopen(valid_file, "rb");
  if (fi == NULL) {
    fprintf(stderr, "ERROR: cannot open validation file %s\n", valid_file);
    return;
  }
  valid_words = 0;
  while (1) {
    ReadWordIndex(fi);
    ++valid_words;
    if (feof(fi)) break;
  }
  valid_file_size = ftell(fi);
  fclose(fi);

  /* Thread handles are only needed for training, so they are allocated here
     and released at the end of the function. */
  pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  if (pt == NULL && num_threads > 0) {
    fprintf(stderr, "ERROR: cannot allocate memory for %ld thread handles\n", (long)num_threads);
    return;
  }

  real old_entropy = 1e99;
  real entropy;
  real diff = 1e99;
  int retry = 0;
  int decay = 0;
  while (retry < max_retry) {
    if (iter != 0) {
      if (decay) {
        alpha /= 2.0;
        maxent_alpha /= 2.0;
      }
      word_count_actual = 0;
      counter = 0;
      start = clock();
      /* The worker index is passed by value inside the pointer argument. */
      for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
      for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
    }
    fprintf(stderr, "Iteration %d\t", iter);
    sumlogprob_valid = 0;
    counter = 0;
    sumlogprob_valid = EvaluateModel(valid_file, 0);
    entropy = sumlogprob_valid/log10(2)/(real)counter;
    fprintf(stderr, "Valid Entropy %f", entropy);
    ++iter;

    /* diff > 1 means the entropy went down compared to the saved baseline. */
    diff = old_entropy/entropy;
    if (isnan(entropy) || isinf(entropy) || diff < stop) {
      if (decay == 1) {
        ++retry;
        fprintf(stderr, "\tRetry %d/%d", retry, max_retry);
      } else {
        decay = 1;
        fprintf(stderr, "\tDecay started");
      }
      if (isnan(entropy) || isinf(entropy) || diff < reject_threshold) {
        fprintf(stderr, "\tNnet rejected");
        FreeNnet();
        /* Silence LoadNnet() while rolling back to the last saved net. */
        int debug_ = debug_mode;
        debug_mode = 0;
        LoadNnet();
        debug_mode = debug_;
      }
    }
    fprintf(stderr, "\n");

    if (diff > 1.0) {
      SaveNnet();
      old_entropy = entropy;
    }
  }

  free(pt);
}