void TrainModel() { long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0; char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; real score; FILE *fo, *fin; printf("Starting training using file %s\n", train_file); LearnVocabFromTrainFile(); fin = fopen(train_file, "rb"); fo = fopen(output_file, "wb"); word[0] = 0; while (1) { strcpy(last_word, word); ReadWord(word, fin); if (feof(fin)) break; if (!strcmp(word, "</s>")) { fprintf(fo, "\n"); continue; } cn++; if ((debug_mode > 1) && (cn % 100000 == 0)) { printf("Words written: %lldK%c", cn / 1000, 13); fflush(stdout); } oov = 0; i = SearchVocab(word); if (i == -1) oov = 1; else pb = vocab[i].cn; if (li == -1) oov = 1; li = i; sprintf(bigram_word, "%s_%s", last_word, word); bigram_word[MAX_STRING - 1] = 0; i = SearchVocab(bigram_word); if (i == -1) oov = 1; else pab = vocab[i].cn; if (pa < min_count) oov = 1; if (pb < min_count) oov = 1; if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words; if (score > threshold) { fprintf(fo, "_%s", word); pb = 0; } else fprintf(fo, " %s", word); pa = pb; } fclose(fo); fclose(fin); }
void TrainModel() { long a; pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); if (model_file[0] == 0) return; int iter = 0; FILE *t1 = fopen(model_file, "rb"); FILE *t2 = fopen(model_file_nnet, "rb"); if(t1 != NULL && t2 != NULL) { fclose(t1); fclose(t2); fprintf(stderr, "Restoring nnet from existing files %s, %s\n", model_file, model_file_nnet); LoadNnet(); } else { LearnVocabFromTrainFile(); if(maxent_hash_size) { maxent_hash_size *= 1000000; maxent_hash_size -= maxent_hash_size % vocab_size; } InitNet(); SaveNnet(); } if(test_file[0] != 0) { counter = 0; real sumlogprob = EvaluateModel(test_file, 1); fprintf(stderr, "Test entropy %f\n", sumlogprob/log10(2)/(real)counter); return; } if(gen > 0) { Sample(gen, 0); return; } else if(gen < 0) { while(1) { Sample(-gen, 1); } return; } fprintf(stderr, "Starting training using file %s\n", train_file); FILE *fi = fopen(valid_file, "rb"); valid_words = 0; while (1) { ReadWordIndex(fi); ++valid_words; if (feof(fi)) break; } valid_file_size = ftell(fi); fclose(fi); real old_entropy = 1e99; real entropy; real diff = 1e99; int retry = 0; int decay = 0; while(retry < max_retry) { if(iter != 0) { if(decay) { alpha /= 2.0; maxent_alpha /= 2.0; } word_count_actual = 0; counter = 0; start = clock(); for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); } fprintf(stderr, "Iteration %d\t", iter); sumlogprob_valid = 0; counter = 0; sumlogprob_valid = EvaluateModel(valid_file, 0); entropy = sumlogprob_valid/log10(2)/(real)counter; fprintf(stderr, "Valid Entropy %f", entropy); ++iter; diff = old_entropy/entropy; if (isnan(entropy) || isinf(entropy) || diff < stop) { if (decay == 1) { ++retry; fprintf(stderr, "\tRetry %d/%d", retry, max_retry); } else { decay = 1; fprintf(stderr, "\tDecay started"); } if(isnan(entropy) || isinf(entropy) || diff < reject_threshold) { fprintf(stderr, "\tNnet rejected"); FreeNnet(); int debug_ = debug_mode; debug_mode = 0; LoadNnet(); debug_mode = debug_; } } fprintf(stderr, "\n"); if(diff > 1.0) { SaveNnet(); old_entropy = entropy; } } }