// Reads a word from the file stream and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      fprintf(stderr, "%lldK%c", train_words / 1000, 13);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    fprintf(stderr, "Vocab size: %lld\n", vocab_size);
    fprintf(stderr, "Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}
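/* ReduceVocab() is called above whenever the vocabulary passes a 0.7 load
   factor of the hash table, but is not shown in this excerpt. A minimal
   sketch along the lines of the reference word2vec implementation, assuming
   the usual globals (vocab, vocab_size, vocab_hash, vocab_hash_size,
   min_reduce) and helper GetWordHash(); a fork's version may differ: */
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) {
    if (vocab[a].cn > min_reduce) {
      // Keep this word: compact it toward the front of the array
      vocab[b].cn = vocab[a].cn;
      vocab[b].word = vocab[a].word;
      b++;
    } else free(vocab[a].word);
  }
  vocab_size = b;
  // Rebuild the open-addressing hash table over the surviving words
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  min_reduce++;  // each pass raises the bar for surviving the next reduction
}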
void LearnVocabFromTrainFile() {
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  FILE *fin;
  long long a, i, start = 1;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      start = 1;  // the next word begins a new sentence
      continue;
    }
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("Words processed: %lldK  Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
      fflush(stdout);
    }
    // Count the unigram
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (start) {
      // First word of a sentence: no bigram crosses the boundary. Checking
      // start before resetting it (rather than resetting it on read) also
      // guarantees last_word is initialized before it is first used.
      start = 0;
      strcpy(last_word, word);
      continue;
    }
    // Count the bigram "last_word_word"
    sprintf(bigram_word, "%s_%s", last_word, word);
    bigram_word[MAX_STRING - 1] = 0;  // cap key length, matching TrainModel
    strcpy(last_word, word);
    i = SearchVocab(bigram_word);
    if (i == -1) {
      a = AddWordToVocab(bigram_word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fclose(fin);
}
void TrainModel() {
  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  real score;
  FILE *fo, *fin;
  printf("Starting training using file %s\n", train_file);
  LearnVocabFromTrainFile();
  fin = fopen(train_file, "rb");
  fo = fopen(output_file, "wb");
  word[0] = 0;
  while (1) {
    strcpy(last_word, word);
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      fprintf(fo, "\n");
      continue;
    }
    cn++;
    if ((debug_mode > 1) && (cn % 100000 == 0)) {
      printf("Words written: %lldK%c", cn / 1000, 13);
      fflush(stdout);
    }
    oov = 0;
    i = SearchVocab(word);
    if (i == -1) oov = 1; else pb = vocab[i].cn;  // unigram count of the current word
    if (li == -1) oov = 1;                        // previous word was out of vocabulary
    li = i;
    sprintf(bigram_word, "%s_%s", last_word, word);
    bigram_word[MAX_STRING - 1] = 0;  // cap key length, matching LearnVocabFromTrainFile
    i = SearchVocab(bigram_word);
    if (i == -1) oov = 1; else pab = vocab[i].cn;  // bigram count
    if (pa < min_count) oov = 1;
    if (pb < min_count) oov = 1;
    // Discounted PMI-style score; zero for any out-of-vocabulary or rare pair
    if (oov) score = 0;
    else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
    if (score > threshold) {
      fprintf(fo, "_%s", word);  // join with the previous word into a phrase
      pb = 0;                    // so the following pair cannot also be joined (no overlapping phrases)
    } else fprintf(fo, " %s", word);
    pa = pb;
  }
  fclose(fo);
  fclose(fin);
}
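/* The join/keep decision above hinges on one discounted PMI-style expression.
   Pulled out here as a hypothetical helper (not part of the original file)
   using the same globals. Worked example: with pa = count("new") = 10000,
   pb = count("york") = 2000, pab = count("new_york") = 1500, min_count = 5
   and train_words = 10^8, the score is (1500 - 5) / 10000 / 2000 * 10^8
   ~= 7475, far above a typical threshold of 100, so the pair is written out
   joined as "new_york". */
real PhraseScore(long long pa, long long pb, long long pab) {
  if (pa < min_count || pb < min_count) return 0;  // rare words never join
  return (pab - min_count) / (real)pa / (real)pb * (real)train_words;
}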
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    // Skip tokens beginning with "_*" (feof is checked first so that a
    // trailing skipped token cannot spin forever at end of file)
    if (strlen(word) > 1 && word[0] == '_' && word[1] == '*') continue;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // Leftover debug code for inspecting very frequent words:
  //long long freq_count = 0;
  //for (a = 0; a < vocab_size; a++) {
  //  if (vocab[a].cn > 50000) { freq_count += vocab[a].cn; printf("%s\n", vocab[a].word); }
  //}
  //printf("freq_count: %lld\n", freq_count);
  file_size = ftell(fin);
  fclose(fin);
}
void Sample(int num_sentences, int interactive) {
  long long last_word;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  int begin = 0;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  sen[0] = 0;
  if (interactive) {
    printf("Enter the phrase to be continued:\n");
    while (1) {
      int word = ReadWordIndex(stdin);
      if (word == 0) break;
      if (word == -1) word = SearchVocab("<unk>");
      ++begin;
      sen[begin] = word;
    }
  }
  int sentence = 0;
  while (sentence < num_sentences) {
    memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real)); // clean activations
    for (int i = 1; i <= begin; ++i) printf("%s ", vocab[sen[i]].word);
    if (begin) printf("| ");
    int input = 0;
    real logprob = 0.0;
    while (1) {
      if (input != 0) {
        // Recurrent hidden -> hidden activation
        for (int c = 0; c < layer1_size; ++c) {
          for (int d = 0; d < layer1_size; ++d) {
            neu1[input * layer1_size + c] += nnet.synRec[c * layer1_size + d] * neu1[(input - 1) * layer1_size + d];
          }
        }
      }
      last_word = sen[input];
      // Input to hidden
      for (int c = 0; c < layer1_size; ++c) {
        neu1[input * layer1_size + c] += nnet.syn0[last_word * layer1_size + c];
      }
      ApplySigmoid(neu1 + layer1_size * input, layer1_size);
      if (input < begin) {  // still consuming the user-supplied prefix
        ++input;
        continue;
      }
      // Hash the n-gram context for the maximum-entropy features
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if (maxent_order) {
        for (int order = 0; order < maxent_order && input >= order; ++order) {
          feature_hashes[order] = PRIMES[0] * PRIMES[1];
          for (int b = 1; b <= order; ++b)
            feature_hashes[order] += PRIMES[(order * PRIMES[b] + b) % PRIMES_SIZE] * (unsigned long long)(sen[input - b] + 1);
          feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
        }
      }
      // Walk the binary tree from the root, sampling a branch at each node
      int node = vocab_size - 2;
      while (node > 0) {
        // Propagate hidden -> output
        f = 0.0;
        l2 = node * layer1_size;
        for (int c = 0; c < layer1_size; ++c) {
          f += neu1[input * layer1_size + c] * nnet.syn1[l2 + c];
        }
        for (int order = 0; order < maxent_order && input >= order; ++order) {
          f += nnet.synMaxent[feature_hashes[order] + node];
        }
        f = exp(f) / (1 + exp(f)); // sigmoid
        real random = rand() / (real)RAND_MAX;
        if (f > random) {
          node = tree[node].child0;
          logprob += log10(f);
        } else {
          node = tree[node].child1;
          logprob += log10(1 - f);
        }
      }
      ++input;
      sen[input] = node + vocab_size;  // leaves are stored as (word index - vocab_size)
      printf("%s ", vocab[sen[input]].word);
      if (sen[input] == 0 || input == MAX_SENTENCE_LENGTH) {
        printf("%f %f\n", logprob, logprob / (input - begin));
        break;
      }
    }
    ++sentence;
  }
  free(neu1);
}
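/* ApplySigmoid() is used above but not shown in this excerpt. A minimal
   sketch, assuming it simply squashes each of the `size` hidden activations
   through the logistic function in place (the actual implementation may use
   a table-based or vectorized approximation): */
void ApplySigmoid(real *activations, int size) {
  for (int i = 0; i < size; ++i)
    activations[i] = 1.0 / (1.0 + exp(-activations[i]));  // logistic sigmoid
}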
// Tokenize sentences
void tokenizeSentences(struct Sentence* collection, long noSents) {
  long i;
  for (i = 0; i < noSents; i++) {
    // Copy the sentence into a local variable line
    char* line = (char*) malloc(MAX_SENTENCE);
    strcpy(line, collection[i].sent);
    int count = 0, n, actCount = 0, sentCount = 0;
    // Split based on 's
    char* first = multi_tok(line, "'s");
    char* second = multi_tok(NULL, "'s");
    // Join both parts without the 's (from baseline: add it at the end)
    if (second != NULL) line = strcat(first, strcat(second, " \'s"));
    else line = first;
    char* temp = (char*) malloc(MAX_SENTENCE);
    strcpy(temp, line);
    // Remove ' ', ',', '?', '!', '\', '/'
    char* delim = " ,/!?\\";  // the full stop is kept, as it demarcates end of sentence
    line = strtok(line, delim);
    // First pass: count the tokens and the sentence-ending tokens
    while (line != NULL) {
      count++;
      // Check if this is an ending word
      if (line[strlen(line) - 1] == '.') sentCount++;
      // Get the next word
      line = strtok(NULL, delim);
    }
    if (sentCount == 0) sentCount = 1;  // no punctuation present, treat as one sentence
    collection[i].index = (int*) malloc(count * sizeof(int));
    collection[i].endIndex = (int*) malloc(sentCount * sizeof(int));
    // Second pass: store the word indices
    line = strtok(temp, delim);
    count = 0, sentCount = 0;
    int lineEnd = 0;  // must start false: only set when a trailing '.' is seen
    int wordIndex;
    while (line != NULL) {
      // Convert the token into lower case
      for (n = 0; line[n]; n++) {
        line[n] = tolower(line[n]);
        // Check for a trailing full stop; if present, remove it and record
        if (line[n] == '.') {
          lineEnd = 1;
          line[n] = '\0';
        }
      }
      wordIndex = SearchVocab(line);
      // Exists in vocab, save
      if (wordIndex != -1) {
        collection[i].index[count] = wordIndex;
        actCount++;
        count++;
      }
      // Record the end of the sentence
      if (lineEnd) {
        collection[i].endIndex[sentCount] = count - 1;
        sentCount++;
        lineEnd = 0;
      }
      // Next word
      line = strtok(NULL, delim);
    }
    // Punctuation absent, treat everything as one sentence
    if (sentCount == 0) {
      sentCount = 1;
      collection[i].endIndex[0] = count - 1;
    }
    collection[i].count = count;
    collection[i].actCount = actCount;
    collection[i].sentCount = sentCount;
    free(first);
    free(temp);
  }
  printf("\nTokenized %ld sentences!\n", noSents);
}
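/* multi_tok() above is a strtok-style helper for splitting on a
   multi-character delimiter such as "'s"; its body is not shown in this
   excerpt. A minimal sketch of the usual strstr-based implementation it
   appears to rely on (static state between calls, like strtok): */
char *multi_tok(char *input, char *delimiter) {
  static char *string;                // persists between calls, like strtok
  if (input != NULL) string = input;
  if (string == NULL) return NULL;    // no more input
  char *end = strstr(string, delimiter);
  if (end == NULL) {                  // last token: return the remainder
    char *tmp = string;
    string = NULL;
    return tmp;
  }
  char *tmp = string;
  *end = '\0';                        // terminate the current token
  string = end + strlen(delimiter);   // resume after the delimiter
  return tmp;
}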
void *TrainCBOWModelThreadGram(void *arg) {
  /* Get parameters */
  threadParameters *params = arg;
  vocabulary *voc = params->voc;  // shared
  int id = params->threadNumber;
  int MAX_STRING = params->max_string;
  int MAX_EXP = params->max_exp;
  int ngram = params->ngram;
  int layer1_size = params->layer1_size;
  int num_threads = params->num_threads;
  long long file_size = params->file_size;  // long long: int would overflow past 2 GB
  int window = params->window;
  int hs = params->hs;
  int negative = params->negative;
  int EXP_TABLE_SIZE = params->exp_table_size;
  int table_size = params->table_size;
  int position = params->position;
  int overlap = params->overlap;
  int hashbang = params->hashbang;
  long long int *word_count_actual = params->word_count_actual;  // shared
  int *table = params->table;
  char *train_file = params->train_file;
  real starting_alpha = params->starting_alpha;
  real sample = params->sample;
  real *alpha = params->alpha;        // shared
  real *syn0 = params->syn0;          // shared
  real *syn1 = params->syn1;          // shared
  real *syn1neg = params->syn1neg;    // shared
  real *expTable = params->expTable;  // shared
  free(arg);

  long long a, b, d, i, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l2, c, target, label;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now, start = clock();  // was `int start = 0;`, which made the words/sec estimate wrong
  char wordToGram[MAX_STRING];
  char gram[ngram + 3];
  int end;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));   // hidden-layer activations (one vector)
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));  // hidden-layer error
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      (*word_count_actual) += word_count - last_word_count;
      last_word_count = word_count;
      if ((DEBUG_MODE > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, (*alpha),
               (*word_count_actual) / (real)(voc->train_words + 1) * 100,
               (*word_count_actual) / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      // Linearly decay the learning rate, with a floor at 0.01% of the starting value
      (*alpha) = starting_alpha * (1 - (*word_count_actual) / (real)(voc->train_words + 1));
      if ((*alpha) < starting_alpha * 0.0001) (*alpha) = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      wordToGram[0] = '\0';  // so length is 0
      end = 0;
      while (1) {
        if (feof(fi)) break;
        if (end == 0) {
          if (hashbang) ReadWordHashbang(wordToGram, fi);
          else ReadWord(wordToGram, fi);
          i = 0;
        }
        end = getGrams(wordToGram, gram, i, ngram, overlap, position, hashbang);
        if (end == -1) word = SearchVocab(voc, wordToGram);  // word shorter than the n-gram: use it whole
        else word = SearchVocab(voc, gram);
        word_count++;
        i += 1;
        if (end == 0) continue;
        if (end == -1) end = 0;
        if (word == -1) continue;
        if (word == 0) break;  // context break
        // The subsampling randomly discards frequent words while keeping the ranking the same
        if (sample > 0) {
          real ran = (sqrt(voc->vocab[word].cn / (sample * voc->train_words)) + 1) * (sample * voc->train_words) / voc->vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi)) break;                                     // end of file
    if (word_count > voc->train_words / num_threads) break;  // this thread's share is trained
    word = sen[sentence_position];  // index
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;

    /*--- Training ---*/
    // in -> hidden
    for (a = b; a < window * 2 + 1 - b; a++)  // a = [0, window] -> [(window*2+1)-rand] -> dynamic window
      if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];  // index of the context word
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++)  // c is each vector component
          neu1[c] += syn0[c + last_word * layer1_size];  // sum of all vectors in the input window (CBOW) -> hidden
      }
    // HIERARCHICAL SOFTMAX
    if (hs)
      for (d = 0; d < voc->vocab[word].codelen; d++) {
        f = 0;
        l2 = voc->vocab[word].point[d] * layer1_size;  // offset of the inner tree node
        // Propagate hidden -> output
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
        // Sigmoid activation, precalculated in expTable; skip when saturated
        if (f <= -MAX_EXP) continue;
        else if (f >= MAX_EXP) continue;
        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
        // 'g' is the gradient multiplied by the learning rate
        g = (1 - voc->vocab[word].code[d] - f) * (*alpha);
        // Propagate errors output -> hidden
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];  // accumulate, to modify the vectors later
        // Learn weights hidden -> output
        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
      }
    // NEGATIVE SAMPLING
    if (negative > 0)
      for (d = 0; d < negative + 1; d++) {
        if (d == 0) {
          target = word;
          label = 1;  // (w, c) in corpus
        } else {
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = table[(next_random >> 16) % table_size];
          if (target == 0) target = next_random % (voc->vocab_size - 1) + 1;
          if (target == word) continue;
          label = 0;  // (w, c) not in corpus
        }
        l2 = target * layer1_size;  // word-vector offset
        f = 0;
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];  // vector * weights
        if (f > MAX_EXP) g = (label - 1) * (*alpha);  // clipped sigmoid
        else if (f < -MAX_EXP) g = (label - 0) * (*alpha);
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * (*alpha);
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];  // accumulate error
        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
      }
    // hidden -> in
    for (a = b; a < window * 2 + 1 - b; a++)
      if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];  // update context word vectors with the error
      }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
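/* Both the hierarchical-softmax and negative-sampling branches above read the
   sigmoid out of expTable instead of calling exp(). The table is built once
   at startup; a sketch following the reference word2vec initialization (this
   fork receives EXP_TABLE_SIZE and MAX_EXP through params, so the constants
   are assumed to match the ones used here): */
real *BuildExpTable(int exp_table_size, int max_exp) {
  real *expTable = (real *)malloc((exp_table_size + 1) * sizeof(real));
  for (int i = 0; i < exp_table_size; i++) {
    // x sweeps [-max_exp, max_exp); store sigmoid(x) = e^x / (e^x + 1)
    expTable[i] = exp((i / (real)exp_table_size * 2 - 1) * max_exp);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
  return expTable;
}
/* Lookup then maps f in [-MAX_EXP, MAX_EXP] to an index with
   (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)), exactly as in the
   training loops above. */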
/* Returns the word index by calling SearchVocab. */
int ReadWordIndex(FILE *fin) {  // imported from word2vec
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
// Reads a word and returns its index in the vocabulary
int ReadWordIndex(struct vocabulary *v, FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin, MAX_STRING);
  if (feof(fin)) return -1;
  return SearchVocab(v, word);
}
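/* Hypothetical usage of this variant (not part of the original file): stream
   a tokenized file into indices until EOF, counting out-of-vocabulary
   tokens; assumes the vocabulary has already been built: */
void CountOOV(struct vocabulary *v, const char *path) {
  long long oov = 0, total = 0;
  FILE *fin = fopen(path, "rb");
  if (fin == NULL) return;
  while (1) {
    int idx = ReadWordIndex(v, fin);
    if (feof(fin)) break;
    total++;
    if (idx == -1) oov++;  // word not found in the vocabulary
  }
  printf("%lld / %lld tokens out of vocabulary\n", oov, total);
  fclose(fin);
}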