Example #1
// Reads a word from the file stream and returns its index in the vocabulary
int ReadWordIndex(FILE *fin)
{
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}
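A minimal usage sketch (hypothetical, not from the original source): ReadWordIndex returns -1 both at end of file and for out-of-vocabulary words, so callers are expected to test feof() to tell the two apart, as word2vec's training loop does.

// Hypothetical caller: count the in-vocabulary tokens in a file.
// Assumes ReadWordIndex and the word2vec vocabulary are linked in.
long long CountKnownTokens(const char *path) {
  FILE *fin = fopen(path, "rb");
  long long known = 0;
  if (fin == NULL) return -1;
  while (1) {
    int idx = ReadWordIndex(fin);
    if (feof(fin)) break;   // distinguishes EOF from an out-of-vocabulary -1
    if (idx != -1) known++; // -1 here means the word is not in the vocabulary
  }
  fclose(fin);
  return known;
}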
Example #2
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      fprintf(stderr, "%lldK%c", train_words / 1000, 13);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    fprintf(stderr, "Vocab size: %lld\n", vocab_size);
    fprintf(stderr, "Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}
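The vocab_hash[a] = -1 reset and the 0.7 load-factor guard above assume word2vec's open-addressing hash table. For reference, a sketch of the lookup side as it appears in word2vec.c (linear probing over vocab_hash, with -1 marking empty slots; treat the details as illustrative):

// Hash a word to a slot in [0, vocab_hash_size)
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  return hash % vocab_hash_size;
}

// Linear probing: walk forward from the home slot until the word or an
// empty slot (-1) is found; returns the word's vocabulary index or -1.
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
}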
Example #3
void LearnVocabFromTrainFile() {
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  FILE *fin;
  long long a, i, start = 1;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      start = 1;
      continue;
    }
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("Words processed: %lldK     Vocab size: %lldK  %c", train_words / 1000, vocab_size / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (start) {
      // First word after "</s>" has no left neighbor, so skip the bigram
      // (this also avoids reading last_word before it is initialized).
      strcpy(last_word, word);
      start = 0;
      continue;
    }
    sprintf(bigram_word, "%s_%s", last_word, word);
    bigram_word[MAX_STRING - 1] = 0; // keep the stored token under MAX_STRING chars
    strcpy(last_word, word);
    i = SearchVocab(bigram_word);
    if (i == -1) {
      a = AddWordToVocab(bigram_word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fclose(fin);
}
Example #4
void TrainModel() {
  long long pa = 0, pb = 0, pab = 0, oov, i, li = -1, cn = 0;
  char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2];
  real score;
  FILE *fo, *fin;
  printf("Starting training using file %s\n", train_file);
  LearnVocabFromTrainFile();
  fin = fopen(train_file, "rb");
  fo = fopen(output_file, "wb");
  if (fin == NULL || fo == NULL) {
    printf("ERROR: unable to open the input or output file!\n");
    exit(1);
  }
  word[0] = 0;
  while (1) {
    strcpy(last_word, word);
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (!strcmp(word, "</s>")) {
      fprintf(fo, "\n");
      continue;
    }
    cn++;
    if ((debug_mode > 1) && (cn % 100000 == 0)) {
      printf("Words written: %lldK%c", cn / 1000, 13);
      fflush(stdout);
    }
    oov = 0;
    i = SearchVocab(word);
    if (i == -1) oov = 1; else pb = vocab[i].cn;
    if (li == -1) oov = 1;
    li = i;
    sprintf(bigram_word, "%s_%s", last_word, word);
    bigram_word[MAX_STRING - 1] = 0;
    i = SearchVocab(bigram_word);
    if (i == -1) oov = 1; else pab = vocab[i].cn;
    if (pa < min_count) oov = 1;
    if (pb < min_count) oov = 1;
    if (oov) score = 0; else score = (pab - min_count) / (real)pa / (real)pb * (real)train_words;
    if (score > threshold) {
      fprintf(fo, "_%s", word);
      pb = 0;
    } else fprintf(fo, " %s", word);
    pa = pb;
  }
  fclose(fo);
  fclose(fin);
}
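To make the scoring rule at the heart of TrainModel concrete: score = (count(ab) - min_count) / (count(a) * count(b)) * train_words, a PMI-like statistic that is large when the bigram occurs far more often than its parts would suggest by chance. A toy calculation with invented counts:

#include <stdio.h>

// Invented corpus counts, purely to illustrate the phrase score above.
int main(void) {
  double pa = 2000, pb = 1500, pab = 400; // count(a), count(b), count(a_b)
  double min_count = 5, train_words = 1e6;
  double score = (pab - min_count) / pa / pb * train_words;
  printf("score = %.2f\n", score); // ~131.67, so "a b" would be joined at threshold 100
  return 0;
}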
Example #5
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    // Skip tokens that begin with "_*"
    if (strlen(word) > 1 && word[0] == '_' && word[1] == '*') continue;
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  // Debugging aid, left disabled: count and print words with frequency above 50000.
  // long long freq_count = 0;
  // for (a = 0; a < vocab_size; a++)
  //   if (vocab[a].cn > 50000) { freq_count += vocab[a].cn; printf("%s\n", vocab[a].word); }
  // printf("freq_count: %lld\n", freq_count);
  file_size = ftell(fin);
  fclose(fin);
}
Example #6
void Sample(int num_sentences, int interactive) {

  long long last_word;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  int begin = 0;
  if (posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real)) != 0) {
    printf("Memory allocation failed\n");
    exit(1);
  }
  sen[0] = 0;
  if(interactive) {
    printf("Enter the phrase to be continued:\n");
    while(1) {
      int word = ReadWordIndex(stdin);
      if(word == 0) break;                         // "</s>" (index 0) ends the prompt
      if(word == -1) word = SearchVocab("<unk>");  // map out-of-vocabulary words to <unk>
      if(begin == MAX_SENTENCE_LENGTH - 1) break;  // leave room for generated words
      ++begin;
      sen[begin] = word;
    }
  }

  int sentence = 0;
  while (sentence < num_sentences) {
    memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real)); // clean activations

    for(int i = 1; i <= begin; ++i) printf("%s ", vocab[sen[i]].word);
    if(begin) printf("| ");
    int input = 0;
    real logprob = 0.0;
    while(1) {

      if (input != 0) { 
	for(int c = 0; c < layer1_size; ++c) {
	  for(int d = 0; d < layer1_size; ++d) { neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d]; } // Recurrent hidden->hidden activation
	}
      }
      last_word = sen[input];
      for(int c = 0; c < layer1_size; ++c) {
	neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1+layer1_size*input, layer1_size);
    
      if(input < begin) {
	++input;
	continue;
      }

      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if(maxent_order) {
	for(int order = 0; order < maxent_order && input >= order; ++order) {
	  feature_hashes[order] = PRIMES[0]*PRIMES[1];    	    
	  for (int b = 1; b <= order; ++b) feature_hashes[order] += PRIMES[(order*PRIMES[b]+b) % PRIMES_SIZE]*(unsigned long long)(sen[input-b]+1);
	  feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
	}
      }
     
      int node = vocab_size - 2;
      while(node > 0) {
	// Propagate hidden -> output
	f = 0.0;
	l2 = node * layer1_size;
	for(int c = 0; c < layer1_size; ++c) {
	  f += neu1[input*layer1_size + c] * nnet.syn1[l2 + c];
	}
	for(int order = 0; order < maxent_order && input >= order; ++order) {
	  f += nnet.synMaxent[feature_hashes[order] + node];
	}
	f = 1.0 / (1.0 + exp(-f)); // sigmoid, in the numerically stable form
	real random = rand() / (real)RAND_MAX;
	if (f > random) {
	  node = tree[node].child0; 
	  logprob += log10(f);
	} else {
	  node = tree[node].child1; 
	  logprob += log10(1-f);
	}
      }
      ++input;
      sen[input] = node + vocab_size;
      printf("%s ", vocab[sen[input]].word);
      if(sen[input] == 0 || input == MAX_SENTENCE_LENGTH) {
	printf("%f %f\n", logprob, logprob /(input-begin));
	break;
      }
    }
    ++sentence;
  }
  free(neu1);
}
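The while(node > 0) loop above samples the next word by walking a Huffman-style binary tree from the root, flipping a biased coin at each internal node; the product of branch probabilities is the word's probability, accumulated here in log10. A self-contained sketch of that descent (hypothetical Node layout and per-node scores, not the repo's actual structs):

#include <math.h>
#include <stdlib.h>

typedef struct { int child0, child1; } Node; // hypothetical internal-node layout

// Descend from the root, branching to child0 with probability sigmoid(score[node]).
// Leaves are encoded as ids <= 0, mirroring the loop's while(node > 0) test.
static int SampleLeaf(const Node *tree, int root, const double *score, double *logprob) {
  int node = root;
  while (node > 0) {
    double p = 1.0 / (1.0 + exp(-score[node]));
    if (rand() / (double)RAND_MAX < p) { node = tree[node].child0; *logprob += log10(p); }
    else                               { node = tree[node].child1; *logprob += log10(1.0 - p); }
  }
  return node;
}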
Example #7
// Tokenize sentences
void tokenizeSentences(struct Sentence* collection, long noSents){
    long i;
    for (i = 0; i < noSents; i++){
        //printf("**************************\n");
        // Copy the word into a local variable line
        char* line = (char*) malloc(MAX_SENTENCE);
        strcpy(line, collection[i].sent);

        int count = 0, n, actCount = 0, sentCount = 0;

        // Split based on 's
        char* first = multi_tok(line, "'s");
        char* second = multi_tok(NULL, "'s");

        // Join both the parts without the 's (from baseline: add it at the end)
        if(second != NULL) line = strcat(first, strcat(second, " \'s"));
        else line = first;

        char* temp = (char*) malloc(MAX_SENTENCE);
        strcpy(temp, line);
        
        // Remove ' ', ',', '.', '?', '!', '\', '/'
        char* delim = " ,/!?\\"; // Ignore the full stop, used to demarcate end of sentence
        line = strtok(line, delim);
        // Going over the line to determine the number of parts
        while(line != NULL){
            count++;

            // Check if an ending word
            if(line[strlen(line)-1] == '.')
                sentCount++;
            
            // Get the next word
            line = strtok(NULL, delim);
        }

        // Now store the word components, looping over them
        if(sentCount == 0) sentCount = 1; // Punctuations not present, treat as one sentence

        collection[i].index = (int*) malloc(count * sizeof(int));
        collection[i].endIndex = (int*) malloc(sentCount * sizeof(int));

        line = strtok(temp, delim);
        count = 0, sentCount = 0;
        int lineEnd = 0; // set when a token carries a trailing full stop
        int wordIndex;
        while(line != NULL){
            // Convert the token into lower case
            for(n = 0; line[n]; n++){
                line[n] = tolower(line[n]);

                // Check for a trailing full stop; if present, remove it and mark the sentence end
                if (line[n] == '.'){
                    lineEnd = 1;
                    line[n] = '\0';
                }
            }
            wordIndex = SearchVocab(line);
            // Exists in vocab, save
            if (wordIndex != -1){
                collection[i].index[count] = wordIndex;
            
                actCount++;
                count++;
            }
            
            // Adjust end of line count
            if(lineEnd){
                collection[i].endIndex[sentCount] = count-1;
                sentCount++;
                lineEnd = 0;
            }
            
            // Next word
            line = strtok(NULL, delim);
        }

        // Punctuation absent, treat everything as one sentence
        if(sentCount == 0){
            sentCount = 1;
            collection[i].endIndex[0] = count-1;
        }

        // Now store the word components, looping over them
        collection[i].count = count;
        collection[i].actCount = actCount;
        collection[i].sentCount = sentCount;

        //printf("Sent count: %s\n%d\n", collection[i].sent, collection[i].sentCount);
    }

    printf("\nTokenized %ld sentences!\n", noSents);
}
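multi_tok is not defined in this excerpt; strtok cannot split on the multi-character delimiter "'s", so it is presumably a strstr-based helper along these lines (a sketch under that assumption, not the original implementation):

#include <string.h>

// strtok-like tokenizer for multi-character delimiters. Like strtok, it keeps
// static state, mutates the input buffer, and is not thread-safe.
char *multi_tok(char *input, const char *delim) {
  static char *state = NULL;
  if (input != NULL) state = input;
  if (state == NULL) return NULL;
  char *tok = state;
  char *end = strstr(state, delim);
  if (end == NULL) { state = NULL; return tok; }
  *end = '\0';
  state = end + strlen(delim);
  return tok;
}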
Example #8
void *TrainCBOWModelThreadGram(void *arg) {

	/*Get Parameters*/
	threadParameters *params = arg;

	vocabulary *voc = params->voc;	 //shared
	int id = params->threadNumber;
	int MAX_STRING = params->max_string;
	int MAX_EXP = params->max_exp;
	int ngram = params->ngram;
	int layer1_size = params->layer1_size;
	int num_threads = params->num_threads;
	int file_size = params->file_size;
	int window = params->window;
	int hs = params->hs;
	int negative = params->negative;
	int EXP_TABLE_SIZE = params->exp_table_size;
	int table_size = params->table_size;
	int position = params->position;
	int overlap = params->overlap;
	int hashbang = params->hashbang;
	long long int *word_count_actual = params->word_count_actual; //shared
	int *table = params->table;
	char *train_file = params->train_file;
	real starting_alpha = params->starting_alpha;
	real sample = params->sample;
	real *alpha = params->alpha;	 //shared
	real *syn0  = params->syn0; 	//shared
	real *syn1 = params->syn1;		 //shared
	real *syn1neg = params->syn1neg;	 //shared
	real *expTable = params->expTable;	 //shared

	free(arg);

	long long a, b, d, i, word, last_word, sentence_length = 0, sentence_position = 0;
	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
	long long  l2, c, target, label;
	unsigned long long next_random = (long long)id;
	

	real f, g;
	clock_t now;

	char wordToGram[MAX_STRING];
	char gram[ngram+3];
	clock_t start = 0; // CPU-clock baseline for the words/sec report below
	int end;

	real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //one vector
	real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 
	FILE *fi = fopen(train_file, "rb");

	fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

	while (1) {


		if (word_count - last_word_count > 10000) {
			(*word_count_actual) += word_count - last_word_count;
			last_word_count = word_count;

			if ((DEBUG_MODE > 1)) {
				now=clock();
				printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, (*alpha),
				(*word_count_actual) / (real)(voc->train_words + 1) * 100,
				(*word_count_actual) / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
				fflush(stdout);
			}

			(*alpha) = starting_alpha * (1 - (*word_count_actual) / (real)(voc->train_words + 1));

			if ((*alpha) < starting_alpha * 0.0001)
				(*alpha) = starting_alpha * 0.0001;
		}

		if (sentence_length == 0) {

			wordToGram[0] = '\0'; //so length is 0
			end = 0;

			while (1) {



				if (feof(fi))
					break;	

				if(end == 0){
					if(hashbang)
						ReadWordHashbang(wordToGram, fi);
					else
						ReadWord(wordToGram,fi);
					i = 0;
				}

				end = getGrams(wordToGram,gram,i, ngram, overlap, position,hashbang);

				if(end == -1)
					word = SearchVocab(voc,wordToGram);
				else
					word = SearchVocab(voc, gram);	

				word_count++;
				i += 1;

				if(end == 0){
					continue;
				}

				if (end == -1)
					end = 0;

				if (word == -1)
					continue;		

				if (word == 0) //context break
					break;

				// The subsampling randomly discards frequent words while keeping the ranking same
				if (sample > 0) {
					real ran = (sqrt(voc->vocab[word].cn / (sample * voc->train_words)) + 1) * (sample * voc->train_words) / voc->vocab[word].cn;
					next_random = next_random * (unsigned long long)25214903917 + 11;
					
					if (ran < (next_random & 0xFFFF) / (real)65536)
						continue;
				}

				sen[sentence_length] = word;
				sentence_length++;

				if (sentence_length >= MAX_SENTENCE_LENGTH)
					break;

			}
			
			sentence_position = 0;
		}

		if (feof(fi)) //end file
			break;

		if (word_count > voc->train_words / num_threads) // this thread has processed its share of words
			break;

		word = sen[sentence_position]; //index

		if (word == -1) 
			continue;

		for (c = 0; c < layer1_size; c++)
			neu1[c] = 0;

		for (c = 0; c < layer1_size; c++)
			neu1e[c] = 0;

		next_random = next_random * (unsigned long long)25214903917 + 11;

		b = next_random % window;

		/*--- Training ---*/

		// in -> hidden
		for (a = b; a < window * 2 + 1 - b; a++) //a = [0 window]->[(window*2+1)-rand] -> dynamic window
			if (a != window) {

				c = sentence_position - window + a;
				
				if (c < 0) continue;

				if (c >= sentence_length) continue;

				last_word = sen[c]; //index of word

				if (last_word == -1) continue;

				for (c = 0; c < layer1_size; c++) // c is each vector index
					neu1[c] += syn0[c + last_word * layer1_size]; //sum of all vectors in input window (fig cbow) -> vectors on hidden
		}

		if (hs)
			for (d = 0; d < voc->vocab[word].codelen; d++) {
				f = 0;
				l2 = voc->vocab[word].point[d] * layer1_size; //offset of word
				// Propagate hidden -> output
				for (c = 0; c < layer1_size; c++)
					f += neu1[c] * syn1[c + l2]; //sum vectors input window * word weights on syn1 -> output vectors

				if (f <= -MAX_EXP) //sigmoid activation function - precalculated in expTable
					continue;
				else if (f >= MAX_EXP)
					continue;
				else
					f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

				// 'g' is the gradient multiplied by the learning rate
				g = (1 - voc->vocab[word].code[d] - f) * (*alpha); 
				// Propagate errors output -> hidden
				for (c = 0; c < layer1_size; c++)
					neu1e[c] += g * syn1[c + l2]; //save to modify vectors
				// Learn weights hidden -> output
				for (c = 0; c < layer1_size; c++)
					syn1[c + l2] += g * neu1[c]; //modify weights
		}
		// NEGATIVE SAMPLING
		if (negative > 0)
			for (d = 0; d < negative + 1; d++) {
				if (d == 0) {
					target = word;
					label = 1; //(w,c) in corpus
				} else {
					next_random = next_random * (unsigned long long)25214903917 + 11;
					target = table[(next_random >> 16) % table_size];

					if (target == 0) 
						target = next_random % (voc->vocab_size - 1) + 1;

					if (target == word)
						continue;

					label = 0; //(w,c) not in corpus
				}

				l2 = target * layer1_size; //get word vector index
				f = 0;

				for (c = 0; c < layer1_size; c++)
					f += neu1[c] * syn1neg[c + l2]; //vector*weights

				if (f > MAX_EXP) //sigmoid
					g = (label - 1) * (*alpha);
				else if (f < -MAX_EXP)
					g = (label - 0) * (*alpha);
				else
					g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * (*alpha);

				for (c = 0; c < layer1_size; c++)
					neu1e[c] += g * syn1neg[c + l2]; //saving error

				for (c = 0; c < layer1_size; c++)
					syn1neg[c + l2] += g * neu1[c];
		}
		// hidden -> in
		for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
			c = sentence_position - window + a;

			if (c < 0)
				continue;

			if (c >= sentence_length)
				continue;
			last_word = sen[c];

			if (last_word == -1)
				continue;

			for (c = 0; c < layer1_size; c++)
				syn0[c + last_word * layer1_size] += neu1e[c];  //modify word vectors with error
		}
		

		sentence_position++;

		if (sentence_position >= sentence_length) {
			sentence_length = 0;
			continue;
		}
	}

	fclose(fi);
	free(neu1);
	free(neu1e);
	pthread_exit(NULL);
}
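The subsampling test in the middle of the thread (ran < (next_random & 0xFFFF) / 65536) implements word2vec's frequency-based discard rule. With f = cn / train_words the word's unigram frequency, the keep probability simplifies to sqrt(sample / f) + sample / f; a standalone restatement:

#include <math.h>

// Probability of KEEPING a word with corpus count cn out of total training
// words, at subsample rate sample. Values above 1 mean the word is always kept.
double KeepProbability(long long cn, long long total, double sample) {
  double f = (double)cn / (double)total;  // unigram frequency of the word
  return sqrt(sample / f) + sample / f;   // equals (sqrt(f/sample) + 1) * sample / f
}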
Example #9
/*
  Returns the word index by calling SearchVocab.
*/
int ReadWordIndex(FILE *fin) { //imported from word2vec
    char word[MAX_STRING];
    ReadWord(word, fin);
    if (feof(fin)) return -1;
    return SearchVocab(word);
}
Example #10
// Reads a word and returns its index in the vocabulary
int ReadWordIndex(struct vocabulary *v, FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin, MAX_STRING);
  if (feof(fin)) return -1;
  return SearchVocab(v, word);
}