Example #1
void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(model_file, "rb");
  if (fin == NULL) {
    fprintf(stderr, "Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();

  if(recompute_train_counts) { // If training file changed, e.g. in fine-tuning
    FILE *fi = fopen(train_file, "rb");
    if (fi == NULL) {
      fprintf(stderr, "ERROR: training data file not found!\n");
      exit(1);
    }
    train_words = 0;
    while (1) {
      ReadWordIndex(fi);
      ++train_words;
      if (feof(fi)) break;
    }    
    fclose(fi);
  }

  if (debug_mode > 0) {
    fprintf(stderr, "Vocab size: %lld\n", vocab_size);
    fprintf(stderr, "Words in train file: %lld\n", train_words);
  }
  if(test_file[0] != 0 || gen != 0) return;

  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    fprintf(stderr, "ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}
Example #2
/*
  Converts a line in a file from text to an array of vocabulary
  indices, the input form for forward propagation.
*/
long long * FileToSen(int length, FILE* fi) {

    long long *sen = (long long *) calloc(MAX_SENTENCE_LENGTH,sizeof(long long));
    long long word = 0, sentence_length = 0;

    while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue; // skip out-of-vocabulary words
        if (word == 0) break;     // end of sentence

        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= length) break;
    }
    return sen;
}
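The buffer returned by FileToSen is allocated with calloc on every call, so the caller owns it and must free it. Below is a minimal, hypothetical caller sketch: ProcessFile is an invented name, and the MAX_SENTENCE_LENGTH value is only an assumed default.

#include <stdio.h>
#include <stdlib.h>

#define MAX_SENTENCE_LENGTH 1000  /* assumed default; use the project's own value */

extern long long *FileToSen(int length, FILE *fi);

/* Hypothetical driver: turn each line of a corpus file into vocabulary
   indices, run the forward pass, then release the buffer. */
void ProcessFile(const char *path) {
  FILE *fi = fopen(path, "rb");
  if (fi == NULL) return;
  while (!feof(fi)) {
    long long *sen = FileToSen(MAX_SENTENCE_LENGTH, fi);
    /* ... forward propagation over sen would go here ... */
    free(sen);  /* FileToSen allocates with calloc, so the caller frees */
  }
  fclose(fi);
}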
Example #3
void TrainModel() {
  long a;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  if (model_file[0] == 0) return;
  int iter = 0;

  FILE *t1 = fopen(model_file, "rb");
  FILE *t2 = fopen(model_file_nnet, "rb");
  if(t1 != NULL && t2 != NULL) {
    fclose(t1);
    fclose(t2);
    fprintf(stderr, "Restoring nnet from existing files %s, %s\n", model_file, model_file_nnet);
    LoadNnet();
  } else {
    LearnVocabFromTrainFile();
    if(maxent_hash_size) {
      maxent_hash_size *= 1000000;
      maxent_hash_size -= maxent_hash_size % vocab_size;
    }
    InitNet();
    SaveNnet();
  } 

  if(test_file[0] != 0) {
    counter = 0;
    real sumlogprob = EvaluateModel(test_file, 1);
    fprintf(stderr, "Test entropy %f\n", sumlogprob/log10(2)/(real)counter);
    return;
  }

  if(gen > 0) {
    Sample(gen, 0);
    return;
  } else if(gen < 0) {
    while(1) {
      Sample(-gen, 1);
    }
    return;
  }

  fprintf(stderr, "Starting training using file %s\n", train_file);

  FILE *fi = fopen(valid_file, "rb");
  valid_words = 0;
  while (1) {
    ReadWordIndex(fi);
    ++valid_words;
    if (feof(fi)) break;
  }    
  valid_file_size = ftell(fi);
  fclose(fi);

  real old_entropy = 1e99;
  real entropy;
  real diff = 1e99;
  int retry = 0;
  int decay = 0;
  while(retry < max_retry) {
    if(iter  != 0) {
      if(decay) {
	alpha /= 2.0;
	maxent_alpha /= 2.0;
      }
      word_count_actual = 0;
      counter = 0;
      start = clock();
      for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
      for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
    }
    fprintf(stderr, "Iteration %d\t", iter);
    sumlogprob_valid = 0;
    counter = 0;
    sumlogprob_valid = EvaluateModel(valid_file, 0);
    entropy = sumlogprob_valid/log10(2)/(real)counter;
    fprintf(stderr, "Valid Entropy %f", entropy);
    ++iter;

    diff = old_entropy/entropy;  
    if (isnan(entropy) || isinf(entropy) || diff < stop) {
      if (decay == 1) {
	++retry;
	fprintf(stderr, "\tRetry %d/%d", retry, max_retry);
      } else {
	decay = 1;
	fprintf(stderr, "\tDecay started");
      }
      if(isnan(entropy) || isinf(entropy) || diff < reject_threshold) {
	fprintf(stderr, "\tNnet rejected");
	FreeNnet();
	int debug_ = debug_mode;
	debug_mode = 0;
	LoadNnet();
	debug_mode = debug_;
      }
    }
    fprintf(stderr, "\n");

    if(diff > 1.0) {  
      SaveNnet();
      old_entropy = entropy;
    }
  }
}
Example #4
void Sample(int num_sentences, int interactive) {

  long long last_word;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  int begin = 0;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  sen[0] = 0;
  if(interactive) {
    printf("Enter the phrase to be continued:\n");
    while(1) {
      int word = ReadWordIndex(stdin);
      if(word == 0) break;
      if(word == -1) word = SearchVocab("<unk>");
      ++begin;
      sen[begin] = word;      
    }

  }

  int sentence = 0;
  while (sentence < num_sentences) {
    memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real)); // clean activations

    for(int i = 1; i <= begin; ++i) printf("%s ", vocab[sen[i]].word);
    if(begin) printf("| ");
    int input = 0;
    real logprob = 0.0;
    while(1) {

      if (input != 0) { 
	for(int c = 0; c < layer1_size; ++c) {
	  for(int d = 0; d < layer1_size; ++d) { neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d]; } // Recurrent hidden->hidden activation
	}
      }
      last_word = sen[input];
      for(int c = 0; c < layer1_size; ++c) {
	neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1+layer1_size*input, layer1_size);
    
      if(input < begin) {
	++input;
	continue;
      }

      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if(maxent_order) {
	for(int order = 0; order < maxent_order && input >= order; ++order) {
	  feature_hashes[order] = PRIMES[0]*PRIMES[1];    	    
	  for (int b = 1; b <= order; ++b) feature_hashes[order] += PRIMES[(order*PRIMES[b]+b) % PRIMES_SIZE]*(unsigned long long)(sen[input-b]+1);
	  feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
	}
      }
     
      int node = vocab_size - 2;
      while(node > 0) {
	// Propagate hidden -> output
	f = 0.0;
	l2 = node * layer1_size;
	for(int c = 0; c < layer1_size; ++c) {
	  f += neu1[input*layer1_size + c] * nnet.syn1[l2 + c];
	}
	for(int order = 0; order < maxent_order && input >= order; ++order) {
	  f += nnet.synMaxent[feature_hashes[order] + node];
	}
	f = exp(f)/(1+exp(f)); // sigmoid
	real random = rand() / (real)RAND_MAX;
	if (f > random) {
	  node = tree[node].child0; 
	  logprob += log10(f);
	} else {
	  node = tree[node].child1; 
	  logprob += log10(1-f);
	}
      }
      ++input;
      sen[input] = node + vocab_size;
      printf("%s ", vocab[sen[input]].word);
      if(sen[input] == 0 || input == MAX_SENTENCE_LENGTH) {
	printf("%f %f\n", logprob, logprob /(input-begin));
	break;
      }
    }
    ++sentence;
  }
  free(neu1);
}
Example #5
real EvaluateModel(char* filename, int printLoglikes) {
  long long d, word = -1, last_word, sentence_length = 0;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));

  FILE *fi = fopen(filename, "rb");
  real my_sumlogprob = 0;

  while (1) {
    if (feof(fi)) break;
   
    sen[0] = 0;
    int good = 1;
    sentence_length = 1;
    while(sentence_length < MAX_SENTENCE_LENGTH) {
      word = ReadWordIndex(fi);
      sen[sentence_length] = word;
      if (feof(fi) || word == 0) break;
      if( word == -1) good = 0;
      ++sentence_length;
    }
    if(good == 0) {
      if(printLoglikes) printf("OOV\n");
      continue;
    }
    if(sentence_length == 1 && feof(fi)) break;
    real sentence_logprob = 0.0;

    memset(neu1, 0, (long long)layer1_size * sentence_length * sizeof(real));

    for(int input = 0; input < sentence_length; ++input) {
      // Forward pass (not including final softmax)  
      if (input != 0) { 
	for(int c = 0; c < layer1_size; ++c) {
	  for(int d = 0; d < layer1_size; ++d) { neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d]; } // Recurrent hidden->hidden activation
	}
      }
      last_word = sen[input];
      for(int c = 0; c < layer1_size; ++c) {
	neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1+layer1_size*input, layer1_size);
    }

    for(int target = 1; target <= sentence_length; ++target) {
      // Forward pass (softmax)
      word = sen[target];
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if(maxent_order) {
	for(int order = 0; order < maxent_order && target - order >= 0; ++order) {
	  feature_hashes[order] = PRIMES[0]*PRIMES[1];    	    
	  for (int b = 1; b <= order; ++b) feature_hashes[order] += PRIMES[(order*PRIMES[b]+b) % PRIMES_SIZE]*(unsigned long long)(sen[target-b]+1);
	  feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
	}
      }
      real logprob = 0.0;
      for (d = 0; d < vocab[word].codelen; d++) {
	// Propagate hidden -> output
	f = 0.0;
	l2 = vocab[word].point[d] * layer1_size;
	for(int c = 0; c < layer1_size; ++c) {
	  f += neu1[layer1_size*(target - 1) + c] * nnet.syn1[l2 + c];
	}
	for(int order = 0; order < maxent_order && target - order >= 0; ++order) {
	  f += nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
	}
	logprob += log10(1+(vocab[word].code[d] == 1 ? exp(f) : exp(-f)));	
      }
      sentence_logprob += logprob;
      ++counter;
    }
    if(printLoglikes) printf("%f\n", -sentence_logprob);
    my_sumlogprob += sentence_logprob;
  }
  fclose(fi);
  free(neu1);
  return my_sumlogprob;
}
Example #6
void *TrainModelThread(void *id) {
  long long d, word = -1, last_word, sentence_length = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f, g;
  clock_t now;
  real *neu1, *neu1e;
  int full_block = bptt_block + bptt;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  posix_memalign((void **)&neu1e, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));

  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
#ifdef DEBUG
  real my_sumlogprob = 0;
#endif
  if ((long long)id != 0) {
    while (word != 0 && !feof(fi)) { // skip to the next newline
      word = ReadWordIndex(fi);
    }
  }

  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if (debug_mode > 1) {
        now=clock();
        fprintf(stderr, "%cAlpha: %f  ME-alpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk\t", 13, alpha, maxent_alpha,
		word_count_actual / (real)(train_words + 1) * 100,
		word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
    }
    if (feof(fi) || word_count > train_words / num_threads) break;
   
    sen[0] = 0; // <s> token -- beginning of sentence
    int good = 1;
    sentence_length = 1;
    while(sentence_length < MAX_SENTENCE_LENGTH) {
      word = ReadWordIndex(fi);
      ++word_count;
      sen[sentence_length] = word;
      if (feof(fi) || word == 0) break;
      if (word == -1) good = 0;
      ++sentence_length;
    }

    if(good == 0) continue;
    if(sentence_length == 1 && feof(fi)) break;
    
    memset(neu1e, 0, (long long)layer1_size * sentence_length * sizeof(real)); // clear gradients  
    memset(neu1, 0, (long long)layer1_size * sentence_length * sizeof(real)); // clear activations
#ifdef DEBUG
    real sentence_logprob = 0.0;
#endif

    for(int input = 0; input < sentence_length; ++input) {
      // Forward pass (not including final softmax)  
      if (input != 0) { 
	for(int c = 0; c < layer1_size; ++c) {
	  for(int d = 0; d < layer1_size; ++d) { 
	    neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d];  // Recurrent hidden->hidden activation
	  }
	}
      }
      last_word = sen[input];
      for(int c = 0; c < layer1_size; ++c) {
	neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1+layer1_size*input, layer1_size);
    }
    
    for(int target = 1; target <= sentence_length; ++target) {
      // Forward pass (softmax)
      word = sen[target];
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if(maxent_order) {
	for(int order = 0; order < maxent_order && target - order >= 0; ++order) {
	  feature_hashes[order] = PRIMES[0]*PRIMES[1];    	    
	  for (int b = 1; b <= order; ++b) feature_hashes[order] += PRIMES[(order*PRIMES[b]+b) % PRIMES_SIZE]*(unsigned long long)(sen[target-b]+1);
	  feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
	}
      }
      for (d = 0; d < vocab[word].codelen; d++) {
	// Propagate hidden -> output
	f = 0.0;
	l2 = vocab[word].point[d] * layer1_size;
	for(int c = 0; c < layer1_size; ++c) {
	  f += neu1[layer1_size*(target - 1) + c] * nnet.syn1[l2 + c];
	}
	for(int order = 0; order < maxent_order && target - order >= 0; ++order) {
	  f += nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
	}
#ifdef DEBUG
	sentence_logprob += log10(1+(vocab[word].code[d] == 1 ? exp(f) : exp(-f)));
#endif
	f = exp(f)/(1+exp(f)); // sigmoid
	g = (1 - vocab[word].code[d] - f); 
	g = g > MAX_GRAD ? MAX_GRAD : g;
	g = g < MIN_GRAD ? MIN_GRAD : g;
	real g_alpha = g * alpha; // 'g_alpha' is the gradient multiplied by the learning rate
	real g_maxentalpha = g * maxent_alpha;

	// Propagate errors output -> hidden
	for(int c = 0; c < layer1_size; ++c) {
	  neu1e[layer1_size * (target - 1) + c] += g_alpha * nnet.syn1[l2 + c];
        }

	// Learn weights hidden -> output
	for(int c = 0; c < layer1_size; ++c) {
	  nnet.syn1[l2 + c] += g_alpha * neu1[layer1_size*(target - 1) + c] - beta * nnet.syn1[l2 + c];
        }
	for(int order = 0; order < maxent_order && target - order >= 0; ++order) {
          nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]] += g_maxentalpha - maxent_beta * nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
        }
      }
    }
#ifdef DEBUG
    my_sumlogprob += sentence_logprob;
#endif
    // Backpropagation through time pass
    int my_bptt = 0;
    for(int input = sentence_length - 1; input >= 0; --input) {
	MultiplySigmoidDerivative(neu1+layer1_size*input, layer1_size, neu1e+layer1_size*input);  
	last_word = sen[input];

	for(int c = 0; c < layer1_size; ++c) {
	  nnet.syn0[layer1_size*last_word + c] += neu1e[layer1_size*input + c] - beta * nnet.syn0[layer1_size*last_word + c]; // Input weight update
	}

	long long word_num = word_count - (input - sentence_length);
	if(full_block == 0 || word_num % full_block == 0) {
	  my_bptt = bptt;
	}
	if(input > 0 && (bptt == 0 || my_bptt > 0 )) {
	  // Work with recurrent weights: backpropagate
	  for(int c = 0; c < layer1_size; ++c) {
	    for(int d = 0; d < layer1_size; ++d) {
	      neu1e[(input-1)*layer1_size + d] += nnet.synRec[c*layer1_size + d] * neu1e[input*layer1_size + c];  // Recurrent hidden->hidden backprop
	    }
	  }
	  --my_bptt;
	}
    } // End BPTT loop

    for(int input = sentence_length - 1; input > 0; --input) {
      // Work with recurrent weights: update
 	for(int c = 0; c < layer1_size; ++c) {
	  for(int d = 0; d < layer1_size; ++d) { 
	    nnet.synRec[c*layer1_size + d] += neu1e[input*layer1_size + c] * neu1[(input-1)*layer1_size + d] - beta * nnet.synRec[c*layer1_size + d]; // Recurrent hidden->hidden weight update
	  }
	}
     }

  } // End main training loop

#ifdef DEBUG
  if((long long)id == 0) fprintf(stderr, "Train Entropy (thread %lld, word count %lld) %f\t", (long long)id, word_count, my_sumlogprob/log10(2)/(real)word_count);
#endif

  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
Example #7
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0,
                                          sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13,
               alpha, word_count_actual / (real)(iter * train_words + 1) * 100,
               word_count_actual /
                   ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha *
              (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the
        // ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) *
                     (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) {  // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++)
            neu1[c] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * layer1_size;
            // Propagate hidden -> output
            for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int)((f + MAX_EXP) *
                                 (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
          }
        // NEGATIVE SAMPLING
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random * (unsigned long long)25214903917 + 11;
              target = table[(next_random >> 16) % table_size];
              if (target == 0) target = next_random % (vocab_size - 1) + 1;
              if (target == word) continue;
              label = 0;
            }
            l2 = target * layer1_size;
            f = 0;
            for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else
              g = (label - expTable[(int)((f + MAX_EXP) *
                                          (EXP_TABLE_SIZE / MAX_EXP / 2))]) *
                  alpha;
            for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
            for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
          }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0) continue;
            if (c >= sentence_length) continue;
            last_word = sen[c];
            if (last_word == -1) continue;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c];
          }
      }
    } else {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          l1 = last_word * layer1_size;
          for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
          // HIERARCHICAL SOFTMAX
          if (hs)
            for (d = 0; d < vocab[word].codelen; d++) {
              f = 0;
              l2 = vocab[word].point[d] * layer1_size;
              // Propagate hidden -> output
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1[c + l2];
              if (f <= -MAX_EXP)
                continue;
              else if (f >= MAX_EXP)
                continue;
              else
                f = expTable[(int)((f + MAX_EXP) *
                                   (EXP_TABLE_SIZE / MAX_EXP / 2))];
              // 'g' is the gradient multiplied by the learning rate
              g = (1 - vocab[word].code[d] - f) * alpha;
              // Propagate errors output -> hidden
              for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
              // Learn weights hidden -> output
              for (c = 0; c < layer1_size; c++)
                syn1[c + l2] += g * syn0[c + l1];
            }
          // NEGATIVE SAMPLING
          if (negative > 0)
            for (d = 0; d < negative + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random =
                    next_random * (unsigned long long)25214903917 + 11;
                target = table[(next_random >> 16) % table_size];
                if (target == 0) target = next_random % (vocab_size - 1) + 1;
                if (target == word) continue;
                label = 0;
              }
              l2 = target * layer1_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1neg[c + l2];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else
                g = (label - expTable[(int)((f + MAX_EXP) *
                                            (EXP_TABLE_SIZE / MAX_EXP / 2))]) *
                    alpha;
              for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
              for (c = 0; c < layer1_size; c++)
                syn1neg[c + l2] += g * syn0[c + l1];
            }
          // Learn weights input -> hidden
          for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
        }
    }
Example #8
// learning: hs (hierarchical softmax) vs. negative sampling
// model: cbow vs. skip-gram
void *TrainModelThread(void *id) {
    // word: used when adding words to sen; once the sentence is built, it is the current word in the sentence
    // last_word: the previous word, used while scanning the window
    // sentence_length: length of the current sentence (number of words)
    // sentence_position: index of the current word within the current sentence
    long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
    // word_count: total length of the corpus processed so far
    // last_word_count: saved value, so progress can be printed whenever the processed length grows past a threshold
    // sen: array of words representing the sentence
    long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
    // l1: in negative sampling, the starting offset of word in the concatenated word vectors; the next layer1_size
    // entries are the corresponding word vector (the matrix is flattened into one long vector)
    // l2: starting offset of the weight vector in cbow or negative sampling; the next layer1_size entries are the
    // corresponding syn1 or syn1neg (again, the matrix is flattened)
    // c: loop counter
    // target: the current sample in negative sampling
    // label: the label of the current sample in negative sampling

    long long l1, l2, c, target, label;
    // id is passed in at thread creation and seeds the random number generator
    unsigned long long next_random = (long long) id;
    // f: e^x / (1 + e^x); in hs it is the probability that the current code bit is 0
    // (a parent's left child is coded 0, the right child 1); in negative sampling it is the probability that the label is 1
    // g: the error (deviation of f from the true value) multiplied by the learning rate
    real f, g; // function and gradient
    clock_t now;
    // hidden layer units
    real * neu1 = (real *)calloc(layer1_size, sizeof(real));
    // accumulated error term; effectively the gradient with respect to neu1
    real * neu1e = (real *)calloc(layer1_size, sizeof(real));
    // each thread is assigned its own slice of the training file
    FILE * fi = fopen(train_file, "rb");
    fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

    while (1) {
        if (word_count - last_word_count > 10000) {
            word_count_actual += word_count - last_word_count;
            last_word_count = word_count;
            if (debug_mode > 1) {
                now = clock();
                printf("%cAlpah: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
                       word_count_actual / (real)(train_words + 1) * 100,
                       word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
                fflush(stdout);
            }
                alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1)); // automatically adjust the learning rate
                if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;// the learning rate has a lower bound
        }
        if (sentence_length == 0) {// if the current sentence length is 0
            while(1) {
                word = ReadWordIndex(fi);
                if (feof(fi)) break;// reached the end of the file
                if (word == -1) continue;// this word is not in the vocabulary
                word_count++;// increase the word count
                if (word == 0) break;// end-of-sentence (newline) token
                // This is Mikolov's sub-sampling; the paper reports a 2x-10x speedup and better representations for infrequent words.
                // Infrequent words are discarded with low probability, frequent words with high probability.
                if (sample > 0) {
                    real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
                    next_random = next_random * (unsigned long long)25214903917 + 11;
                    if (ran < (next_random & 0xFFFF) / (real)65536) continue;
                }
                sen[sentence_length] = word;
                sentence_length++;
                if (sentence_length >= MAX_SENTENCE_LENGTH) break;
            }
            sentence_position = 0;// index of the current word in the current sentence, starting at 0
        }
        // matching the break in the loop above: exit if the end of the file was reached
        if (feof(fi)) break;
        // exit once this thread has processed its share of the words
        if (word_count > train_words / num_threads) break;
        // take the word at the current sentence position and run the BP algorithm
        word = sen[sentence_position];
        if (word == -1) continue;
        // clear the hidden-layer units and the accumulated hidden-layer error
        for (c = 0; c < layer1_size; c++) neu1[c] = 0;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        next_random = next_random * (unsigned long long)25214903917 + 11;
        // b is a random number in [0, window-1] that sets the effective window size for this step
        b = next_random % window;

        if (cbow) { // train the cbow architecture - HS or NS
            // IN -> HIDDEN
            // sum the word vectors inside the window onto the hidden-layer units
            for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
                c = sentence_position - window + a;
                if (c < 0) continue;
                if (c >= sentence_length) continue;
                last_word = sen[c];
                if (last_word == -1) continue;// this word is not in the vocabulary
                for (c = 0; c < layer1_size; c++)
                    neu1[c] += syn0[c + last_word * layer1_size];
            }
            // HIERARCHICAL SOFTMAX
            // at every node on the path from the root of the Huffman tree to the current center word,
            // neu1 and the node vector are combined for a binary sigmoid classification; the label is 1 - code[j]
            // (see the sketch after this example)
            if (hs) for (d = 0; d < vocab[word].codelen; d++) {
                // codelen is actually one short, so the final negative entry of point is never touched
                f = 0;
                l2 = vocab[word].point[d] * layer1_size;
                // propagate hidden -> output
                // prepare to compute f
                for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
                // values outside the range of expTable are simply skipped -- rather crude
                if (f <= -MAX_EXP) continue;
                else if (f >= MAX_EXP) continue;
                // looked up from expTable for fast computation
                else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                // g is the gradient multiplied by the learning rate
                g = (1 - vocab[word].code[d] - f) * alpha;
                // propagate errors output -> hidden
                // accumulate the error term
                for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                // learning weights hidden -> output
                // update the weights from the hidden layer to the non-leaf Huffman node
                for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
            }
            // NEGATIVE SAMPLING
            if (negative > 0) for (d = 0; d < negative + 1; d++) {
                if (d == 0) { // the classifier for the current word should output 1
                    target = word;
                    label = 1;
                } else { // sample until target differs (otherwise continue); label is 0, i.e. at most 'negative' negative samples are drawn
                    next_random = next_random * (unsigned long long)25214903917 + 11;
                    target = table[(next_random >> 16) % table_size];
                    if (target == 0) target = next_random % (vocab_size - 1) + 1;
                    if (target == word) continue;
                    label = 0;
                }
                l2 = target * layer1_size;
                f = 0;
                for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
                // 0 and 1 are used directly here, without worrying about numerical precision...
                if (f > MAX_EXP) g = (label - 1) * alpha;
                else if (f < -MAX_EXP) g = (label - 0) * alpha;
                else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
            }
            // hidden -> in
            // update the word vectors from the accumulated hidden-layer error
            for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
                c = sentence_position - window + a;
                if (c < 0) continue;
                if (c >= sentence_length) continue;
                last_word = sen[c];
                if (last_word == -1) continue;
                for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
            }
        } else {  //train skip-gram
            for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { // predict the non-center words (the words in the neighborhood)
                c = sentence_position - window + a;
                if (c < 0) continue;
                if (c >= sentence_length) continue;
                last_word = sen[c];
                if (last_word == -1) continue;
                l1 = last_word * layer1_size;
                // clear the accumulated error term
                for (c = 0; c < layer1_size; c++) neu1e[c] = 0;

                // HIERARCHICAL SOFTMAX
                if (hs) for (d = 0; d < vocab[word].codelen; d++) {
                    f = 0;
                    l2 = vocab[word].point[d] * layer1_size;
                    // Propagate hidden -> output
                    // inner product of the word vector to be predicted and the hidden-to-Huffman-node weights
                    for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
                    // see the discussion of hs in the cbow branch
                    // if (f <= -MAX_EXP) continue;
                    // else if (f >= MAX_EXP) continue;
                    if (f <= -MAX_EXP) f = 0;
                    else if (f >= MAX_EXP) f = 1;
                    // the rest is the same as in the cbow branch
                    else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
                    // 'g' is the gradient multiplied by the learning rate
                    g = (1 - vocab[word].code[d] - f) * alpha; // code[d] here actually belongs to the next level; point and code are offset from each other!
                    // Propagate errors output -> hidden
                    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
                    // Learn weights hidden -> output
                    for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
                }
                // NEGATIVE SAMPLING
                if (negative > 0) for (d = 0; d < negative + 1; d++) {
                    if (d == 0) {
                        target = word;
                        label = 1;
                    } else {
                        next_random = next_random * (unsigned long long)25214903917 + 11;
                        target = table[(next_random >> 16) % table_size];
                        if (target == 0) target = next_random % (vocab_size - 1) + 1;
                        if (target == word) continue;
                        label = 0;
                    }
                    l2 = target * layer1_size;
                    f = 0;
                    for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
                    // the rest is the same as in the cbow branch
                    if (f > MAX_EXP) g = (label - 1) * alpha;
                    else if (f < -MAX_EXP) g = (label - 0) * alpha;
                    else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
                    for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
                    for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
                }
                // Learn weights input -> hidden
                for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
            }
        }
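The hierarchical-softmax loop above can be read as follows: the probability of the target word is the product, over the inner nodes on its Huffman path, of a binary sigmoid decision whose label is 1 - code[d]. The helper below is only a sketch of that computation; the parameter names (point, code, codelen, layer1_size, syn1) mirror the example, but HsLogProb itself is a hypothetical function, not part of the original code.

#include <math.h>

typedef float real;

/* Hypothetical helper: log-probability assigned by hierarchical softmax to one
   word, given the hidden vector neu1 and the output weight matrix syn1. */
double HsLogProb(const real *neu1, const real *syn1,
                 const int *point, const char *code,
                 int codelen, int layer1_size) {
  double logprob = 0.0;
  for (int d = 0; d < codelen; d++) {
    double f = 0.0;
    const real *node = syn1 + (long long)point[d] * layer1_size;
    for (int c = 0; c < layer1_size; c++) f += neu1[c] * node[c];
    double sigm = 1.0 / (1.0 + exp(-f));              /* probability that this code bit is 0 */
    logprob += log(code[d] == 0 ? sigm : 1.0 - sigm); /* observed bit: label = 1 - code[d] */
  }
  return logprob;
}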
Example #9
//Before this thread function runs, some work has already been done: the vocabulary is sorted by frequency and every word has its Huffman code
void *TrainModelThread(void *id)
{
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  //each thread handles one segment of the text; the thread id determines the starting position of its segment
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1)
  {
    if (word_count - last_word_count > 10000)
    {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1))
      {
        now=clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0)
    {
      while (1)
      {
        word = ReadWordIndex(fi);//read one word from the file stream and return its position in the vocabulary
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0)//sub-sample frequent words while keeping the ranking unchanged
        {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        //1000 words are treated as one sentence?
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi)) break;
    if (word_count > train_words / num_threads) break;//exit if this thread has already processed more words than its share
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow)
    {  //train the cbow architecture
      // in -> hidden
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window)//scan the words to the left and right of the target word
      {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++)//layer1_size is the dimensionality of the word vectors, default 100
        	neu1[c] += syn0[c + last_word * layer1_size];//the famous vector sum?
      }
      if (hs) for (d = 0; d < vocab[word].codelen; d++)//start walking the Huffman tree, one node at a time
      {
        f = 0;
        l2 = vocab[word].point[d] * layer1_size;//point records the Huffman path; locate the current node and compute its offset
        // Propagate hidden -> output
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];//compute the inner product
        if (f <= -MAX_EXP) continue;//inner products outside the range are simply discarded
        else if (f >= MAX_EXP) continue;
        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];//sigmoid of the inner product, looked up from expTable (see the sketch after this example)
        // 'g' is the gradient multiplied by the learning rate
        g = (1 - vocab[word].code[d] - f) * alpha;//part of the partial derivative

        //layer1_size is the dimensionality of the vectors
        // Propagate errors output -> hidden: backpropagate the error from the Huffman tree to the hidden layer. The error of the current inner node is propagated to the hidden layer below; syn1[c + l2] is part of the partial derivative.
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];

        // Learn weights hidden -> output: update the current inner node's vector; neu1[c] is part of the partial derivative
        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
      }
      // NEGATIVE SAMPLING
      if (negative > 0)
      for (d = 0; d < negative + 1; d++)
      {
        if (d == 0)
        {
          target = word;//the target word
          label = 1;//positive sample
        }
        else
        {
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = table[(next_random >> 16) % table_size];
          if (target == 0) target = next_random % (vocab_size - 1) + 1;
          if (target == word) continue;
          label = 0;//negative sample
        }
        l2 = target * layer1_size;
        f = 0;
        for (c = 0; c < layer1_size; c++)
        	f += neu1[c] * syn1neg[c + l2];//inner product
        if (f > MAX_EXP)
        	g = (label - 1) * alpha;
        else if (f < -MAX_EXP)
        	g = (label - 0) * alpha;
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
        for (c = 0; c < layer1_size; c++)
        	neu1e[c] += g * syn1neg[c + l2];//error of the hidden layer
        for (c = 0; c < layer1_size; c++)
        	syn1neg[c + l2] += g * neu1[c];//update the negative-sample vector
      }
      // hidden -> in
      for (a = b; a < window * 2 + 1 - b; a++)
      if (a != window)//in cbow it is not the center word's vector that is updated but the vectors of the surrounding words
      {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++)
        	syn0[c + last_word * layer1_size] += neu1e[c];//update the word vectors
      }
    }
    else
    {  //train skip-gram
       for (a = b; a < window * 2 + 1 - b; a++)
       if (a != window)//scan the surrounding words
       {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++)
        	neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs)
        for (d = 0; d < vocab[word].codelen; d++)//walk the nodes on the word's Huffman path
        {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;//point records the Huffman path
          // Propagate hidden -> output: the original English comment is a bit misleading; the "hidden layer" here is just the input layer, i.e. the word vector.
          for (c = 0; c < layer1_size; c++)
        	  f += syn0[c + l1] * syn1[c + l2];//compute the inner product of the two vectors
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;//part of the partial derivative
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++)
        	  neu1e[c] += g * syn1[c + l2];//error of the hidden layer
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++)
        	  syn1[c + l2] += g * syn0[c + l1];//update the tree node's vector
        }
        // NEGATIVE SAMPLING
        if (negative > 0)//much the same as in cbow
        for (d = 0; d < negative + 1; d++)
        {
          if (d == 0)
          {
            target = word;
            label = 1;
          }
          else
          {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++)
        	  f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP)
        	  g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++)
        	  neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++)
        	  syn1neg[c + l2] += g * syn0[c + l1];
        }

        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++)
        	syn0[c + l1] += neu1e[c];//update the vectors of the surrounding words
      }
    }
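The expTable lookups above replace calls to exp() with a precomputed sigmoid table over (-MAX_EXP, MAX_EXP). In the original word2vec sources this table is built once in main(); the sketch below reproduces that idea with the usual default constants, and BuildExpTable is just an illustrative wrapper name.

#include <math.h>
#include <stdlib.h>

#define EXP_TABLE_SIZE 1000  /* word2vec default */
#define MAX_EXP 6            /* word2vec default */

typedef float real;

/* Precompute sigmoid(x) = e^x / (e^x + 1) at EXP_TABLE_SIZE points of
   [-MAX_EXP, MAX_EXP); a dot product f is then mapped to an index with
   (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)), as in the loops above. */
real *BuildExpTable(void) {
  real *expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  if (expTable == NULL) return NULL;
  for (int i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
  return expTable;
}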
Example #10
// Training model thread: the training procedure
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));  //corresponds to Xw
  real *neu1e = (real *)calloc(layer1_size, sizeof(real)); //accumulated error term
  FILE *fi = fopen(train_file, "rb");
  //each thread handles one segment of the text
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now=clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi); //read one word and return its index in the vocabulary
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        // Frequent words are discarded with probability p = 1 - [sqrt(t/f(w)) + t/f(w)], where t = sample and f(w) is the word's relative frequency; the frequency ranking is preserved.
        // Compute ran = sqrt(t/f(w)) + t/f(w), draw a random number r in (0,1), and discard the word if r > ran (see the sketch after this example).
        if (sample > 0) { 
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        // 1000 words are treated as one sentence
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    // the end of the file was reached or this thread has processed more words than its share
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
	// b is a random offset in [0, window-1] that shrinks the effective window
    b = next_random % window;
    if (cbow) {  //train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        // sum the context word vectors to obtain Xw
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
      	// normalization? (average over the cw context words)
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        // hs: hierarchical softmax using the Huffman tree
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; //inner node on the path
          // Propagate hidden -> output
          // hidden layer to output layer: compute the error gradient
          // neu1 corresponds to Xw, syn1 to the inner node's vector
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; //compute the inner product
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];//sigmoid
          // 'g' is the gradient multiplied by the learning rate
          // the gradient for the inner node's vector θ is (1 - d - sigmoid(Xw·θ)) * Xw; g is the leading scalar factor
          g = (1 - vocab[word].code[d] - f) * alpha;
          
          // Propagate errors output -> hidden
          // backpropagate the error from the Huffman tree to the hidden layer
          // accumulated gradient update
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          // update the inner node's vector
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word; //the target word
            label = 1;   //positive sample
          } else {//draw a negative sample
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; //inner product
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; //sigmoid
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; //accumulate the error gradient
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];  //update the parameter vector
        }
        // hidden -> in
    	// update the vectors of the context words.
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else {  //train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) { //walk the nodes on the word's Huffman path
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; //point holds the nodes on the path
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; //inner product
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; //sigmoid
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha; //part of the gradient
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; //error of the hidden layer
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; //update the inner node's vector
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; //update the input word's vector
      }
    }
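The sub-sampling test in the reading loop above can be read as a keep probability: with t = sample and f(w) = cn / train_words, a word is kept with probability sqrt(t/f(w)) + t/f(w) (effectively capped at 1, since the random draw lies in [0,1)). The helper below just makes that formula explicit; KeepProbability is a hypothetical name, not a function from the original code.

#include <math.h>

typedef float real;

/* Hypothetical helper: probability that a word with corpus count cn is kept
   when sub-sampling with threshold 'sample' over train_words tokens.  It
   matches ran = (sqrt(cn/(sample*train_words)) + 1) * (sample*train_words)/cn
   as computed in the training loop above. */
real KeepProbability(long long cn, long long train_words, real sample) {
  real f_over_t = cn / (sample * train_words);  /* f(w) / t */
  return (sqrt(f_over_t) + 1) / f_over_t;       /* = sqrt(t/f(w)) + t/f(w) */
}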
Example #11
void *TrainCBOWModelThread(void *arg) {

	/*Get Parameters*/
	threadParameters *params = arg;

	vocabulary *voc = params->voc;	 //shared
	int id = params->threadNumber;
	int MAX_EXP = params->max_exp;
	int layer1_size = params->layer1_size;
	int num_threads = params->num_threads;
	int file_size = params->file_size;
	int window = params->window;
	int hs = params->hs;
	int negative = params->negative;
	int EXP_TABLE_SIZE = params->exp_table_size;
	int table_size = params->table_size;
	long long int *word_count_actual = params->word_count_actual; //shared
	int *table = params->table;
	char *train_file = params->train_file;
	real starting_alpha = params->starting_alpha;
	real sample = params->sample;
	real *alpha = params->alpha;	 //shared
	real *syn0  = params->syn0; 	//shared
	real *syn1 = params->syn1;		 //shared
	real *syn1neg = params->syn1neg;	 //shared
	real *expTable = params->expTable;	 //shared

	free(arg); //arg is not needed anymore

	long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
	long long  l2, c, target, label;
	unsigned long long next_random = (long long)id;

	real f, g;
	clock_t now;
	int start = 0;

	real *neu1 = (real *)calloc(layer1_size, sizeof(real)); //one vector
	real *neu1e = (real *)calloc(layer1_size, sizeof(real)); 
	FILE *fi = fopen(train_file, "rb");

	fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);

	while (1) {


		if (word_count - last_word_count > 10000) {
			(*word_count_actual) += word_count - last_word_count;
			last_word_count = word_count;

			if ((DEBUG_MODE > 1)) {
				now=clock();
				printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13, (*alpha),
				(*word_count_actual) / (real)(voc->train_words + 1) * 100,
				(*word_count_actual) / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
				fflush(stdout);
			}

			(*alpha) = starting_alpha * (1 - (*word_count_actual) / (real)(voc->train_words + 1));

			if ((*alpha) < starting_alpha * 0.0001)
				(*alpha) = starting_alpha * 0.0001;
		}

		if (sentence_length == 0) {

			while (1) {
				
				if (feof(fi))
					break;
				
				word = ReadWordIndex(voc,fi);
				

				if (word == -1)
					continue;

				word_count++;

				if (word == 0)
					break;

				// The subsampling randomly discards frequent words while keeping the ranking same
				if (sample > 0) {
					real ran = (sqrt(voc->vocab[word].cn / (sample * voc->train_words)) + 1) * (sample * voc->train_words) / voc->vocab[word].cn;
					next_random = next_random * (unsigned long long)25214903917 + 11;
					
					if (ran < (next_random & 0xFFFF) / (real)65536)
						continue;
				}
				sen[sentence_length] = word;
				sentence_length++;

				if (sentence_length >= MAX_SENTENCE_LENGTH)
					break;
			}
			
			sentence_position = 0;
		}

		if (feof(fi)) //end file
			break;

		if (word_count > voc->train_words / num_threads) //trained all word
			break;

		word = sen[sentence_position]; //index

		if (word == -1) 
			continue;

		for (c = 0; c < layer1_size; c++)
			neu1[c] = 0;

		for (c = 0; c < layer1_size; c++)
			neu1e[c] = 0;

		next_random = next_random * (unsigned long long)25214903917 + 11;

		b = next_random % window;

		/*--- Training ---*/

		// in -> hidden
		for (a = b; a < window * 2 + 1 - b; a++) //a = [0 window]->[(window*2+1)-rand] -> dynamic window
			if (a != window) {

				c = sentence_position - window + a;
				
				if (c < 0) continue;

				if (c >= sentence_length) continue;

				last_word = sen[c]; //index of word

				if (last_word == -1) continue;

				for (c = 0; c < layer1_size; c++) // c is each vector index
					neu1[c] += syn0[c + last_word * layer1_size]; //sum of all vectors in input window (fig cbow) -> vectors on hidden
		}

		if (hs)
			for (d = 0; d < voc->vocab[word].codelen; d++) {
				f = 0;
				l2 = voc->vocab[word].point[d] * layer1_size; //offset of word
				// Propagate hidden -> output
				for (c = 0; c < layer1_size; c++)
					f += neu1[c] * syn1[c + l2]; //sum vectors input window * word weights on syn1 -> output vectors

				if (f <= -MAX_EXP) //sigmoid activation function - precalculated in expTable
					continue;
				else if (f >= MAX_EXP)
					continue;
				else
					f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];

				// 'g' is the gradient multiplied by the learning rate
				g = (1 - voc->vocab[word].code[d] - f) * (*alpha); 
				// Propagate errors output -> hidden
				for (c = 0; c < layer1_size; c++)
					neu1e[c] += g * syn1[c + l2]; //save to modify vectors
				// Learn weights hidden -> output
				for (c = 0; c < layer1_size; c++)
					syn1[c + l2] += g * neu1[c]; //modify weights
		}
		// NEGATIVE SAMPLING
		if (negative > 0)
			for (d = 0; d < negative + 1; d++) {
				if (d == 0) {
					target = word;
					label = 1; //(w,c) in corpus
				} else {
					next_random = next_random * (unsigned long long)25214903917 + 11;
					target = table[(next_random >> 16) % table_size];

					if (target == 0) 
						target = next_random % (voc->vocab_size - 1) + 1;

					if (target == word)
						continue;

					label = 0; //(w,c) not in corpus
				}

				l2 = target * layer1_size; //get word vector index
				f = 0;

				for (c = 0; c < layer1_size; c++)
					f += neu1[c] * syn1neg[c + l2]; //vector*weights

				if (f > MAX_EXP) //sigmoid
					g = (label - 1) * (*alpha);
				else if (f < -MAX_EXP)
					g = (label - 0) * (*alpha);
				else
					g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * (*alpha);

				for (c = 0; c < layer1_size; c++)
					neu1e[c] += g * syn1neg[c + l2]; //saving error

				for (c = 0; c < layer1_size; c++)
					syn1neg[c + l2] += g * neu1[c];
		}
		// hidden -> in
		for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
			c = sentence_position - window + a;

			if (c < 0)
				continue;

			if (c >= sentence_length)
				continue;
			last_word = sen[c];

			if (last_word == -1)
				continue;

			for (c = 0; c < layer1_size; c++)
				syn0[c + last_word * layer1_size] += neu1e[c];  //modify word vectors with error
		}
		

		sentence_position++;

		if (sentence_position >= sentence_length) {
			sentence_length = 0;
			continue;
		}
	}

	fclose(fi);
	free(neu1);
	free(neu1e);
	pthread_exit(NULL);
}
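/*
  Both TrainModelThread variants here draw noise words for negative sampling via
  table[(next_random >> 16) % table_size]. The sketch below shows how such a
  unigram table is typically prepared in word2vec-style code; it assumes the
  globals table, table_size, vocab and vocab_size used above and the 0.75
  smoothing power of the original word2vec, so treat it as an illustrative
  sketch rather than the exact initializer of this codebase.
*/
void InitUnigramTableSketch() {
  long long a, i;
  double train_words_pow = 0, d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  // Total smoothed mass: sum of cn^0.75 over the whole vocabulary
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow;
  // Fill the table so each word occupies a share of slots proportional to cn^0.75
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}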
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0, sen_count = 0, layer1_size_all = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1, *neu1e;
  if (global_image) {
    neu1 = (real *)calloc(layer1_size + layer1_image_size, sizeof(real));
    neu1e = (real *)calloc(layer1_size + layer1_image_size, sizeof(real));
  } else {
    neu1 = (real *)calloc(layer1_size, sizeof(real));
    neu1e = (real *)calloc(layer1_size, sizeof(real));
  }
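  // Each thread reads its own slice of the training data: the file is split into
  // num_threads equal byte ranges and thread 'id' seeks to the start of its range.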
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now=clock();
        printf("%cAlpha: %f  Progress: %.3f%%  Words/thread/sec: %.2fk  ", 13, alpha,
         word_count_actual / (real)(iter * train_words + 1) * 100,
         word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    // Take one sentence/paragraph in sen, iterate over each word in the sentence
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        // put words in a sentence in sen
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      if (!feof(fi)) sen_count++;
      //printf("Id, sentence num/length is %lld, %lld,%lld, %lld\n", (long long)id,sen_count,sentence_length, feof(fi));
      sentence_position = 0;
    }
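    // End of this thread's slice (EOF or word budget exhausted): count one finished
    // pass, stop once 'iter' passes are done, otherwise rewind to the slice start.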
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      printf("id/ word count/ word_count_actual / iter/: %lld,%lld, %lld, %lld\n", (long long)id, word_count, word_count_actual, local_iter);
      // End training
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      // sen_count is only meaningful when running a single thread; reset it before the next pass
      sen_count = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    //printf("Sentence position is %lld\n", sentence_position);
    word = sen[sentence_position];
    if (word == -1) continue;
    if (global_image) {
      for (c = 0; c < layer1_size + layer1_image_size; c++) neu1[c] = 0;
      for (c = 0; c < layer1_size + layer1_image_size; c++) neu1e[c] = 0; 
    } else {
      for (c = 0; c < layer1_size; c++) neu1[c] = 0;
      for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    }
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) {  //train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 1 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        // skip position 0: with sentence_vectors it holds the sentence/paragraph id, not a context word
        if (sentence_vectors && (c == 0)) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        // Sum over word vectors in local windows
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      // To train sentence vectors, sentence vector id is in sen[0].
      if (sentence_vectors) {
        last_word = sen[0];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
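      // cw counts how many vectors were actually summed into neu1 (context words
      // plus, optionally, the sentence vector); if nothing contributed, the whole
      // update for this position is skipped.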
      if (cw) {
        // average of the vectors summed above (the CBOW hidden layer)
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
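        // With global_image set, an image feature vector (taken from syn0_im and
        // indexed by the running sentence count) is appended to the hidden layer;
        // its contribution to the score is scaled by factor_image further down.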
        if (global_image) {
          for (c = layer1_size; c < layer1_size + layer1_image_size; c++) neu1[c] += 
            syn0_im[c + (sen_count-1) * layer1_image_size];
        }
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          // for each node along the path
          f = 0;
          // l2 is the offset of the d-th inner node on the Huffman-tree path to 'word'
          l2 = vocab[word].point[d] * layer1_size_all;
          // Propagate hidden -> output
          // neu1: averaged context vector; syn1: output weights; syn0: original word vectors
          //for (c = 0; c < layer1_size_all; c++) f += neu1[c] * syn1[c + l2];
          // apply different weight from image and word
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1[c + l2];
          }
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          // f = sigmoid(neu1 . syn1)
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          //printf("d is %f\n", 1- vocab[word].code[d]);
          //printf("f and gt, and g is %f, %f\n", f, g);
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size_all; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size_all; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size_all;
          f = 0;
          //for (c = 0; c < layer1_size_all; c++) f += neu1[c] * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1neg[c + l2];
          }
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size_all; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size_all; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        for (a = b; a < window * 1 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          // back propagation, skip the sentence vector
          if (sentence_vectors && (c == 0)) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
        // For every word in the sentence, update the sentence vector
        if (sentence_vectors) {
          last_word = sen[0];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
        if (global_image) {
          for (c = layer1_size; c < layer1_size + layer1_image_size; c++) 
            syn0_im[c + (sen_count-1) * layer1_image_size] += neu1e[c];
        }
      }
    } else {  //train skip-gram
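      // Skip-gram: no averaging; each context word's own vector syn0[l1] is paired
      // directly with the current word. With sentence_vectors, the loop range is
      // extended so the final extra slot is redirected to position 0 (the sentence id).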
      for (a = b; a < window * 2 + 1 + sentence_vectors - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (sentence_vectors) if (a >= window * 2 + sentence_vectors - b) c = 0;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          f = 0;
          l2 = vocab[word].point[d] * layer1_size_all;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1[c + l2];
          }
          //for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }