Ejemplo n.º 1
0
// Entry point of the embedding concatenation tool.
//
// Run without arguments, it prints the usage text and exits. Otherwise it
// copies the values of -input1, -input2 and -output into vector_file1,
// vector_file2 and output_file, stores -binary in `binary`, and then calls
// TrainModel(). Always returns 0.
int main(int argc, char **argv) {
  // Full usage text, emitted in one write when no arguments are supplied.
  static const char usage[] =
      "Concatenate the 1st-order embedding and the 2nd-order embeddings\n\n"
      "Options:\n"
      "Parameters for training:\n"
      "\t-input1 <file>\n"
      "\t\tThe 1st-order embeddings\n"
      "\t-input2 <file>\n"
      "\t\tThe 2nd-order embeddings\n"
      "\t-output <file>\n"
      "\t\tUse <file> to save the concatenated embeddings\n"
      "\t-binary <int>\n"
      "\t\tSave the learnt embeddings in binary moded; default is 0 (off)\n"
      "\nExamples:\n"
      "./concatenate -input1 vec_1st.txt -input2 vec_2nd.txt -output vec_all.txt -binary 1\n\n";
  int pos;

  if (argc == 1) {
    fputs(usage, stdout);
    return 0;
  }

  // Each option's value is the argument right after the flag.
  pos = ArgPos((char *)"-input1", argc, argv);
  if (pos > 0) {
    strcpy(vector_file1, argv[pos + 1]);
  }
  pos = ArgPos((char *)"-input2", argc, argv);
  if (pos > 0) {
    strcpy(vector_file2, argv[pos + 1]);
  }
  pos = ArgPos((char *)"-output", argc, argv);
  if (pos > 0) {
    strcpy(output_file, argv[pos + 1]);
  }
  pos = ArgPos((char *)"-binary", argc, argv);
  if (pos > 0) {
    binary = atoi(argv[pos + 1]);
  }

  TrainModel();
  return 0;
}
Ejemplo n.º 2
0
// Entry point of the word2phrase tool.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// reads -train, -debug, -output, -min-count and -threshold into the
// corresponding globals, allocates the zero-initialised vocab and vocab_hash
// tables, and calls TrainModel().
//
// Returns 0 on success, 1 if either table cannot be allocated.
int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD2PHRASE tool v0.1a\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters / phrases\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-threshold <float>\n");
    printf("\t\t The <float> value represents threshold for forming the phrases (higher means less phrases); default 100\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\nExamples:\n");
    printf("./word2phrase -train text.txt -output phrases.txt -threshold 100 -debug 2\n\n");
    return 0;
  }
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) threshold = atof(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  // Stop here rather than handing null tables to TrainModel().
  if (vocab == NULL || vocab_hash == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    return 1;
  }
  TrainModel();
  return 0;
}
Ejemplo n.º 3
0
// Entry point of the network reconstruction tool.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// reads -train and -output into train_file / output_file, -depth into
// max_depth and -threshold into max_k, allocates the zero-initialised vertex
// array (max_num_vertices entries) and calls TrainLINE().
//
// Returns 0 on success, 1 if the vertex array cannot be allocated.
int main(int argc, char **argv) {
	int i;
	if (argc == 1) {
		printf("Reconstruct the network by using a Breadth-First-Search strategy\n\n");
		printf("Options:\n");
		printf("Parameters for training:\n");
		printf("\t-train <file>\n");
		printf("\t\tReconstruct the network from <file>\n");
		printf("\t-output <file>\n");
		printf("\t\tUse <file> to save the reconstructed network\n");
		printf("\t-depth <int>\n");
		printf("\t\tThe maximum depth in the Breadth-First-Search; default is 0\n");
		printf("\t-threshold <int>\n");
		printf("\t\tFor vertex whose degree is less than <int>, we will expand its neighbors until the degree reaches <int>\n");
		printf("\nExamples:\n");
		printf("./reconstruct -train net.txt -output net_dense.txt -depth 2 -threshold 1000\n\n");
		return 0;
	}
	if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
	if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
	if ((i = ArgPos((char *)"-depth", argc, argv)) > 0) max_depth = atoi(argv[i + 1]);
	if ((i = ArgPos((char *)"-threshold", argc, argv)) > 0) max_k = atoi(argv[i + 1]);
	vertex = (struct ClassVertex *)calloc(max_num_vertices, sizeof(struct ClassVertex));
	// Stop here rather than handing a null vertex array to TrainLINE().
	if (vertex == NULL) {
		fprintf(stderr, "Memory allocation failed\n");
		return 1;
	}
	TrainLINE();
	return 0;
}
Ejemplo n.º 4
0
// Program entry point: reads the optional -delay, -numline and -length
// settings, calls clear(), then runs one matrixLine worker thread per line and
// waits for all of them.
//
// Each worker receives its zero-based index, passed by value inside the
// void* argument.
//
// Returns 0 when every requested worker was started and joined (including the
// case where nLine is not positive and there is nothing to run), and 1 when
// the handle array cannot be allocated or a worker cannot be created.
int main(int argc, char *argv[])
{
	// read arguments
	int a;
	if ((a = ArgPos((char *)"-delay", argc, argv)) > 0) delay = atoi(argv[a + 1]);
	if ((a = ArgPos((char *)"-numline", argc, argv)) > 0) nLine = atoi(argv[a + 1]);
	if ((a = ArgPos((char *)"-length", argc, argv)) > 0) lineLength = atoi(argv[a + 1]);

	clear();

	// A non-positive line count means there is nothing to start.
	const size_t wanted = nLine > 0 ? (size_t)nLine : 0;
	if (wanted == 0) return 0;

	pthread_t *pt = (pthread_t *)malloc(wanted * sizeof(pthread_t));
	if (pt == NULL) return 1;

	// Count only the threads that were really created, so that the join loop
	// never touches an uninitialised handle. The index is a size_t so the cast
	// to void* does not change width.
	size_t started = 0;
	for (; started < wanted; started++) {
		if (pthread_create(&pt[started], NULL, matrixLine, (void *)started) != 0) break;
	}
	for (size_t i = 0; i < started; i++) pthread_join(pt[i], NULL);

	free(pt);
	return started == wanted ? 0 : 1;
}
Ejemplo n.º 5
0
// Entry point of the HPLE tool.
//
// Run without arguments, it prints the usage text and exits. Otherwise it
// reads the command-line options into the global settings, builds file_path
// ("Intermediate/<data>/") and output_path ("Results/<data>/"), sets lr to
// the starting learning rate and calls TrainModel(). Always returns 0.
int main(int argc, char **argv) {
    // Full usage text, emitted in one write when no arguments are supplied.
    static const char usage[] =
        "HPLE\n\n"
        "Options:\n"
        "Parameters for training:\n"
        "\t-data <path>\n"
        "\t\tData (FIGER / BBN)\n"
        "\t-task <path>\n"
        "\t\tTask (reduce_label_noise / typing)\n"
        "\t-binary <int>\n"
        "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n"
        "\t-size <int>\n"
        "\t\tSet size of embedding; default is 100\n"
        "\t-negative <int>\n"
        "\t\tNumber of negative examples; default is 5, common values are 5 - 10 (0 = not used)\n"
        "\t-iters <int>\n"
        "\t\tSet the number of iterations as <int>\n"
        "\t-threads <int>\n"
        "\t\tUse <int> threads (default 1)\n"
        "\t-alpha <float>\n"
        "\t\tSet the value of weight decay (default 0.0001)\n"
        "\t-lr <float>\n"
        "\t\tSet the value of learning rate (default 0.025)\n";
    int pos;

    if (argc == 1) {
        fputs(usage, stdout);
        return 0;
    }

    // String-valued options.
    pos = ArgPos((char *)"-data", argc, argv);
    if (pos > 0) {
        strcpy(data, argv[pos + 1]);
    }
    pos = ArgPos((char *)"-task", argc, argv);
    if (pos > 0) {
        strcpy(task, argv[pos + 1]);
    }

    // -mode keeps only the first character of its value.
    pos = ArgPos((char *)"-mode", argc, argv);
    if (pos > 0) {
        mode = argv[pos + 1][0];
    }

    // Integer-valued options.
    pos = ArgPos((char *)"-binary", argc, argv);
    if (pos > 0) {
        binary = atoi(argv[pos + 1]);
    }
    pos = ArgPos((char *)"-size", argc, argv);
    if (pos > 0) {
        vector_size = atoi(argv[pos + 1]);
    }
    pos = ArgPos((char *)"-negative", argc, argv);
    if (pos > 0) {
        negative = atoi(argv[pos + 1]);
    }
    pos = ArgPos((char *)"-iters", argc, argv);
    if (pos > 0) {
        iters = atoi(argv[pos + 1]);
    }

    // Floating-point options.
    pos = ArgPos((char *)"-lr", argc, argv);
    if (pos > 0) {
        starting_lr = atof(argv[pos + 1]);
    }
    pos = ArgPos((char *)"-alpha", argc, argv);
    if (pos > 0) {
        alpha = atof(argv[pos + 1]);
    }

    pos = ArgPos((char *)"-threads", argc, argv);
    if (pos > 0) {
        num_threads = atoi(argv[pos + 1]);
    }

    // Input and output directories are both derived from the data set name.
    sprintf(file_path, "Intermediate/%s/", data);
    sprintf(output_path, "Results/%s/", data);

    lr = starting_lr;
    TrainModel();
    return 0;
}
Ejemplo n.º 6
0
// Entry point of the embedding normalisation tool.
//
// Run without arguments, it prints the usage text and exits. Otherwise it
// copies the -input and -output values into input_file and output_file,
// stores -binary in `binary`, and calls Normalize(). Always returns 0.
int main(int argc, char **argv) {
  // Full usage text, emitted in one write when no arguments are supplied.
  static const char usage[] =
      "Normalize vertex embeddings by setting their L2 norm as 1\n\n"
      "Options:\n"
      "Parameters for training:\n"
      "\t-input <file>\n"
      "\t\tThe original vertex embeddings\n"
      "\t-output <file>\n"
      "\t\tUse <file> to save the normalized vertex embeddings\n"
      "\t-binary <int>\n"
      "\t\tSave the learnt embeddings in binary moded; default is 0 (off)\n"
      "\nExamples:\n"
      "./normalize -input vec_wo_norm.txt -output vec_norm.txt -binary 1\n\n";
  int pos;

  if (argc == 1) {
    fputs(usage, stdout);
    return 0;
  }

  // Each option's value is the argument right after the flag.
  pos = ArgPos((char *)"-input", argc, argv);
  if (pos > 0) {
    strcpy(input_file, argv[pos + 1]);
  }
  pos = ArgPos((char *)"-output", argc, argv);
  if (pos > 0) {
    strcpy(output_file, argv[pos + 1]);
  }
  pos = ArgPos((char *)"-binary", argc, argv);
  if (pos > 0) {
    binary = atoi(argv[pos + 1]);
  }

  Normalize();
  return 0;
}
Ejemplo n.º 7
0
// Entry point of the LINE embedding trainer.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// reads the command-line options into the global settings, converts
// total_samples from millions to an absolute count, sets rho to the starting
// learning rate, allocates the zero-initialised vertex array
// (max_num_vertices entries) and calls TrainLINE().
//
// Returns 0 on success, 1 if the vertex array cannot be allocated.
int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("LINE: Large Information Network Embedding\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse network data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the learnt embeddings\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the learnt embeddings in binary moded; default is 0 (off)\n");
    printf("\t-size <int>\n");
    printf("\t\tSet dimension of vertex embeddings; default is 100\n");
    printf("\t-order <int>\n");
    printf("\t\tThe type of the model; 1 for first order, 2 for second order; default is 2\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5\n");
    printf("\t-samples <int>\n");
    printf("\t\tSet the number of training samples as <int>Million; default is 1\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 1)\n");
    printf("\t-rho <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025\n");
    printf("\nExamples:\n");
    printf("./line -train net.txt -output vec.txt -binary 1 -size 200 -order 2 -negative 5 -samples 100 -rho 0.025 -threads 20\n\n");
    return 0;
  }
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(network_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(embedding_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) is_binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) dim = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-order", argc, argv)) > 0) order = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) num_negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-samples", argc, argv)) > 0) total_samples = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-rho", argc, argv)) > 0) init_rho = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  // -samples is given in millions on the command line.
  total_samples *= 1000000;
  rho = init_rho;
  vertex = (struct ClassVertex *)calloc(max_num_vertices, sizeof(struct ClassVertex));
  // Stop here rather than handing a null vertex array to TrainLINE().
  if (vertex == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    return 1;
  }
  TrainLINE();
  return 0;
}
Ejemplo n.º 8
0
// Entry point of the RNNLM tool.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// clears model_file and test_file, reads the command-line options into the
// global settings, allocates the zero-initialised vocab and vocab_hash tables
// and calls TrainModel().
//
// -rnnlm sets both model_file and model_file_nnet (the same path with ".nnet"
// appended).
//
// Returns 0 on success, 1 if either table cannot be allocated.
int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("RNNLM based on WORD VECTOR estimation toolkit v 0.1b\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-valid <file>\n");
    printf("\t\tUse text data from <file> to perform validation and control learning rate\n");
    printf("\t-test <file>\n");
    printf("\t\tUse text data from <file> to compute logprobs with an existing model\n");
    printf("\t-rnnlm <file>\n");
    printf("\t\tUse <file> to save the resulting language model\n");
    printf("\t-hidden <int>\n");
    printf("\t\tSet size of hidden layer; default is 100\n");
    printf("\t-bptt <int>\n");
    printf("\t\tSet length of BPTT unfolding; default is 3; set to 0 to disable truncation\n");
    printf("\t-bptt-block <int>\n");
    printf("\t\tSet period of BPTT unfolding; default is 10; BPTT is performed each bptt+bptt_block steps\n");
    printf("\t-gen <int>\n");
    printf("\t\tSampling mode; number of sentences to sample, default is 0 (off); enter negative number for interactive mode\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 1)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 0\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.1\n");
    printf("\t-maxent-alpha <float>\n");
    printf("\t\tSet the starting learning rate for maxent; default is 0.1\n");
    printf("\t-reject-threshold <float>\n");
    printf("\t\tReject nnet and reload nnet from previous epoch if the relative entropy improvement on the validation set is below this threshold (default 0.997)\n");
    printf("\t-stop <float>\n");
    printf("\t\tStop training when the relative entropy improvement on the validation set is below this threshold (default 1.003); see also -retry\n");
    printf("\t-retry <int>\n");
    printf("\t\tStop training iff N retries with halving learning rate have failed (default 2)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-direct <int>\n");
    printf("\t\tSet the size of hash for maxent parameters, in millions (default 0 = maxent off)\n");
    printf("\t-direct-order <int>\n");
    printf("\t\tSet the order of n-gram features to be used in maxent (default 3)\n");
    printf("\t-beta1 <float>\n");
    printf("\t\tL2 regularisation parameter for RNNLM weights (default 1e-6)\n");
    printf("\t-beta2 <float>\n");
    printf("\t\tL2 regularisation parameter for maxent weights (default 1e-6)\n");
    printf("\t-recompute-counts <int>\n");
    printf("\t\tRecompute train words counts, useful for fine-tuning (default = 0 = use counts stored in the vocab file)\n");
    printf("\nExamples:\n");
    printf("./rnnlm -train data.txt -valid valid.txt -rnnlm result.rnnlm -debug 2 -hidden 200\n\n");
    return 0;
  }
  // Start with empty model and test file names; -rnnlm / -test fill them in.
  model_file[0] = 0;
  test_file[0] = 0;
  if ((i = ArgPos((char *)"-hidden", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-valid", argc, argv)) > 0) strcpy(valid_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-test", argc, argv)) > 0) strcpy(test_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-bptt", argc, argv)) > 0) bptt = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-bptt-block", argc, argv)) > 0) bptt_block = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-maxent-alpha", argc, argv)) > 0) maxent_alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-reject-threshold", argc, argv)) > 0) reject_threshold = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-stop", argc, argv)) > 0) stop = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-retry", argc, argv)) > 0) max_retry = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-rnnlm", argc, argv)) > 0) {
    // The network file name is the model file name plus a ".nnet" suffix.
    strcpy(model_file, argv[i + 1]);
    strcpy(model_file_nnet, argv[i + 1]);
    strcat(model_file_nnet, ".nnet");
  }
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-direct", argc, argv)) > 0) maxent_hash_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-direct-order", argc, argv)) > 0) maxent_order = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-beta1", argc, argv)) > 0) beta = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-beta2", argc, argv)) > 0) maxent_beta = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-gen", argc, argv)) > 0) gen = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-recompute-counts", argc, argv)) > 0) recompute_train_counts = atoi(argv[i + 1]);


  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  // Stop here rather than handing null tables to TrainModel().
  if (vocab == NULL || vocab_hash == NULL) {
    fprintf(stderr, "Memory allocation failed\n");
    return 1;
  }
  TrainModel();
  return 0;
}
Ejemplo n.º 9
0
int main(int argc, char* argv[])
{
    Eigen::initParallel();

    int i = 0;
    if (argc == 1)
    {
        help();
        return 0;
    }

    string input_file = "";
    string output_file = "text8-sgns.txt";
    string save_vocab_file = "";
    string read_vocab_file = "";
    string model = "sg";
    string train_method = "ns";
    int table_size = 100000000;
    int word_dim = 200;
    float init_alpha = 0.025f;
    int window = 5;
    float subsample_threshold = 0.0001;
    float min_alpha = init_alpha * 0.0001;
    bool cbow_mean = true;
    int negative = 0;
    int num_threads = 1;
    int iter = 1;
    int min_count = 5;

    if ((i = ArgPos((char *)"-size", argc, argv)) > 0)
        word_dim = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-train", argc, argv)) > 0)
        input_file = std::string(argv[i + 1]);
    if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0)
        save_vocab_file = std::string(argv[i + 1]);
    if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0)
        read_vocab_file = std::string(argv[i + 1]);
    //if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-model", argc, argv)) > 0)
        model = std::string(argv[i + 1]);
    if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0)
        init_alpha = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-output", argc, argv)) > 0)
        output_file = std::string(argv[i + 1]);
    if ((i = ArgPos((char *)"-window", argc, argv)) > 0)
        window = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-subsample", argc, argv)) > 0)
        subsample_threshold = atof(argv[i + 1]);
    if ((i = ArgPos((char *)"-train_method", argc, argv)) > 0)
        train_method = std::string(argv[i + 1]);
    if ((i = ArgPos((char *)"-negative", argc, argv)) > 0)
        negative = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-threads", argc, argv)) > 0)
        num_threads = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-iter", argc, argv)) > 0)
        iter = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0)
        min_count = atoi(argv[i + 1]);

    if(model == "")
    {
        model = "sg";
        cout << "Default use skip gram model" << endl;
    }
    if(train_method == "")
    {
        train_method = "ns";
        cout << "Default use negative sampling model" << endl;
    }

    if(train_method == "ns" && negative <= 0)
    {
        cout << "Please set -negative > 0!" << endl;
        return 1;
    }
    if(train_method == "hs" && negative > 0)
    {
        cout << "Do not set -negative under hierarchical softmax!" << endl;
        return 1;
    }
    if(train_method == "hs" && model.find("align") != string::npos)
    {
        cout << "Please use negative sampling in aligned skip gram model!" << endl;
        return 1;
    }

    if(cbow_mean)
        init_alpha = 0.05;

    Word2Vec w2v(iter, window, min_count, table_size, word_dim, negative, subsample_threshold,
                 init_alpha, min_alpha, cbow_mean, num_threads, train_method, model);

    omp_set_num_threads(num_threads);
    //vector<vector<string>> sentences = w2v.line_docs("imdb_train.txt");
    vector<vector<string>> sentences = text8_corpus();
    w2v.build_vocab(sentences);
    w2v.init_weights(w2v.vocab.size());
    if(save_vocab_file != "")
        w2v.save_vocab(save_vocab_file);

    w2v.train(sentences);

    if(output_file != "")
    {
        if(train_method == "hs" && model == "cbow")
            w2v.save_word2vec(output_file, w2v.C);
        else
            w2v.save_word2vec(output_file, w2v.W);
    }

    return 0;
}
Ejemplo n.º 10
0
// Entry point of the forward-propagation scoring tool.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// reads -layer_size, -vocab_size and -window, allocates and loads the network
// (syn0, syn1, neu1), precomputes expTable and builds the vocabulary. It then
// walks the files "sentences", "synonyms" and "indices" line by line: for each
// line, every replacement word from "synonyms" is substituted at the target
// index of the sentence and the result of ForwardPropagate() is written to
// "wordprobs" as "<word> <probability>".
//
// Returns 0 on success, 1 if any of the four files cannot be opened.
int main(int argc, char **argv) {
    int i, k = 0; //counters
    if(argc == 1) { //printing instructions
        printf("\n");
        printf("Forward propagation of sentences in a file delimited by \\n\n\n");
        printf("Parameters:\n");
        printf("\tValue for the vocabulary size that resulted from training (first number in the output file of word2vec):\n");
        printf("\t\t-vocab_size <int>\n");
        printf("\tValue for the layer size used in training (second number in the output file of word2vec):\n");
        printf("\t\t-layer_size <int>\n");
        printf("\tValue for the window size:\n");
        printf("\t\t-window <int>\n\n");
        return 0;
    } //reading command line arguments
    if ((i = ArgPos((char *)"-layer_size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-vocab_size", argc, argv)) > 0) vocab_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);

    // allocating memory to store the network elements
    syn0 = (real *)calloc(layer1_size*vocab_size,sizeof(real));
    syn1 = (real *)calloc(layer1_size*vocab_size,sizeof(real));
    neu1 = (real *)calloc(layer1_size,sizeof(real));

    index_buff = (char *)calloc(MAX_INDEX_BUFF_SIZE,sizeof(char));
    // reading the network from file
    read_syn0();
    read_syn1();

    expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); //allocating memory for expTable
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table in the same way as in word2vec
        expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
    }
    //building the vocabulary and the vocabulary hash from the files it was stored in
    BuildVocabFromFile();
    BuildVocabHashFromFile();

    int length = 0; //number of words in the current sentence
    int syno_length = 0; //how many synonyms/replacements
    long long * sen; //sentence variable where words are represented as vocabulary indices
    long long * sen_temp; //temporary sentence variable where words are represented as vocabulary indices
    sen_temp = (long long *)calloc(MAX_SENTENCE_LENGTH,sizeof(long long)); //allocating memory for sen_temp
    long long * synonym; //replacement word (in vocabulary index form)
    long double prob = 0; //probability variable
    long long ptr = 0, ptr_temp = 0; //pointer used to go through the sentences file
    long long syno_ptr = 0, syno_ptr_temp = 0; //pointer used to go through the synonyms/replacements file


    FILE *sentfile = fopen("sentences","r");
    FILE *indices = fopen("indices","r");
    FILE *synfile = fopen("synonyms","r");
    FILE *fo = fopen("wordprobs","w");
    // Every later call dereferences these handles, so bail out if any is missing.
    if (sentfile == NULL || indices == NULL || synfile == NULL || fo == NULL) {
        fprintf(stderr, "Could not open one of the files: sentences, indices, synonyms, wordprobs\n");
        if (sentfile != NULL) fclose(sentfile);
        if (indices != NULL) fclose(indices);
        if (synfile != NULL) fclose(synfile);
        if (fo != NULL) fclose(fo);
        return 1;
    }
    int lines = 0;
    char line[MAX_SENTENCE_LENGTH]; // buffer to store current sentence
    char synline[MAX_SENTENCE_LENGTH]; // buffer to store synonyms


    lines = Lines(sentfile); // how many lines in the sentences file, which is used as the outer loop delimiter
    //(this can be done) since all the files "sentences", "synonyms" and "indices" have the same number of lines delimited by "\n"
    rewind(sentfile);
    rewind(synfile);

    for(i = 0; i<lines; i++) { //outer loop iterating through "sentences", "synonyms" and "indices" line by line

        // read sentence
        ptr = ftell(sentfile); // store beginning of line
        if (readLine(sentfile,line) < 0) break;
        length = LineWordCount(line);
        //printf("sent words %d\n",length);

        // read word replacements
        syno_ptr = ftell(synfile); // store beginning of line
        if (readLine(synfile,synline) < 0) break;
        syno_length = LineWordCount(synline);
        printf("synline %s\n",synline);

        fseek(sentfile,ptr,SEEK_SET); // move the pointer back to the beginning of the line
        sen = FileToSen(length,sentfile); //sen is an array of longs with the words of the sentence in a vocabulary index format

        fseek(synfile,syno_ptr,SEEK_SET);
        synonym = FileToSen(syno_length,synfile); //synonym is an array of longs with the replacements/synonyms from the "synonyms" file in vocabulary index format

        fseek(sentfile,1,SEEK_CUR); // added to get past newline
        fseek(synfile,1,SEEK_CUR);

        ReadIndexFromFile(indices); //reads the index and puts it in the char array "index_buff"
        target_index = GetIndex(); //returns a numerical value from what is in the char array "index_buff"
        // NOTE(review): target_index is used unchecked as an index into sen_temp,
        // and the memcpy below reads MAX_SENTENCE_LENGTH entries from sen —
        // confirm that FileToSen returns a buffer of at least that size.
        for(k=0; k<syno_length; k++) { //repeats forward propagation for each synonym in the line
            memcpy(sen_temp,sen,MAX_SENTENCE_LENGTH*sizeof(long long)); //copying the sentence into sen_temp where synonyms will be changed
            sen_temp[target_index] = synonym[k]; //replacing the target word with a synonym/replacement
            prob = ForwardPropagate(length,sen_temp); //doing forward propagation to get the probability
            //prob = prob * 100000; // multiplying the probability by 100000 or taking the negative log is done in this line

            fprintf(fo,"%s %Lf\n",vocab[synonym[k]].word,prob); // write the replacement word and its probability
        }
    }

    fclose(fo);
    fclose(sentfile);
    fclose(synfile);
    fclose(indices);

    return 0;
}
Ejemplo n.º 11
0
// Entry point of a cluster node.
//
// Run without arguments, it prints the usage text and returns 0. Otherwise it
// parses the command-line options into the global configuration, validates
// them, initialises node state (join, recover or fresh start), starts the
// cluster member with its server, launches the background daemons and blocks
// until they finish.
//
// Returns 0 on normal shutdown or after printing usage, and 1 when the
// arguments are invalid (both --join and --recover given, or a required
// option missing). Validation errors are written to std::cerr.
int main(int argc, const char *argv[])
{
  if (argc == 1) // print usage instructions
  {
    std::cout << "Usage: \n"
      << "\t--datadir <dirname> REQUIRED [place to store db shards]\n"
      << "\t--nodestate <filename> REQUIRED [current node state]\n"
      << "\t--clustermembers <filename> REQUIRED [list of cluster members]\n"
      << "\t--ownershipmap <filename> REQUIRED [list of partitions owned by current node]\n"
      << "\t--partitionmap <filename> REQUIRED [mapping of partitions to nodes]\n"
      << "\t--log <filename> REQUIRED [log file for recovery]\n"
      << "\t--join OPTIONAL\n"
      << "\t--recover OPTIONAL\n"
      << "\t--debug OPTIONAL [start sending fake data to other cluster members]\n"
      << "\t--size OPTIONAL [mbytes]"
//      << "\t--name OPTIONAL [give the node a custom name that can be reached, IP address]"
      << std::endl;
    return 0;
  }

  // Reports a fatal argument error and yields the process exit status.
  // These are not errno failures, so perror() would append a misleading
  // system error string.
  const auto fail = [](const char *message) {
    std::cerr << message << std::endl;
    return 1;
  };

  std::string logfile;

  int i;
  bool join = false, recover = false, debug = false;
  if ((i = ArgPos("--datadir", argc, argv, true)) > 0)
    g_db_files_dirname = std::string(argv[i+1]);
  if ((i = ArgPos("--nodestate", argc, argv, true)) > 0)
    g_current_node_state_filename = std::string(argv[i+1]);
  if ((i = ArgPos("--clustermembers", argc, argv, true)) > 0)
    g_cluster_member_list_filename = std::string(argv[i+1]);
  if ((i = ArgPos("--ownershipmap", argc, argv, true)) > 0)
    g_owned_partition_state_filename = std::string(argv[i+1]);
  if ((i = ArgPos("--partitionmap", argc, argv, true)) > 0)
    g_cached_partition_map_filename = std::string(argv[i+1]);
  if ((i = ArgPos("--log", argc, argv, true)) > 0)
    logfile = std::string(argv[i+1]);
  if ((i = ArgPos("--join", argc, argv, false)) > 0)
    join = true;
  if ((i = ArgPos("--recover", argc, argv, false)) > 0)
    recover = true;
  if ((i = ArgPos("--debug", argc, argv, false)) > 0)
    debug = true;
  // --size is given in megabytes. The multiplication is done in 64 bits so
  // that values of 2048 MB and above do not overflow int.
  if ((i = ArgPos("--size", argc, argv, true)) > 0)
    g_local_disk_limit_in_bytes = static_cast<long long>(atoi(argv[i+1])) * 1024 * 1024;
  else
    g_local_disk_limit_in_bytes = 512LL * 1024 * 1024;

  std::cout << "DB directory : " << g_db_files_dirname << std::endl;
  std::cout << "Node state file : " << g_current_node_state_filename << std::endl;
  std::cout << "Cluster members file : " << g_cluster_member_list_filename << std::endl;
  std::cout << "Ownership map file : " << g_owned_partition_state_filename << std::endl;
  std::cout << "Partition-node map file : " << g_cached_partition_map_filename << std::endl;
  std::cout << "Log file : " << logfile << std::endl;

  if (join && recover)
    return fail("cannot join and recover");

  if (g_db_files_dirname == "")
    return fail("must specify a directory for database files");
  if (g_current_node_state_filename == "")
    return fail("must specify a filename for persisting node state");
  if (g_cluster_member_list_filename == "")
    return fail("must specify a file for list of cluster members");
  if (g_owned_partition_state_filename == "")
    return fail("must specify a file for owned partition state");
  if (g_cached_partition_map_filename == "")
    return fail("must specify a file for partition map");
  if (logfile == "")
    return fail("must specify a file for log");

  if (join)
  {
    JoinInitState();
  }
  else if (recover)
  {
    RecoverInitState(logfile);
  }
  else
  {
    InitializeState();
  }

  edisense_comms::Member member;
  Server server;
  member.start(&server);

  // The second half of the join runs only once the member has been started.
  if (join)
  {
    JoinFinishInit(&member);
  }

  std::thread async_put_thread(RetryPutDaemon, &member, 15); // 15 seconds
  std::thread rebalance_thread(LoadBalanceDaemon, &member, 60, logfile, recover); // 1 minutes
  std::thread gc_thread(GarbageCollectDaemon, 60 * 60 * 12); // 12 hrs
  std::thread db_transfer_thread(DBTransferServerDaemon);

  if (debug) // simulate data
  {
    // The 50 simulator threads are detached and never joined.
    for (int j = 1; j <= 50; j++)
    {
      std::thread simulate_put_thread(SimulatePutDaemon, &member, 1, j);
      simulate_put_thread.detach();
    }
  }

  gc_thread.join();
  rebalance_thread.join();
  db_transfer_thread.join();
  async_put_thread.join();

  member.stop();
  return 0;
}
Ejemplo n.º 12
0
// Reads the command-line options into the global training settings.
//
// Recognised flags: -dim, -train, -cbow, -hs, -negtive (the correctly spelled
// -negative is accepted as an alias), -alpha, -output, -window, -sample,
// -threads, -iter and -min-count.
//
// When cbow is non-zero, alpha defaults to 0.05; an explicit -alpha still
// overrides that default because it is read afterwards.
//
// Returns 0 when an output file was given via -output, -1 otherwise.
int get_optarg(int argc, char **argv)
{
  int i;
  // Start empty so that a missing -output can be detected at the end.
  output_file[0] = 0;
  if ((i = ArgPos((char *)"-dim", argc, argv)) > 0) dim = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-cbow", argc, argv)) > 0) cbow = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negtive", argc, argv)) > 0) negtive = atoi(argv[i + 1]);
  // Also honour the correctly spelled flag, which was silently ignored before.
  // If both spellings are present, this one wins.
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negtive = atoi(argv[i + 1]);
  if (cbow) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  return output_file[0] == 0 ? -1 : 0;
}