bool GzippedFileReader::OkIndex() { if (m_pIndex) return true; // Try to read index from disk wxString indexfile = iso2indexname(m_filename); if (indexfile.length() == 0) return false; // iso2indexname(...) will print errors if it can't apply the template if (wxFileName::FileExists(indexfile) && (m_pIndex = ReadIndexFromFile(indexfile))) { Console.WriteLn(Color_Green, L"OK: Gzip quick access index read from disk: '%s'", WX_STR(indexfile)); if (m_pIndex->span != GZFILE_SPAN_DEFAULT) { Console.Warning(L"Note: This index has %1.1f MB intervals, while the current default for new indexes is %1.1f MB.", (float)m_pIndex->span / 1024 / 1024, (float)GZFILE_SPAN_DEFAULT / 1024 / 1024); Console.Warning(L"It will work fine, but if you want to generate a new index with default intervals, delete this index file."); Console.Warning(L"(smaller intervals mean bigger index file and quicker but more frequent decompressions)"); } InitZstates(); return true; } // No valid index file. Generate an index Console.Warning(L"This may take a while (but only once). Scanning compressed file to generate a quick access index..."); Access *index; FILE* infile = PX_fopen_rb(m_filename); int len = build_index(infile, GZFILE_SPAN_DEFAULT, &index); printf("\n"); // build_index prints progress without \n's fclose(infile); if (len >= 0) { m_pIndex = index; WriteIndexToFile((Access*)m_pIndex, indexfile); } else { Console.Error(L"ERROR (%d): index could not be generated for file '%s'", len, WX_STR(m_filename)); free_index(index); InitZstates(); return false; } InitZstates(); return true; }
int main(int argc, char **argv) { int i,j,k = 0;//counters if(argc == 1) { //printing instructions printf("\n"); printf("Forward propagation of sentences in a file delimited by \\n\n\n"); printf("Parameters:\n"); printf("\tValue for the vocabulary size that resulted from training (first number in the output file of word2vec):\n"); printf("\t\t-vocab_size <int>\n"); printf("\tValue for the layer size used in training (second number in the output file of word2vec):\n"); printf("\t\t-layer_size <int>\n"); printf("\tValue for the window size:\n"); printf("\t\t-window <int>\n\n"); return 0; } //reading command line arguments if ((i = ArgPos((char *)"-layer_size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-vocab_size", argc, argv)) > 0) vocab_size = atoi(argv[i + 1]); if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]); // allocating memory to store the network elements syn0 = (real *)calloc(layer1_size*vocab_size,sizeof(real)); syn1 = (real *)calloc(layer1_size*vocab_size,sizeof(real)); neu1 = (real *)calloc(layer1_size,sizeof(real)); index_buff = (char *)calloc(MAX_INDEX_BUFF_SIZE,sizeof(char)); // reading the network from file read_syn0(); read_syn1(); expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real)); //allocating memory for expTable for (i = 0; i < EXP_TABLE_SIZE; i++) { expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table in the same way as in word2vec expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1) } //building the vocabulary and the vocabulary hash from the files it was stored in BuildVocabFromFile(); BuildVocabHashFromFile(); int length = 0; //word lenght of sentence variable int syno_length = 0; //how many synonyms/replacements long long * sen; //sentence variable where words are represented as vocabualry indices long long * sen_temp; //temporary sentence variable where words are represented as vocabulary indices sen_temp = (long long *)calloc(MAX_SENTENCE_LENGTH,sizeof(long long)); //allocating memory for sen_temp long long * synonym; //replacement word (in vocabulary index form) long double prob = 0; //probability variable long long ptr = 0, ptr_temp = 0; //pointer used to go through the sentences file long long syno_ptr = 0, syno_ptr_temp = 0; //pointer used to go through the synonyms/replacements file FILE *sentfile = fopen("sentences","r"); FILE *indices = fopen("indices","r"); FILE *synfile = fopen("synonyms","r"); FILE *fo = fopen("wordprobs","w"); int lines = 0; char line[MAX_SENTENCE_LENGTH]; // buffer to store current sentence char synline[MAX_SENTENCE_LENGTH]; // buffer to store synonyms lines = Lines(sentfile); // how many lines in the sentences file, which is used as the outer loop delimiter //(this can be done) since all the files "sentences", "synonyms" and "indices" have the same number of lines delimited by "\n" rewind(sentfile); rewind(synfile); for(i = 0; i<lines; i++) { //outer loop iterating through "sentences", "synonyms" and "indices" line by line // read sentence ptr = ftell(sentfile); // store beginning of line if (readLine(sentfile,line) < 0) break; length = LineWordCount(line); //printf("sent words %d\n",length); // read word replacements syno_ptr = ftell(synfile); // store beginning of line if (readLine(synfile,synline) < 0) break; syno_length = LineWordCount(synline); printf("synline %s\n",synline); fseek(sentfile,ptr,SEEK_SET); // move the pointer back to the beginning of the line sen = FileToSen(length,sentfile); //sen is an array of longs with the words of the sentence in a vocabulary index format fseek(synfile,syno_ptr,SEEK_SET); synonym = FileToSen(syno_length,synfile); //synonym is an array of longs with the replacements/synonyms from the "synonyms" file in vocabulary index format fseek(sentfile,1,SEEK_CUR); // added to get past newline fseek(synfile,1,SEEK_CUR); ReadIndexFromFile(indices); //reads the index and puts it in the char array "index_buff" target_index = GetIndex(); //returns a numerical value from what is in the char array "index_buff" for(k=0; k<syno_length; k++) { //repeats forward propagation for each synonym in the line memcpy(sen_temp,sen,MAX_SENTENCE_LENGTH*sizeof(long long)); //copying the sentence into sen_temp where synonyms will be changed sen_temp[target_index] = synonym[k]; //replacing the target word with a synonym/replacement prob = ForwardPropagate(length,sen_temp); //doing forward propagation to get the probability //prob = prob * 100000; // multiplying the probabilty by 100000 or taking the negative log is done in this line fprintf(fo,"%s %Lf\n",vocab[synonym[k]].word,prob); // SEA the replacement word and its probability } } fclose(fo); fclose(sentfile); fclose(synfile); fclose(indices); return 0; }