/** Loads the index from one or more file(s) named filename, possibly adding the proper extensions. */
int load_index(char *filename, void **index) {
    twcsa *wcsa;
    wcsa = (twcsa *) malloc(sizeof(twcsa) * 1);
    void *Index = (void *) wcsa;
    int error;
    wcsa->text = NULL;

    // Initializes the arrays used to detect whether a char is valid or not.
    StartValid();

    /** 1 ** Loads the vocabulary of words: zonewords, words vector, nwords */
    loadVocabulary(Index, filename);
    {
        uint totaltmp = 0;  // words
        // the pointers to the words
        totaltmp += ((((wcsa->nwords + 1) * (wcsa->wordsData.elemSize)) + W - 1) / W) * (sizeof(uint));
        // the characters of the words
        totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte);
        fprintf(stderr, "\n\t*Loaded vocabulary of text: %u words, %u bytes\n", wcsa->nwords, totaltmp);
    }

    /** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */
    loadIndexConstants(Index, filename);
    fprintf(stderr, "\t*Loaded configuration constants: %lu bytes\n",
            (ulong) (2 * sizeof(uint) + sizeof(ulong)));

#ifdef FREQ_VECTOR_AVAILABLE
    /** 3 ** Loads the freq vector */
    {
        uint size;  // the size of the vocabulary in #words => already set in loadVocabulary
        loadFreqVector(&(wcsa->freqs), &size, (char *) filename);
        fprintf(stderr, "\t*Loaded freq vector: %lu bytes\n", (ulong) (wcsa->nwords * sizeof(uint)));
    }
#endif

    /** 4 ** Loads the compressed structure of posting lists (il) */
    error = load_il((char *) filename, &(wcsa->ils));
    IFERRORIL(error);
    uint sizeil;
    error = size_il(wcsa->ils, &sizeil);
    IFERRORIL(error);
    fprintf(stderr, "\n\t*Loaded compressed inverted-lists structure: %u bytes\n", sizeil);

    /** 5 ** Loads the representation of the source text */
    load_representation(&wcsa->ct, filename);
    {
        uint size;
        size_representation(wcsa->ct, &size);
        fprintf(stderr, "\n\t*Loaded (compressed) representation of the source text: %u bytes\n", size);
    }

    (*index) = Index;
    return 0;
}
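/* Hypothetical caller sketch (not in the original): exercises the loader above.
 * The header name is an assumption; index_size() is the function used by the
 * build routine at the end of this section. Returns 0 on success, as load_index does.
 */
#include <cstdio>
#include "interface.h"  /* assumed header declaring load_index() and index_size() */

int run_load_example(char *basename) {
    void *index = NULL;
    if (load_index(basename, &index) != 0) {  /* loads the "<basename>.*" index files */
        std::fprintf(stderr, "could not load index from %s\n", basename);
        return 1;
    }
    ulong bytes;
    index_size(index, &bytes);  /* total in-memory footprint of the loaded index */
    std::fprintf(stderr, "index loaded: %lu bytes\n", bytes);
    return 0;
}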
ARPAPlugin::ARPAPlugin(Configuration* config, ContextTracker* ct)
    : Plugin(config,
             ct,
             "ARPAPlugin",
             "ARPAPlugin, a plugin relying on an ARPA language model",
             "ARPAPlugin, long description.")
{
    Value value;

    try {
        value = config->get(LOGGER);
        logger << setlevel(value);
        logger << INFO << "LOGGER: " << value << endl;
    } catch (const Configuration::ConfigurationException& ex) {
        logger << WARN << "Caught ConfigurationException: " << ex.what() << endl;
    }

    try {
        value = config->get(VOCABFILENAME);
        logger << INFO << "VOCABFILENAME: " << value << endl;
        vocabFilename = value;
    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    try {
        value = config->get(ARPAFILENAME);
        logger << INFO << "ARPAFILENAME: " << value << endl;
        arpaFilename = value;
    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    try {
        value = config->get(TIMEOUT);
        logger << INFO << "TIMEOUT: " << value << endl;
        timeout = atoi(value.c_str());
    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    loadVocabulary();
    createARPATable();
}
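/* Hypothetical refactoring sketch (not in the original class): the three mandatory
 * lookups above repeat the same get/log/throw pattern, which could be factored into
 * a private helper like this one. getMandatoryValue is an invented name, and it
 * assumes Value behaves like a string, as the constructor's usage suggests.
 */
Value ARPAPlugin::getMandatoryValue(Configuration* config, const std::string& key)
{
    try {
        Value value = config->get(key);
        logger << INFO << key << ": " << value << endl;
        return value;
    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }
}

// Usage inside the constructor would then reduce to:
//   vocabFilename = getMandatoryValue(config, VOCABFILENAME);
//   arpaFilename  = getMandatoryValue(config, ARPAFILENAME);
//   timeout       = atoi(getMandatoryValue(config, TIMEOUT).c_str());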
int main(int argc, char **argv) {
    std::setlocale(LC_ALL, "ru_RU.UTF-8");
    auto vocabulary = loadVocabulary(argv[1]);

    // Sentinel token that acts as the head of every sentence.
    token_t root = std::make_tuple(0, std::wstring(L"*root*"), -1, std::wstring(L"rroot"));
    auto sentence = sentence_t(1, root);

    std::wstring line;
    while (std::getline(std::wcin, line)) {
        line.erase(std::remove(line.begin(), line.end(), L'\n'), line.end());
        std::transform(line.begin(), line.end(), line.begin(), std::towlower);

        std::wistringstream stream(line);
        std::vector<std::wstring> rawSentence;
        std::copy(std::istream_iterator<std::wstring, wchar_t>(stream),
                  std::istream_iterator<std::wstring, wchar_t>(),
                  std::back_inserter(rawSentence));

        if (rawSentence.empty()) {
            // A blank line ends the current sentence.
            if (sentence.size() > 1) {
                printPairs(vocabulary, sentence);
            }
            sentence = sentence_t(1, root);
        } else {
            int index = std::stoi(rawSentence[0]);
            auto tokenWord = rawSentence[1];
            int parentIndex = std::stoi(rawSentence[rawSentence.size() - 2]);
            auto relation = rawSentence[rawSentence.size() - 1];
            sentence.push_back(std::make_tuple(index, tokenWord, parentIndex, relation));
        }
    }
    // Flush the last sentence if the input does not end with a blank line.
    if (sentence.size() > 1) {
        printPairs(vocabulary, sentence);
    }
    return 0;
}
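/* Hedged illustration (not in the original): the loop above expects CoNLL-like
 * rows on stdin, of which only the first two and the last two whitespace-separated
 * fields are used. A made-up Russian sample sentence:
 *
 *   1 мама 2 nsubj
 *   2 мыла 0 root
 *   3 раму 2 dobj
 *   (a blank line ends the sentence and triggers printPairs)
 *
 * The first row above would yield:
 *   token_t t = std::make_tuple(1, std::wstring(L"мама"), 2, std::wstring(L"nsubj"));
 */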
bool BowVocabulary::computeVocabulary(Mat& vocabularyOut, const string& vocabularyImgsList, bool outputAnalyzedImages, bool useOnlyTargetRegions) {
    if (loadVocabulary(vocabularyOut)) {
        return true;
    }

    _bowTrainer->clear();

    ifstream imgsList(vocabularyImgsList);
    if (imgsList.is_open()) {
        vector<string> fileNames;
        string filename;
        while (getline(imgsList, filename)) {
            fileNames.push_back(filename);
        }
        int numberOfFiles = (int) fileNames.size();
        cout << " -> Building vocabulary with " << numberOfFiles << " images..." << endl;

        PerformanceTimer performanceTimer;
        performanceTimer.start();

        int descriptorsOriginalMatrixType = CV_32FC1;

        //#pragma omp parallel for schedule(dynamic)
        for (int i = 0; i < numberOfFiles; ++i) {
            Mat imagePreprocessed;
            string imageFilename = IMGS_DIRECTORY + fileNames[i] + IMAGE_TOKEN;
            if (_imagePreprocessor->loadAndPreprocessImage(imageFilename, imagePreprocessed, CV_LOAD_IMAGE_GRAYSCALE, false)) {
                Mat outputImage;
                if (outputAnalyzedImages) {
                    outputImage = imagePreprocessed.clone();
                }

                if (useOnlyTargetRegions) {
                    vector<Mat> masks;
                    ImageUtils::retriveTargetsMasks(IMGS_DIRECTORY + fileNames[i], masks);
                    for (size_t maskIndex = 0; maskIndex < masks.size(); ++maskIndex) {
                        vector<KeyPoint> keypoints;
                        Mat targetMask = masks[maskIndex];
                        _featureDetector->detect(imagePreprocessed, keypoints, targetMask);
                        //_featureDetector->detect(imagePreprocessed, keypoints, masks[maskIndex]);
                        if (keypoints.size() > 3) {
                            Mat descriptors;
                            _descriptorExtractor->compute(imagePreprocessed, keypoints, descriptors);
                            descriptorsOriginalMatrixType = descriptors.type();
                            descriptors.convertTo(descriptors, CV_32FC1);
                            if (descriptors.rows > 0) {
                                //#pragma omp critical
                                _bowTrainer->add(descriptors);
                            }
                            if (outputAnalyzedImages) {
                                cv::drawKeypoints(outputImage, keypoints, outputImage);
                            }
                        }
                    }
                } else {
                    vector<KeyPoint> keypoints;
                    _featureDetector->detect(imagePreprocessed, keypoints);
                    if (keypoints.size() > 3) {
                        Mat descriptors;
                        _descriptorExtractor->compute(imagePreprocessed, keypoints, descriptors);
                        descriptorsOriginalMatrixType = descriptors.type();
                        descriptors.convertTo(descriptors, CV_32FC1);
                        if (descriptors.rows > 0) {
                            //#pragma omp critical
                            _bowTrainer->add(descriptors);
                        }
                        if (outputAnalyzedImages) {
                            cv::drawKeypoints(outputImage, keypoints, outputImage);
                        }
                    }
                }

                if (outputAnalyzedImages) {
                    stringstream imageOutputFilename;
                    imageOutputFilename << VOCABULARY_BUILD_OUTPUT_DIRECTORY << fileNames[i] << FILENAME_SEPARATOR << _vocabularyFilename << IMAGE_OUTPUT_EXTENSION;
                    imwrite(imageOutputFilename.str(), outputImage);
                }
            }
        }

        // Cluster the accumulated descriptors into the visual vocabulary.
        vocabularyOut = _bowTrainer->cluster();
        saveVocabulary(vocabularyOut);

        // Restore the descriptor type expected by the BoW image descriptor extractor.
        vocabularyOut.convertTo(vocabularyOut, descriptorsOriginalMatrixType);
        _bowImgDescriptorExtractor->setVocabulary(vocabularyOut);

        cout << " -> Finished building vocabulary with " << vocabularyOut.rows << " words in " << performanceTimer.getElapsedTimeFormated() << "\n" << endl;

        _bowTrainer->clear();
        return true;
    }

    return false;
}
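/* For reference, a minimal standalone sketch (not the class's own method) of the
 * OpenCV BoW clustering step that computeVocabulary wraps, using the 2.x API seen
 * above. ORB is used here only as a self-contained detector/extractor choice; its
 * binary descriptors also motivate the CV_32FC1 conversion, since k-means clustering
 * needs float rows. Function name and cluster count are placeholders.
 */
#include <opencv2/opencv.hpp>
#include <vector>

cv::Mat buildToyVocabulary(const std::vector<cv::Mat>& images, int clusterCount) {
    cv::ORB orb;                                       // detector + extractor in one
    cv::BOWKMeansTrainer trainer(clusterCount);
    for (size_t i = 0; i < images.size(); ++i) {
        std::vector<cv::KeyPoint> keypoints;
        cv::Mat descriptors;
        orb(images[i], cv::Mat(), keypoints, descriptors);  // detect and compute
        descriptors.convertTo(descriptors, CV_32FC1);       // k-means needs float data
        if (descriptors.rows > 0) {
            trainer.add(descriptors);                       // pool descriptors across images
        }
    }
    return trainer.cluster();  // k-means over all pooled descriptors -> vocabulary rows
}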
/** ***********************************************************************************
     CONSTRUCTION OF THE INDEX, from a given text file "inbasename".
    *********************************************************************************** */
int build_WordIndex_from_postings(char *inbasename, char *build_options, void **index) {
    twcsa *wcsa;
    wcsa = (twcsa *) malloc(sizeof(twcsa) * 1);
    *index = wcsa;
    void *Index = *index;
    wcsa->text = NULL;

    double t0, t1;
    t0 = getSYSTimeBF();

    /** Processing the parameters of the index: blockSize and q-gram length (q) */
    {
        char delimiters[] = " =;";
        int j, num_parameters;
        char **parameters;
        if (build_options != NULL) {
            parse_parameters_II(build_options, &num_parameters, &parameters, delimiters);
            for (j = 0; j < num_parameters; j++) {
                if ((strcmp(parameters[j], "blocksize") == 0) && (j < num_parameters - 1)) {
                    //wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER;
                    j++;
                }
                else if ((strcmp(parameters[j], "qgram") == 0) && (j < num_parameters - 1)) {
                    //wcsa->q = atoi(parameters[j+1]);
                    j++;
                }
                else if ((strcmp(parameters[j], "path2repaircompressor") == 0) && (j < num_parameters - 1)) {
                    //strcpy(path2repaircompressor, parameters[j+1]);
                    j++;
                }
            }
            free_parameters_II(num_parameters, &parameters);
        }
        //fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q = %d", inbasename, wcsa->blockSize, wcsa->q);
        //fprintf(stderr,"\n \t path2repaircompressor = %s\n", path2repaircompressor);
    }

    wcsa->freqs = NULL;

    /** 0 ** Initializes the arrays used to detect whether a char is valid or not. **/
    StartValid();

    /** 1 ** Loads the vocabulary of words: zonewords, words vector, nwords */
    t1 = getSYSTimeBF();
    loadVocabulary(Index, inbasename);
    {
        uint totaltmp = 0;  // words
        // the pointers to the words
        totaltmp += ((((wcsa->nwords + 1) * (wcsa->wordsData.elemSize)) + W - 1) / W) * (sizeof(uint));
        // the characters of the words
        totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte);
        fprintf(stderr, "\n\t*Loaded vocabulary: %u words, %u bytes", wcsa->nwords, totaltmp);
    }
    fprintf(stderr, "\n\t... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() - t1);

    /** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */
    loadIndexConstants(Index, inbasename);
    fprintf(stderr, "\n\t*Loaded configuration constants: %lu bytes\n",
            (ulong) (2 * sizeof(uint) + sizeof(ulong)));

    /*
    // Shows the words parsed...
    {
        int i;
        fprintf(stderr, "\n\n After sorting ...."); fflush(stderr);
        unsigned char *str;
        uint len;
        // for (i = 0; i < 100; i++) {
        for (i = 0; ((uint) i) < wcsa->nwords; i++) {
            if ((i < 10) || (((uint) i) > wcsa->nwords - 5)) {
                getWord(wcsa, i, &str, &len);
                fprintf(stderr, "\n freq[%6d] = %6u ", i, wcsa->freqs[i]);
                fprintf(stderr, ", words[%6d] = ", i);
                printWord(str, len);
            }
        }
    }
    t1 = getSYSTimeBF();
    fprintf(stderr, "\n %u words have been loaded", wcsa->nwords);
    fprintf(stderr, "\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() - t1);
    */

#ifdef FREQ_VECTOR_AVAILABLE
    /** 3 ** Loads the freq vector */
    {
        uint size;  // the size of the vocabulary in #words => already set in loadVocabulary
        loadFreqVector(&(wcsa->freqs), &size, (char *) inbasename);
        fprintf(stderr, "\t*Loaded freq vector: %lu bytes\n", (ulong) (wcsa->nwords * sizeof(uint)));
    }
#endif

    /** 5 ** Loads the representation of the source text */
    load_representation(&wcsa->ct, inbasename);
    {
        uint size;
        size_representation(wcsa->ct, &size);
        fprintf(stderr, "\n\t*Loaded (compressed) representation of the source text: %u bytes\n", size);
    }

    /** 4 ** Loads the uncompressed posting lists previously created by the indexer. */
    // Preparing a "list of occurrences" that will later be indexed through build_il().
    uint *source_il, sourcelen_il;
    uint maxPost;  // just to check that it loads OK ;)
    ulong source_il_ulong;

    t1 = getSYSTimeBF();
    fprintf(stderr, "\n... Loading the posting lists from disk \n"); fflush(stderr);
    load_posting_lists_from_file(&maxPost, &source_il_ulong, &source_il, inbasename);

    /** FOR CIKM: ilists do not yet support an ulong here **/
    sourcelen_il = (uint) source_il_ulong;

    fprintf(stderr, "\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ",
            (ulong) sourcelen_il - wcsa->nwords - 2,
            (ulong) sizeof(uint) * (sourcelen_il - wcsa->nwords - 2));
    fprintf(stderr, "\n MAXPOST loaded = %u, source_il_len = %u \n\n", maxPost, sourcelen_il);
    fprintf(stderr, "\n NLISTS loaded = %u, MAXPOST_SET = %u \n\n", source_il[0], source_il[1]);

    t1 = getSYSTimeBF();
    fprintf(stderr, "\n**... entering BUILD INVERTED LIST REPRESENTATION ************!! \n\n"); fflush(stderr);

    /*
    {
        char fileposts[2048];
        sprintf(fileposts, "%s.%s.%u", "postinglistsXX", "posts", getpid());
        FILE *ff = fopen(fileposts, "w");
        fwrite(source_il, sizeof(uint), sourcelen_il, ff);
        fclose(ff);
    }
    */

    // Compressing the lists of occurrences and setting wcsa->ils.
    int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils));  // source_il is freed inside!
    IFERRORIL(error);
    fprintf(stderr, "\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() - t1);

#ifndef FREQ_VECTOR_AVAILABLE
    // <----- only needed during construction
    free(wcsa->freqs);
#endif

    ulong sizeI;
    index_size(*index, &sizeI);
    fflush(stderr); fflush(stdout);
    fprintf(stderr, "\n ---------------------------------------------");
    fprintf(stderr, "\n The index has been built: %lu bytes!!\n", sizeI);
    fprintf(stderr, "\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)", getSYSTimeBF() - t0);
    fprintf(stderr, "\n ---------------------------------------------\n\n\n");
    fflush(stderr); fflush(stdout);
    return 0;
}
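/* Hypothetical driver sketch (not in the original): the option names are exactly
 * those recognized by the parameter parser at the top of build_WordIndex_from_postings,
 * although the corresponding assignments are commented out there, so the values may
 * be ignored by this particular build. The basename and path are placeholders.
 */
int run_build_example(void) {
    void *idx = NULL;
    return build_WordIndex_from_postings(
            (char *) "corpus",  /* placeholder: basename of the input text/postings */
            (char *) "blocksize=16; qgram=4; path2repaircompressor=./repair",
            &idx);
}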