Esempio n. 1
0
    /**  Loads index from one or more file(s) named filename, possibly 
      adding the proper extensions.
      On success returns 0 and stores the newly allocated index in *index;
      returns non-zero if the top-level structure cannot be allocated
      (inverted-list errors are routed through IFERRORIL). */
int load_index(char *filename, void **index){

	twcsa *wcsa;
	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
	if (wcsa == NULL)
		return 1;  // out of memory: the original dereferenced this unchecked
	void *Index = (void *) wcsa;
	int error;
	wcsa->text = NULL;
	
	// Initializes the arrays used to detect if a char is valid or not.
	StartValid();
		
	/** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */
	loadVocabulary (Index, filename);
		{	
		uint totaltmp=0;  //words
		totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
		totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.	
		// "%u" matches the unsigned totaltmp (was "%d").
		fprintf(stderr,"\n\t*Loaded Vocabulary of text: %u words, %u bytes\n", wcsa->nwords, totaltmp);
		}
		
	/** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */
	loadIndexConstants(Index, filename);
	fprintf(stderr,"\t*Loaded  configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) );

	#ifdef FREQ_VECTOR_AVAILABLE
	/** 3 ** Loading freq vector */
	{uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary"
	loadFreqVector(&(wcsa->freqs), &size, (char *)filename);	
	// nwords * sizeof(uint) is a size_t: cast to ulong and print with "%lu"
	// (matches the style used for the constants above; "%d" was UB here).
	fprintf(stderr,"\t*Loaded freq vector: %lu bytes\n", (ulong) (wcsa->nwords * sizeof(uint)) );
	}		
	#endif

	/** 4 ** Loading Compressed Structure of posting lists (il) */
	error = load_il((char*) filename,&(wcsa->ils));
 	IFERRORIL(error);
 	uint sizeil;
 	error = size_il(wcsa->ils,&sizeil);
 	IFERRORIL(error);
	// sizeil is a uint: "%u" (was "%d").
	fprintf(stderr,"\n \t*loaded compressed inverted lists structure: %u bytes\n", sizeil);
	
	/** 5 ** Loading the Representation of the source text */
	load_representation( &wcsa->ct,filename); 
	{
		uint size;
		size_representation(wcsa->ct, &size);	
		fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size);
	}
	
	(*index) = Index;			
	return 0;
}
Esempio n. 2
0
/** Constructs an ARPAPlugin.
 *
 *  Configuration values read:
 *   - LOGGER (optional): logger verbosity; a missing value only logs a warning.
 *   - VOCABFILENAME (required): path to the vocabulary file.
 *   - ARPAFILENAME (required): path to the ARPA language-model file.
 *   - TIMEOUT (required): timeout, parsed with atoi().
 *
 *  After reading the configuration, the vocabulary is loaded and the ARPA
 *  table is built, so the plugin is ready to predict on return.
 *
 *  @throws PresageException when any required configuration value is missing.
 */
ARPAPlugin::ARPAPlugin(Configuration* config, ContextTracker* ct)
    : Plugin(config,
	     ct,
             "ARPAPlugin",
             "ARPAPlugin, a plugin relying on an ARPA language model",
             "ARPAPlugin, long description." )
{
    Value value;

    // Optional setting: failure to read it is not fatal.
    try {
	value = config->get(LOGGER);
	logger << setlevel(value);
	logger << INFO << "LOGGER: " << value << endl;
    } catch (const Configuration::ConfigurationException& ex) { // catch by const& — avoids a copy and potential slicing
	logger << WARN << "Caught ConfigurationException: " << ex.what() << endl;
    }

    try {
        value = config->get(VOCABFILENAME);
        logger << INFO << "VOCABFILENAME: " << value << endl;
        vocabFilename = value;

    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    try {
	value = config->get(ARPAFILENAME);
	logger << INFO << "ARPAFILENAME: " << value << endl;
	arpaFilename = value;

    } catch (const Configuration::ConfigurationException& ex) {
	logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
	throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    try {
        value = config->get(TIMEOUT);
        logger << INFO << "TIMEOUT: " << value << endl;
        timeout = atoi(value.c_str());

    } catch (const Configuration::ConfigurationException& ex) {
        logger << ERROR << "Caught fatal ConfigurationException: " << ex.what() << endl;
        throw PresageException("Unable to init " + name + " predictive plugin.");
    }

    loadVocabulary();
    createARPATable();

}
Esempio n. 3
0
/// Reads dependency-parsed sentences (one token per line, blank line between
/// sentences) from wide stdin, lower-cases them, and emits word pairs via
/// printPairs() using the vocabulary loaded from argv[1].
/// Token lines are expected as: index word ... parentIndex relation.
int main(int argc, char **argv)
{
  std::setlocale(LC_ALL, "ru_RU.UTF-8");

  // The vocabulary file path is a required argument: without this check
  // argv[1] would be a NULL dereference.
  if (argc < 2) {
    std::wcerr << L"usage: requires a vocabulary file as the first argument" << std::endl;
    return 1;
  }

  auto vocabulary = loadVocabulary(argv[1]);
  // Artificial root token placed at position 0 of every sentence.
  token_t root = std::make_tuple( 0, std::wstring(L"*root*"),
                                 -1, std::wstring(L"rroot"));
  auto sentence = sentence_t(1, root);
  std::wstring line;

  while (std::getline(std::wcin, line)) {
    line.erase(std::remove(line.begin(), line.end(), L'\n'), line.end());
    std::transform(line.begin(), line.end(), line.begin(), std::towlower);

    // Split the line into whitespace-separated tokens.
    std::wistringstream stream(line);
    std::vector<std::wstring> rawSentence;

    std::copy(std::istream_iterator<std::wstring, wchar_t>(stream),
              std::istream_iterator<std::wstring, wchar_t>(),
              std::back_inserter(rawSentence));

    if (rawSentence.empty()) {
      // A blank line terminates the current sentence.
      if (sentence.size() > 1) {
        printPairs(vocabulary, sentence);
      }

      sentence = sentence_t(1, root);
    }
    else if (rawSentence.size() >= 2) {
      // Guarding on >= 2 tokens prevents out-of-range access on malformed
      // lines (the original indexed rawSentence[1] unconditionally).
      int index = std::stoi(rawSentence[0]);
      auto tokenWord = rawSentence[1];
      int parentIndex = std::stoi(rawSentence[rawSentence.size() - 2]);
      auto relation = rawSentence[rawSentence.size() - 1];

      sentence.push_back(std::make_tuple(index, tokenWord,
                                         parentIndex, relation));
    }
    // Lines with a single token are silently skipped as malformed.
  }

  // Flush the last sentence if the input did not end with a blank line.
  if (sentence.size() > 1) {
    printPairs(vocabulary, sentence);
  }

  return 0;
}
// Computes the Bag-of-Words visual vocabulary, or loads it from disk if a
// previously saved one is available.
// Each image listed in vocabularyImgsList is preprocessed, keypoints are
// detected (optionally restricted to target-region masks), descriptors are
// extracted and accumulated in the BoW trainer, which finally clusters them
// into the output vocabulary.
// @param vocabularyOut        receives the vocabulary matrix (one row per visual word)
// @param vocabularyImgsList   path to a text file listing one image name per line
// @param outputAnalyzedImages when true, saves each image annotated with its keypoints
// @param useOnlyTargetRegions when true, keypoints are detected only inside the target masks
// @return true when a vocabulary was loaded or built; false when the image list could not be opened
bool BowVocabulary::computeVocabulary(Mat& vocabularyOut, const string& vocabularyImgsList, bool outputAnalyzedImages, bool useOnlyTargetRegions) {
	// Reuse a cached vocabulary when one exists on disk.
	if (loadVocabulary(vocabularyOut)) {
		return true;
	}	

	// Drop any descriptors left over from a previous build.
	_bowTrainer->clear();

	ifstream imgsList(vocabularyImgsList);
	if (imgsList.is_open()) {		
		// Read the whole list up front so the loop below (parallelizable via
		// the commented-out OpenMP pragma) can index files directly.
		vector<string> fileNames;
		string filename;
		while (getline(imgsList, filename)) {									
			fileNames.push_back(filename);
		}
		int numberOfFiles = fileNames.size();


		cout << "    -> Building vocabulary with " << numberOfFiles << " images..." << endl;
		PerformanceTimer performanceTimer;
		performanceTimer.start();

		// Clustering works on CV_32F data; the original descriptor type is
		// remembered so the clustered vocabulary can be converted back.
		// NOTE(review): this keeps the type of the LAST image's descriptors —
		// assumes every image yields the same descriptor type; confirm.
		int descriptorsOriginalMatrixType = CV_32FC1;

		//#pragma omp parallel for schedule(dynamic)
		for (int i = 0; i < numberOfFiles; ++i) {
			Mat imagePreprocessed;
			string imageFilename = IMGS_DIRECTORY + fileNames[i] + IMAGE_TOKEN;
			if (_imagePreprocessor->loadAndPreprocessImage(imageFilename, imagePreprocessed, CV_LOAD_IMAGE_GRAYSCALE, false)) {
				Mat outputImage;
				if (outputAnalyzedImages) {
					outputImage = imagePreprocessed.clone();
				}

				if (useOnlyTargetRegions) {
					// Detect keypoints separately inside each target mask.
					vector<Mat> masks;
					ImageUtils::retriveTargetsMasks(IMGS_DIRECTORY + fileNames[i], masks);
					for (size_t maskIndex = 0; maskIndex < masks.size(); ++maskIndex) {
						vector<KeyPoint> keypoints;
						Mat targetMask = masks[maskIndex];
						_featureDetector->detect(imagePreprocessed, keypoints, targetMask);						
						//_featureDetector->detect(imagePreprocessed, keypoints, masks[maskIndex]);

						// Regions with very few keypoints are skipped.
						if (keypoints.size() > 3) {
							Mat descriptors;
							_descriptorExtractor->compute(imagePreprocessed, keypoints, descriptors);
							descriptorsOriginalMatrixType = descriptors.type();
							descriptors.convertTo(descriptors, CV_32FC1);

							if (descriptors.rows > 0) {
								//#pragma omp critical
								_bowTrainer->add(descriptors);
							}

							if (outputAnalyzedImages) {
								cv::drawKeypoints(outputImage, keypoints, outputImage);
							}
						}
					}
				} else {
					// Whole-image detection (no masks).
					vector<KeyPoint> keypoints;
					_featureDetector->detect(imagePreprocessed, keypoints);

					if (keypoints.size() > 3) {
						Mat descriptors;
						_descriptorExtractor->compute(imagePreprocessed, keypoints, descriptors);
						descriptorsOriginalMatrixType = descriptors.type();
						descriptors.convertTo(descriptors, CV_32FC1);

						if (descriptors.rows > 0) {
							//#pragma omp critical
							_bowTrainer->add(descriptors);
						}

						if (outputAnalyzedImages) {
							cv::drawKeypoints(outputImage, keypoints, outputImage);
						}
					}					
				}
				
				if (outputAnalyzedImages) {
					stringstream imageOutputFilename;
					imageOutputFilename << VOCABULARY_BUILD_OUTPUT_DIRECTORY << fileNames[i] << FILENAME_SEPARATOR << _vocabularyFilename << IMAGE_OUTPUT_EXTENSION;
					imwrite(imageOutputFilename.str(), outputImage);
				}				
			}
		}
		// Cluster all accumulated descriptors into the visual-word vocabulary
		// and persist it for future runs.
		vocabularyOut = _bowTrainer->cluster();
		saveVocabulary(vocabularyOut);

		// Convert back to the descriptors' native type and hand the vocabulary
		// to the BoW image-descriptor extractor.
		vocabularyOut.convertTo(vocabularyOut, descriptorsOriginalMatrixType);
		_bowImgDescriptorExtractor->setVocabulary(vocabularyOut);		
		cout << "    -> Finished building vocabulary with " << vocabularyOut.rows << " word size in " << performanceTimer.getElapsedTimeFormated() << "\n" << endl;
		

		// Release the training descriptors; they are no longer needed.
		_bowTrainer->clear();
		return true;
	}
	
	return false;
}
Esempio n. 5
0
/** ***********************************************************************************
	 CONSTRUCTION OF THE INDEX, from a given text file "inbasename".
    ***********************************************************************************/
/* Builds the word index from posting lists previously written to disk by the
 * indexer: parses build_options, loads the vocabulary, the configuration
 * constants and (when FREQ_VECTOR_AVAILABLE) the frequency vector, loads the
 * compressed representation of the source text, and finally compresses the
 * raw posting lists through build_il().
 * Returns 0; *index receives the newly allocated twcsa structure. */
int build_WordIndex_from_postings (char *inbasename, char *build_options, void **index){
	twcsa *wcsa;
	wcsa = (twcsa *) malloc (sizeof (twcsa) * 1);
	*index = wcsa;
	void *Index = *index;
	wcsa->text = NULL;
	double t0, t1;
	t0 = getSYSTimeBF();  // overall start time (sys+usr), reported at the end

	/** processing the parameters of the index:: blockSize, and q-gram-len (q) */
	/* NOTE(review): every recognized option below is parsed but its assignment
	 * is commented out, so the options currently have no effect here —
	 * build_options is still forwarded to build_il() further down. */
	{
		char delimiters[] = " =;";
		int j,num_parameters;
		char ** parameters;
		
		if (build_options != NULL) {
			parse_parameters_II(build_options,&num_parameters, &parameters, delimiters);
			for (j=0; j<num_parameters;j++) {

			  if ((strcmp(parameters[j], "blocksize") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->blockSize = atoi(parameters[j+1]) * BLOCK_MULTIPLIER;	    
				j++;  // skip the option's value token
			  } 
			  else if ((strcmp(parameters[j], "qgram") == 0 ) && (j < num_parameters-1) ) {
				//wcsa->q =atoi(parameters[j+1]);	    
				j++;
			  }
			  else if ((strcmp(parameters[j], "path2repaircompressor") == 0 ) && (j < num_parameters-1) ) {
				//strcpy(path2repaircompressor,parameters[j+1]);	    
				j++;
			  }
			  
			}
			free_parameters_II(num_parameters, &parameters);
		}
		//fprintf(stderr,"\n Parameters of II-blocks:: *basename = %s, blocksize = %d, q= %d",inbasename,wcsa->blockSize,wcsa->q);
		//fprintf(stderr,"\n \t path2repaircompressor= %s\n",path2repaircompressor);
	}

	wcsa->freqs=NULL;


	/** 0 ** Initializes the arrays used to detect if a char is valid or not. **/
	StartValid();
	
		
	/** 1 ** loads the vocabulary of words. zonewords, words vector, nwords */
	t1 = getSYSTimeBF();
	
	loadVocabulary (Index, inbasename);
		{	
		uint totaltmp=0;  //words
		totaltmp += ((((wcsa->nwords+1)* (wcsa->wordsData.elemSize))+W-1) /W) * (sizeof(uint));  //the pointers
		totaltmp += wcsa->wordsData.wordsZoneMem.size * sizeof(byte); //the characters of the words.	
		fprintf(stderr,"\n\t*Loaded Vocabulary: %u words, %d bytes", wcsa->nwords, totaltmp);
		}
		fprintf(stderr,"\n\t... Done: %2.2f seconds (sys+usr t)\n", getSYSTimeBF() -t1);		

	/** 2 ** Loads some configuration constants: sourceTextSize, maxNumOccs */
	loadIndexConstants(Index, inbasename);
	fprintf(stderr,"\n\t*Loaded  configuration constants: %lu bytes\n", (ulong) (2 * sizeof(uint ) + sizeof(ulong)) );		
/*
	//shows the words parsed...  (debug dump, disabled)
	{
		int i;
		fprintf(stderr,"\n\n Despues de sorting ....");	fflush(stderr);
		unsigned char *str;
		uint len;
//		for (i = 0; i<100; i++) {
		for (i = 0; ((uint)i)<wcsa->nwords; i++) {
			if ((i<10) || (((uint)i) >wcsa->nwords-5)) {
				getWord(wcsa,i,&str,&len);				
				fprintf(stderr,"\n freq[%6d]=%6u ",i,  wcsa->freqs[i]);
				fprintf(stderr,", words[%6d] = ",i);
				printWord(str,len);
			}
		}		
	}

	t1 = getSYSTimeBF();
	fprintf(stderr,"\n %u words have been loaded", wcsa->nwords);
	fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1); 					
*/	

	#ifdef FREQ_VECTOR_AVAILABLE
	/** 3 ** Loading freq vector */
	{uint size; //the size of the vocabulary in #ofwords =>> already init in "loadvocabulary"
	loadFreqVector(&(wcsa->freqs), &size, (char *)inbasename);	
	fprintf(stderr,"\t*Loaded freq vector: %d bytes\n", wcsa->nwords * sizeof(uint) );
	}		
	#endif	
	
	/** 5 ** Loading the Representation of the source text */
	/* NOTE: step 5 runs before step 4 here; the numbering mirrors load_index. */
	load_representation( &wcsa->ct,inbasename); 
	{
		uint size;
		size_representation(wcsa->ct, &size);	
		fprintf(stderr,"\n\t*Loaded (compressed) representation of the source Text: %u bytes\n", size);
	}
		
	/** 4 ** Loading the uncompressed posting lists previously created by the indexer. */
	
		//Preparing a "list of occurrences" that will be later indexed through build_il() **
		uint *source_il, sourcelen_il;
		uint maxPost ; //just to check it loads OK ;)
		ulong source_il_ulong;

		t1 = getSYSTimeBF();
		fprintf(stderr,"\n... Loading the posting lists from disk \n"); fflush(stderr);	
		
		load_posting_lists_from_file(&maxPost, &source_il_ulong, &source_il, inbasename);

		/** FOR CIKM ILISTS_DO NOT STILL SUPPORT an ULONG HERE **/
		// Truncating the 64-bit length to uint: assumes the lists fit in
		// 32 bits of uints -- TODO confirm for very large collections.
		sourcelen_il = (uint)source_il_ulong;
		
		fprintf(stderr,"\n there are %lu uints in all the lists of occurrences: size [uint32] = %lu bytes.\n ", 
		               (ulong)sourcelen_il - wcsa->nwords -2, (ulong) sizeof(uint)*(sourcelen_il - wcsa->nwords -2));
		fprintf(stderr,"\n MAXPOST loaded = %u, source_il_len = %u \n\n",maxPost,sourcelen_il);
		fprintf(stderr,"\n NLISTS loaded = %u, MAXPOSTS_sET \n\n",source_il[0],source_il[1]);

		
		
		t1 = getSYSTimeBF();
		fprintf(stderr,"\n**... entering BUILD INVERTED LIST REPRESENTATION************!! \n\n"); fflush(stderr);	
/*
		{ char fileposts[2048];
			sprintf(fileposts,"%s.%s.%u","postinglistsXX","posts", getpid());
			FILE *ff = fopen(fileposts,"w");
			fwrite(source_il, sizeof(uint), sourcelen_il,ff);
			fclose(ff);
			
		}
*/
		//compressing the lists of occurrences and setting wcsa->ils
		int error = build_il(source_il, sourcelen_il, build_options, &(wcsa->ils));  //source_il is freed inside!.
		IFERRORIL(error);							

		fprintf(stderr,"\n... Done: %2.2f seconds (sys+usr time)\n", getSYSTimeBF() -t1);
		
	
	#ifndef FREQ_VECTOR_AVAILABLE   //<----- not needed in advance, only during construction
		free(wcsa->freqs);
	#endif

		
	// Report the final index size and the overall construction time.
	ulong sizeI;
	index_size(*index, &sizeI);
	fflush(stderr); fflush(stdout);
	fprintf(stderr,"\n ---------------------------------------------");
	fprintf(stderr,"\n The index has already been built: %lu bytes!!\n", sizeI);
	fprintf(stderr,"\n... Done: OVERALL TIME = %2.2f seconds (sys+usr time)", getSYSTimeBF() -t0);
	fprintf(stderr,"\n ---------------------------------------------\n\n\n");
	fflush(stderr);
	fflush(stdout);
	return 0;
}