Пример #1
0
/*
 * Reads the file `source` line by line and hands every non-empty line to
 * lineToTokens() together with `symtab`, `outfile`, `N` and `nfaCount`.
 *
 * Each line is fetched with readLine() and passed through clearTabs()
 * before it is tokenized. The loop stops once the global ENDOFFILE flag is 1
 * (presumably set by readLine() when the input is exhausted - confirm against
 * its definition); a final non-empty line is still tokenized before stopping.
 *
 * Progress/debug text is printed to stdout. If the source file cannot be
 * opened or the line buffer cannot be allocated, a message is written to
 * stderr and the function returns without tokenizing anything.
 */
void TokenCreator(const char *source,const char *symtab,const char *outfile,NFA *N,int nfaCount){
	printf("\nFILENAME = '%s'\n",source);

	FILE *s=fopen(source,"r");
	if(s==NULL){
		/* Without this check a missing file would pass NULL to readLine/fclose. */
		fprintf(stderr,"TokenCreator: cannot open '%s'\n",source);
		return;
	}

	char *line=(char *)malloc(sizeof(char)*1000);
	if(line==NULL){
		fprintf(stderr,"TokenCreator: out of memory\n");
		fclose(s);
		return;
	}

	ENDOFFILE=0;
	while(1){
		line[0]='\0';
		readLine(s,line);

		printf("clearing tabs\n");
		clearTabs(line);
		printf("tabs cleared\n");

		/* Printed directly: formatting the decorated line into a second
		 * 1000-byte buffer first could overflow it for long lines. */
		printf("\n**********LINE = [%s]**************\n",line);

		if(line[0]=='\0'){
			/* Empty line: stop at end of input, otherwise skip it. */
			if(ENDOFFILE==1){
				break;
			}
			continue;
		}

		lineToTokens(line,symtab,outfile,N,nfaCount);
		if(ENDOFFILE==1){
			break;
		}
	}

	free(line);
	fclose(s);
}
Пример #2
0
/**
 * Loads model parameters from a text file.
 *
 * Layout read by this function (tokens are split on ' '):
 *   line 1       : m Cpos Cneg prior allZeroSumPos allZeroSumNeg
 *   next m lines : isKnown P0P P0N P1P P1N   (one line per feature)
 *
 * Problems (missing file, truncated file, too few tokens on a line, negative
 * feature count) are passed to reportError(). In case reportError() returns
 * instead of aborting, loading stops at that point so no out-of-range token
 * is ever read; the model may then be only partially updated.
 *
 * @param fileName path of the model file to read
 */
void CBinaryBayes::loadModel(std::string fileName) {
	if(!fileExists(fileName)) {
		reportError("Couldn't load model file.");
		return;
	}

	// The stream is closed automatically when `input` goes out of scope,
	// which also covers the early-return error paths below.
	std::ifstream input (fileName.c_str());
	std::string line;

	//Get general data from first line
	if(!getline(input, line)) {
		reportError("Model file is empty or unreadable.");
		return;
	}
	std::vector<std::string> header = lineToTokens(line.c_str(),' ');
	if(header.size() < 6) {
		reportError("Model file header has too few fields.");
		return;
	}
	this->m    = lexical_cast<int,std::string>(header[0]);

	this->Cpos = lexical_cast<double,std::string>(header[1]);
	this->Cneg = lexical_cast<double,std::string>(header[2]);
	this->prior= lexical_cast<double,std::string>(header[3]);
	this->allZeroSumPos = lexical_cast<double,std::string>(header[4]);
	this->allZeroSumNeg = lexical_cast<double,std::string>(header[5]);

	//Input feature priors: drop whatever a previous model left behind
	this->isKnown.clear();
	this->P0N.clear();
	this->P0P.clear();
	this->P1N.clear();
	this->P1P.clear();

	if(m < 0) {
		// A negative count would be converted to a huge size in reserve().
		reportError("Model file declares a negative feature count.");
		return;
	}

	this->P0N.reserve(m);
	this->P0P.reserve(m);
	this->P1N.reserve(m);
	this->P1P.reserve(m);
	this->isKnown.reserve(m);

	for(int i = 0; i < m; i++) {
		if(!getline(input, line)) {
			reportError("Model file ended before all features were read.");
			return;
		}
		std::vector<std::string> fields = lineToTokens(line.c_str(),' ');
		if(fields.size() < 5) {
			reportError("Model file feature line has too few fields.");
			return;
		}
		this->isKnown.push_back(lexical_cast<bool,std::string>(fields[0]));
		this->P0P.push_back(lexical_cast<double,std::string>(fields[1]));
		this->P0N.push_back(lexical_cast<double,std::string>(fields[2]));
		this->P1P.push_back(lexical_cast<double,std::string>(fields[3]));
		this->P1N.push_back(lexical_cast<double,std::string>(fields[4]));
	}
}
Пример #3
0
/**
 * Trains the model from a text file, optionally skipping a range of lines
 * (used for cross-validation: the skipped range is the held-out fold).
 *
 * Expected line format (tokens split on ' ', pairs split on ':'):
 *   <class> <feature>:<value> <feature>:<value> ...
 * where <class> is -1 or 1. The <value> part is not read: a feature that is
 * listed on a line counts as "one" for that sample, every other feature as
 * "zero". Feature indices are used directly as 0-based positions and must lie
 * in [0, numFeat); anything else is reported and skipped.
 *
 * Results stored in the object: m, prior, the per-feature log-probabilities
 * P0P/P1P/P0N/P1N (add-one smoothed), isKnown, allZeroSumPos/allZeroSumNeg
 * and treshold (reset to 0). Vectors from an earlier call are discarded first.
 *
 * @param filename     training file
 * @param n            maximum number of lines to read from the file
 * @param numFeat      number of features (becomes m)
 * @param ignore_start lines with 0-based index i are skipped when
 * @param ignore_end   ignore_start < i < ignore_end (both bounds exclusive)
 */
void CBinaryBayes::trainOnline(std::string filename, int n, int numFeat, int ignore_start, int ignore_end) {
	this->m = numFeat;
	std::ifstream input(filename.c_str());
	std::string line;

	if(!input)
		reportError("File not found ...");

	#ifdef DEBUG
	// Timers are only needed for the progress log below.
	clock_t tbegin = clock();
	clock_t tprev  = clock();
	clock_t tend   = clock();
	const int totalWork = n + m;	//pass 1 walks n lines, pass 2 walks m features
	#endif

	//Start from empty vectors so repeated training (e.g. one call per
	//cross-validation fold) does not keep results of the previous call
	this->P0N.clear();
	this->P1N.clear();
	this->P0P.clear();
	this->P1P.clear();
	this->isKnown.clear();

	//Prepare the vectors to avoid unnecessary space usage
	this->P0N.reserve(m);
	this->P1N.reserve(m);
	this->P0P.reserve(m);
	this->P1P.reserve(m);
	this->isKnown.reserve(m);

	//PASS 1: determine probabilities for each class
	int classCountPos = 0;
	int classCountNeg = 0;

	const std::vector<int>::size_type featCount = m > 0 ? static_cast<std::vector<int>::size_type>(m) : 0;
	std::vector<int> oneCountPos(featCount, 0);	//How many ones for a feature given that the class value is Pos
	std::vector<int> oneCountNeg(featCount, 0);	//How many ones for a feature given that the class value is Neg

	for(int i = 0; i < m; i++) {
		this->isKnown.push_back(false);
	}

	//PASS 1: Add all training points
	for(int i = 0; i<n; i++) {
		getline( input, line );
		if(!input) break;														//Stop at end of file
		if(i > ignore_start && i < ignore_end) continue;						//Ignore part of the file

		std::vector<std::string> tokens = lineToTokens(line.c_str(),' ');
		if(tokens.empty()) continue;											//Nothing to read on this line

		//Update prior probability count
		int classval	= atoi(tokens[0].c_str());

		if(classval != -1 && classval != 1)
			reportError("Class values should be either -1 or 1.");

		bool hit		= classval > 0;
		if(hit) classCountPos ++;
		else    classCountNeg ++;

		//All features of this sample are counted for the sample's class
		std::vector<int>& oneCount = hit ? oneCountPos : oneCountNeg;

		for(std::vector<std::string>::size_type j = 1; j < tokens.size(); j++) {
			std::vector<std::string> keyValue = lineToTokens(tokens[j].c_str(),':');
			if(keyValue.size() < 2) continue; //extra space element
			int feat   = lexical_cast<int,std::string>(keyValue[0]);
			//The value part (keyValue[1]) is not used, assumed to be equal to one

			//The count vectors hold exactly m entries, so index m itself (and
			//any negative index) is already out of range
			if(feat < 0 || feat >= this->m) {
				std::stringstream s;
				s << "Found a feature index outside the dataset dimension (" << m << " vs " << feat << ").";
				reportError(s.str());
				continue;	//never index the vectors with an invalid feature
			}

			oneCount[feat]++;
		}
		#ifdef DEBUG
		tend   = clock();
		if( (tend-tprev)/ CLOCKS_PER_SEC  > INTERVAL_TIME ) {
			int done = i+1;
			int left = totalWork-done;
			double timedone = double(tend-tbegin) / CLOCKS_PER_SEC;
			double timeleft = timedone / done * left;
			LOG << "Time " << done << "/" << totalWork << ":" << timeleft << "s\n";
			tprev = tend;
		}
		#endif
	}

	//Intermediate: format all counts to (log)probabilities
	//The prior is taken over the samples actually trained on; n would also
	//count ignored lines and lines missing from a short file
	const int trainedCount = classCountPos + classCountNeg;
	this->prior			= trainedCount > 0 ? classCountPos * 1.0 / trainedCount : 0.0;

	this->allZeroSumPos = 0;
	this->allZeroSumNeg = 0;
	for(int i = 0; i < this->m; i++) {
		int zeroCountPos = classCountPos - oneCountPos[i];
		int zeroCountNeg = classCountNeg - oneCountNeg[i];

		//Positive class: add-one smoothing, so P0 + P1 = 1
		double P0Pi = log((1.0*zeroCountPos+1) / (classCountPos+2));
		double P1Pi = log((1.0*oneCountPos[i]+1) / (classCountPos+2));
		this->P0P.push_back(P0Pi);
		this->P1P.push_back(P1Pi);

		//Negative class: same smoothing as the positive class
		double P0Ni = log((1.0*zeroCountNeg+1) / (classCountNeg+2));
		double P1Ni = log((1.0* oneCountNeg[i]+1) / (classCountNeg+2));
		this->P0N.push_back(P0Ni);
		this->P1N.push_back(P1Ni);

		//A feature is "known" only if it was seen as one in both classes
		isKnown[i] = (oneCountPos[i] > 0) && (oneCountNeg[i] > 0);

		this->allZeroSumPos += P0Pi;
		this->allZeroSumNeg += P0Ni;

		#ifdef DEBUG
		//Timer update
		tend   = clock();
		if( (tend-tprev)/ CLOCKS_PER_SEC  > INTERVAL_TIME ) {
			int done = n+i+1;
			int left = totalWork-done;
			double timedone = double(tend-tbegin) / CLOCKS_PER_SEC;
			double timeleft = timedone / done * left;
			LOG << "Time " << done << "/" << totalWork << ":" << timeleft << "s\n";
			tprev = tend;
		}
		#endif
	}

	this->treshold = 0;
}