void TokenCreator(const char *source,const char *symtab,const char *outfile,NFA *N,int nfaCount){ printf("\nFILENAME = '%s'\n",source); char *line=(char *)malloc(sizeof(char)*1000); char *debug=(char *)malloc(sizeof(char)*1000); // char debug[1000]; FILE *s=fopen(source,"r"); ENDOFFILE=0; while(1){ line[0]='\0'; readLine(s,line); // printChar(line); // printf("%s",line); // printf("\n **LINE = { %s }*** \n",line); printf("clearing tabs\n"); clearTabs(line); printf("tabs cleared\n"); sprintf(debug,"\n**********LINE = [%s]**************\n",line); printf("%s",debug); // printChar(debug); // printf("\n%s\n",line ); if(line[0]=='\0'&&ENDOFFILE==1){ break; } if(line[0]=='\0'){ continue; } lineToTokens(line,symtab,outfile,N,nfaCount); if(ENDOFFILE==1){ break; } } fclose(s); }
void CBinaryBayes::loadModel(std::string fileName) { //TODO: if(!fileExists(fileName)) reportError("Couldn't load model file."); std::ifstream input (fileName.c_str()); std::string line; //Get general data from first line getline(input, line); std::vector<std::string> splits = lineToTokens(line.c_str(),' '); this->m = lexical_cast<int,std::string>(splits[0]); this->Cpos = lexical_cast<double,std::string>(splits[1]); this->Cneg = lexical_cast<double,std::string>(splits[2]); this->prior= lexical_cast<double,std::string>(splits[3]); this->allZeroSumPos = lexical_cast<double,std::string>(splits[4]); this->allZeroSumNeg = lexical_cast<double,std::string>(splits[5]); //Input feature priors this->isKnown.clear(); this->P0N.clear(); this->P0P.clear(); this->P1N.clear(); this->P1P.clear(); this->P0N.reserve(m); this->P0P.reserve(m); this->P1N.reserve(m); this->P1P.reserve(m); this->isKnown.reserve(m); for(int i = 0; i < m; i++) { getline(input, line); std::vector<std::string> splits = lineToTokens(line.c_str(),' '); this->isKnown.push_back(lexical_cast<bool,std::string>(splits[0])); this->P0P.push_back(lexical_cast<double,std::string>(splits[1])); this->P0N.push_back(lexical_cast<double,std::string>(splits[2])); this->P1P.push_back(lexical_cast<double,std::string>(splits[3])); this->P1N.push_back(lexical_cast<double,std::string>(splits[4])); } input.close(); }
//used for cross-validation: allow to ignore a certain part of the trainfile void CBinaryBayes::trainOnline(std::string filename, int n, int numFeat, int ignore_start, int ignore_end) { this->m = numFeat; std::ifstream input(filename.c_str()); std::string line; if(!input) reportError("File not found ..."); clock_t tbegin = clock(); clock_t tprev = clock(); clock_t tend = clock(); //Prepare the vectors to avoid unnecessary space usage this->P0N.reserve(m); this->P1N.reserve(m); this->P0P.reserve(m); this->P1P.reserve(m); this->isKnown.reserve(m); //PASS 1: determine probabilities for each class int classCountPos = 0; int classCountNeg = 0; std::vector<int> oneCountPos; //How many ones in for a features given that the class value is Pos std::vector<int> oneCountNeg; //How many ones in for a features given that the class value is Neg for(int i = 0; i < m; i++) { oneCountPos .push_back(0); oneCountNeg .push_back(0); this->isKnown.push_back(false); } //PASS 1: Add all training points for(int i = 0; i<n; i++) {//i<n getline( input, line ); if(!input) break; //Stop at end of file if(i > ignore_start && i < ignore_end) continue; //Ignore part of the file std::vector<std::string> tokens = lineToTokens(line.c_str(),' '); //Update prior probability count int classval = atoi(tokens[0].c_str()); if(classval != -1 && classval != 1) reportError("Class values should be either -1 or 1."); bool hit = classval > 0; if(hit) classCountPos ++; else classCountNeg ++; int prev_item = 0; for(std::vector<std::string>::size_type j = 1; j < tokens.size(); j++) { std::string token = tokens[j]; std::vector<std::string> pair = lineToTokens(token.c_str(),':'); if(pair.size() < 2) continue; //extra space element int feat = lexical_cast<int,std::string>(pair[0]); //double val = lexical_cast<double,std::string>(pair[1]); //Not used, assumed to be equal to one if(feat > this->m) { std::stringstream s; s << "Found a feature with dimension higher than actual dataset dimension (" << m << " vs " << feat << ")."; reportError(s.str()); } if(hit) { oneCountPos[feat]++; } else { oneCountNeg[feat]++; } } #ifdef DEBUG tend = clock(); if( (tend-tprev)/ CLOCKS_PER_SEC > INTERVAL_TIME ) { int done = i; int left = 2*n-i; double timedone = double(tend-tbegin) / CLOCKS_PER_SEC; double timeleft = timedone / done * left; LOG << "Time " << done << "/" << 2*n << ":" << timeleft << "s\n"; tprev = tend; } #endif } //Intermediate: format all counts to (log)probabilities this->prior = classCountPos * 1.0 / n; double priorNeg = 1-prior; this->allZeroSumPos = 0; this->allZeroSumNeg = 0; for(int i = 0; i < this->m; i++) { int zeroCountPos = classCountPos - oneCountPos[i]; int zeroCountNeg = classCountNeg - oneCountNeg[i]; //Score class 1 double d = 1; double P0Pi = log((1.0*zeroCountPos+1) / (classCountPos+2)); double P1Pi = log((1.0*oneCountPos[i]+1) / (classCountPos+2)); this->P0P.push_back(P0Pi); this->P1P.push_back(P1Pi); //Score class 2 double P0Ni = log((1.0*zeroCountNeg+1) / (classCountNeg+1)); double P1Ni = log((1.0* oneCountNeg[i]+2) / (classCountNeg+2)); this->P0N.push_back(P0Ni); this->P1N.push_back(P1Ni); //Score class 0 (is the negative value of the other one ...) isKnown[i] = (oneCountPos[i] > 0) && (oneCountNeg[i] > 0); this->allZeroSumPos += P0Pi; this->allZeroSumNeg += P0Ni; #ifdef DEBUG //Timer update tend = clock(); if( (tend-tprev)/ CLOCKS_PER_SEC > INTERVAL_TIME ) { int done = n+i; int left = 2*n-i; float timedone = double(tend-tbegin) / CLOCKS_PER_SEC; float timeleft = timedone / done * left; LOG << "Time " << done << "/" << n << ":" << timeleft << "s\n"; tprev = tend; } #endif } this->treshold = 0; }