/* Searches for the best binary split on a discrete attribute.

   Two strategies are used, depending on what the measure needs:
   - TMeasureAttribute::Generator: the measure itself proposes the best
     binarization of each candidate attribute (measure->bestBinarization);
   - otherwise: for each candidate attribute, every partition of its
     non-empty values into two groups is scored on a contingency matrix.
   Ties in quality are broken with rgen.randbool, a generator seeded with the
   number of examples.

   Outputs (by reference):
     descriptions   - two strings describing the left and the right branch
     subsetSizes    - sizes of the two branches
     quality        - quality of the chosen split
     spentAttribute - index of the attribute if the split uses it up
                      (each branch holds exactly one value), -1 otherwise

   Returns a TClassifierFromVarFD that maps examples to branches, or the
   result of returnNothing(...) when no acceptable split exists.
   Raises an error (raiseError) when a candidate attribute has more than
   16 values in the exhaustive branch. */
PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier
                            )
{
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  PIntList bestMapping;
  int wins, bestAttr;
  PVariable bvar;

  if (measure->needs==TMeasureAttribute::Generator) {
    bool cse = candidates.size()==0;
    bool haveCandidates = false;
    vector<bool> myCandidates;
    myCandidates.reserve(gen->domain->attributes->size());
    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    TVarList::const_iterator vi, ve(gen->domain->attributes->end());
    for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
      // An empty candidate list means "every attribute is a candidate", as in
      // the exhaustive branch below; otherwise the list is walked in step
      // with the attributes and attributes beyond its end are not candidates.
      bool listed = cse;
      if (!cse && (ci != ce))
        listed = *(ci++);
      const bool co = listed && ((*vi)->varType == TValue::INTVAR);
      myCandidates.push_back(co);
      haveCandidates = haveCandidates || co;
    }
    if (!haveCandidates)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    PDistribution thisSubsets;
    float thisQuality;
    wins = 0;
    int thisAttr = 0;

    int N = gen->numberOfExamples();
    TSimpleRandomGenerator rgen(N);

    ci = myCandidates.begin();
    for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
      if (*ci) {
        thisSubsets = NULL;
        PIntList thisMapping = measure->bestBinarization(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
        // A strictly better (or the first) split resets the tie counter to 1;
        // an equally good one replaces the current best only if randbool says so.
        if (thisMapping
            && (   ((!wins || (thisQuality>quality)) && ((wins=1)==1))
                || ((thisQuality==quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = thisQuality;
          subsetSizes = thisSubsets;
          bestMapping = thisMapping;
        }
      }
    }

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (quality<worstAcceptable)
      return returnNothing(descriptions, subsetSizes, spentAttribute);

    if (subsetSizes && subsetSizes->variable)
      bvar = subsetSizes->variable;
    else {
      TEnumVariable *evar = mlnew TEnumVariable("");
      evar->addValue("0");
      evar->addValue("1");
      bvar = evar;
    }
  }

  else {
    bool cse = candidates.size()==0;
    if (!cse && noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (!dcont || dcont->classIsOuter) {
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
    }

    int N = gen ? gen->numberOfExamples() : -1;
    if (N<0)
      N = dcont->classes->cases;
    TSimpleRandomGenerator rgen(N);

    PDistribution classDistribution = dcont->classes;

    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());

    TDiscDistribution *dis0, *dis1;
    TContDistribution *con0, *con1;

    int thisAttr = 0;
    bestAttr = -1;
    wins = 0;
    quality = 0.0;
    float leftExamples, rightExamples;

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) {

      // We consider the attribute only if it is a candidate, discrete and has at least two values
      if ((cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && ((*dci)->discrete->size()>=2)) {

        const TDistributionVector &distr = *(*dci)->discrete;

        if (distr.size()>16)
          raiseError("'%s' has more than 16 values, cannot exhaustively binarize", gen->domain->attributes->at(thisAttr)->get_name().c_str());

        // If the attribute is binary, we check subsetSizes and assess the quality if they are OK
        if (distr.size()==2) {
          if ((distr.front()->abs<minSubset) || (distr.back()->abs<minSubset))
            continue; // next attribute
          else {
            float thisMeas = measure->call(thisAttr, dcont, apriorClass);
            if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
                || ((thisMeas==quality) && rgen.randbool(++wins))) {
              bestAttr = thisAttr;
              quality = thisMeas;
              leftExamples = distr.front()->abs;
              rightExamples = distr.back()->abs;
              bestMapping = mlnew TIntList(2, 0);
              bestMapping->at(1) = 1;
            }
            continue;
          }
        }

        // Indices of the values that actually occur in the data
        vector<int> valueIndices;
        int ind = 0;
        for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi!=dve); dvi++, ind++)
          if ((*dvi)->abs>0)
            valueIndices.push_back(ind);

        if (valueIndices.size()<2)
          continue;

        PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);

        // A real job: go through all splits
        int binWins = 0;
        float binQuality = -1.0;
        float binLeftExamples = -1.0, binRightExamples = -1.0;
        // Selection: each element corresponds to a value of the original attribute and is 1 if the value goes right.
        // The first value always goes left (and has no corresponding bit in selection).
        TBoolCount selection(valueIndices.size()-1), bestSelection(0);

        // First for discrete classes
        if (dis0) {
          do {
            *dis0 = CAST_TO_DISCDISTRIBUTION(distr[valueIndices[0]]);
            *dis1 *= 0;
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
              *(*bi ? dis1 : dis0) += distr[*ii];
            cont->outerDistribution->setint(0, dis0->abs);
            cont->outerDistribution->setint(1, dis1->abs);

            if ((dis0->abs < minSubset) || (dis1->abs < minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas>binQuality)) && ((binWins=1)==1))
                || ((thisMeas==binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection;
              binQuality = thisMeas;
              binLeftExamples = dis0->abs;
              binRightExamples = dis1->abs;
            }
          } while (selection.next());
        }

        // And then exactly the same for continuous classes
        else {
          do {
            // The first non-empty value always goes left; the remaining ones
            // follow the selection bits, exactly as in the discrete case and
            // as assumed when bestMapping is built below.
            *con0 = CAST_TO_CONTDISTRIBUTION(distr[valueIndices[0]]);
            *con1 = TContDistribution();
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
              *(*bi ? con1 : con0) += distr[*ii];

            if ((con0->abs<minSubset) || (con1->abs<minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas>binQuality)) && ((binWins=1)==1))
                || ((thisMeas==binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection;
              binQuality = thisMeas;
              binLeftExamples = con0->abs;
              binRightExamples = con1->abs;
            }
          } while (selection.next());
        }

        if (   binWins
            && (   ((!wins || (binQuality>quality)) && ((wins=1)==1))
                || ((binQuality==quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = binQuality;
          leftExamples = binLeftExamples;
          rightExamples = binRightExamples;
          // Values that never occur keep -1; the first occurring value goes
          // left (0), the others go where the winning selection put them.
          bestMapping = mlnew TIntList(distr.size(), -1);
          vector<int>::const_iterator ii = valueIndices.begin();
          bestMapping->at(*(ii++)) = 0;
          ITERATE(TBoolCount, bi, bestSelection)
            bestMapping->at(*(ii++)) = *bi ? 1 : 0;
        }
      }
    }

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    subsetSizes = mlnew TDiscDistribution();
    subsetSizes->addint(0, leftExamples);
    subsetSizes->addint(1, rightExamples);
  }

  PVariable attribute = gen->domain->attributes->at(bestAttr);

  // A two-valued attribute is used directly, without a value mapping
  if (attribute->noOfValues() == 2) {
    spentAttribute = bestAttr;
    descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
    TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
    cfv->transformUnknowns = false;
    return cfv;
  }

  // Build the comma-separated lists of value names for each branch.
  // This must run exactly once: the mapping and the attribute's values are
  // walked in parallel, starting from the attribute's first value.
  string s0, s1;
  int ns0 = 0, ns1 = 0;
  TValue ev;
  attribute->firstValue(ev);
  PITERATE(TIntList, mi, bestMapping) {
    string str;
    attribute->val2str(ev, str);
    if (*mi==1) {
      s1 += string(ns1 ? ", " : "") + str;
      ns1++;
    }
    else if (*mi==0) {
      s0 += string(ns0 ? ", " : "") + str;
      ns0++;
    }

    attribute->nextValue(ev);
  }

  descriptions = mlnew TStringList();
  descriptions->push_back(ns0>1 ? "in ["+s0+"]" : s0);
  descriptions->push_back(ns1>1 ? "in ["+s1+"]" : s1);

  bvar->set_name(gen->domain->attributes->at(bestAttr)->get_name());

  // The attribute is spent only when each branch corresponds to a single value
  spentAttribute = (ns0==1) && (ns1==1) ? bestAttr : -1;

  TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(bvar, gen->domain, bestAttr, subsetSizes, mlnew TMapIntValue(bestMapping));
  cfv->transformUnknowns = false;
  return cfv;
}
/* Picks the discrete attribute with the highest measure value and splits on
   it, one branch per attribute value.

   How attributes are scored depends on measure->needs:
   - Contingency_Class:  a domain contingency restricted to the candidate
                         discrete attributes;
   - DomainContingency:  a full domain contingency;
   - anything else:      the measure is called on the example generator.
   In every case an attribute is considered only when checkDistribution
   accepts its value distribution for the given minSubset. Equal scores are
   resolved through rgen.randbool.

   Outputs (by reference): descriptions (value names, or empty strings when
   the attribute is not a TEnumVariable), subsetSizes, quality and
   spentAttribute (always the chosen attribute).

   Returns a TClassifierFromVarFD for the chosen attribute, or the result of
   returnNothing(...) when there is no candidate, no winner, or the best
   quality is below worstAcceptable. */
PClassifier TTreeSplitConstructor_Attribute::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  // An empty candidate list stands for "all attributes"
  const bool everyAttribute = candidates.empty();
  vector<bool>::const_iterator candIt(candidates.begin()), candEnd(candidates.end());
  if (!everyAttribute && noCandidates(candidates))
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

  int nExamples = -1;
  if (gen)
    nExamples = gen->numberOfExamples();
  if (nExamples < 0)
    nExamples = dcont->classes->cases;
  TSimpleRandomGenerator rgen(nExamples);

  int attrIndex = 0;
  int bestAttr = -1;
  int wins = 0;
  quality = 0.0;

  if (measure->needs == TMeasureAttribute::Contingency_Class) {
    // Mask of attributes that are both requested and discrete
    vector<bool> usable;
    if (everyAttribute) {
      usable.reserve(gen->domain->attributes->size());
      PITERATE(TVarList, ai, gen->domain->attributes)
        usable.push_back((*ai)->varType == TValue::INTVAR);
    }
    else {
      usable.reserve(candidates.size());
      TVarList::const_iterator ai(gen->domain->attributes->begin());
      while (candIt != candEnd) {
        usable.push_back(*candIt && ((*ai)->varType == TValue::INTVAR));
        candIt++;
        ai++;
      }
    }

    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, usable));

    candIt = usable.begin();
    candEnd = usable.end();
    TDomainContingency::iterator contIt(dcont->begin()), contEnd(dcont->end());
    for(; (candIt != candEnd) && (contIt != contEnd); contIt++, candIt++, attrIndex++) {
      if (!*candIt)
        continue;
      if (!checkDistribution((const TDiscDistribution &)((*contIt)->outerDistribution.getReference()), minSubset))
        continue;

      float score = measure->call(attrIndex, dcont, apriorClass);

      // First or strictly better score wins outright; a tie is decided randomly
      bool takeIt;
      if (!wins || (score > quality)) {
        wins = 1;
        takeIt = true;
      }
      else if (score == quality)
        takeIt = rgen.randbool(++wins);
      else
        takeIt = false;

      if (takeIt) {
        quality = score;
        subsetSizes = (*contIt)->outerDistribution;
        bestAttr = attrIndex;
      }
    }
  }

  else if (measure->needs == TMeasureAttribute::DomainContingency) {
    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));

    TDomainContingency::iterator contIt(dcont->begin()), contEnd(dcont->end());
    for(; (everyAttribute || (candIt != candEnd)) && (contIt != contEnd); contIt++, attrIndex++) {
      // The candidate iterator advances only when a list was given
      const bool requested = everyAttribute || *(candIt++);
      if (!requested)
        continue;
      if ((*contIt)->outerVariable->varType != TValue::INTVAR)
        continue;
      if (!checkDistribution((const TDiscDistribution &)((*contIt)->outerDistribution.getReference()), minSubset))
        continue;

      float score = measure->call(attrIndex, dcont, apriorClass);

      bool takeIt;
      if (!wins || (score > quality)) {
        wins = 1;
        takeIt = true;
      }
      else if (score == quality)
        takeIt = rgen.randbool(++wins);
      else
        takeIt = false;

      if (takeIt) {
        quality = score;
        subsetSizes = (*contIt)->outerDistribution;
        bestAttr = attrIndex;
      }
    }
  }

  else {
    TDomainDistributions ddist(gen, weightID);
    // The last distribution is left out of the scan
    TDomainDistributions::iterator distIt(ddist.begin()), distEnd(ddist.end()-1);
    for(; (everyAttribute || (candIt != candEnd)) && (distIt != distEnd); distIt++, attrIndex++) {
      const bool requested = everyAttribute || *(candIt++);
      if (!requested)
        continue;

      TDiscDistribution *discdist = (*distIt).AS(TDiscDistribution);
      if (!discdist || !checkDistribution(*discdist, minSubset))
        continue;

      float score = measure->call(attrIndex, gen, apriorClass, weightID);

      bool takeIt;
      if (!wins || (score > quality)) {
        wins = 1;
        takeIt = true;
      }
      else if (score == quality)
        takeIt = rgen.randbool(++wins);
      else
        takeIt = false;

      if (takeIt) {
        quality = score;
        subsetSizes = PDiscDistribution(*distIt); // not discdist - this would be double wrapping!
        bestAttr = attrIndex;
      }
    }
  }

  if (!wins)
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

  if (quality<worstAcceptable)
    return returnNothing(descriptions, subsetSizes, spentAttribute);

  PVariable attribute = gen->domain->attributes->at(bestAttr);
  TEnumVariable *enumAttr = attribute.AS(TEnumVariable);
  if (!enumAttr)
    descriptions = mlnew TStringList(subsetSizes->size(), "");
  else
    descriptions = mlnew TStringList(enumAttr->values.getReference());

  spentAttribute = bestAttr;

  TClassifierFromVarFD *splitter = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
  splitter->transformUnknowns = false;
  return splitter;
}
//*************************************************** int Logic::AddIncludes() { TStringList IncludeStrings,IncludeLines; int CurInputLine,CurIncludeLine; string filename; int err=0; string::size_type pos1,pos2; int CurLine; char *ptr; IncludeFilenames = TStringList(); IncludeStrings = TStringList(); EditLines = TStringList(); IncludeLines = TStringList(); CurLine = 0; for(CurInputLine = 0;CurInputLine<InputLines.num;CurInputLine++){ EditLines.add(InputLines.at(CurInputLine)); CurLine = EditLines.num -1; RealLineNum[CurLine] = CurInputLine; LineFile[CurLine] = 0; #ifdef _WIN32 if(_strnicmp(InputLines.at(CurInputLine).c_str(),"#include",8)) { #else if(strncasecmp(InputLines.at(CurInputLine).c_str(),"#include",8)){ #endif continue; } string str = InputLines.at(CurInputLine).substr(8); if(str.length()<4){ ShowError(CurLine,"Missing include filename !"); err=1; continue; } if(str[0] != ' '){ ShowError(CurLine,"' ' expected after #include."); err=1; continue; } pos1 = str.find_first_of("\"",1); pos2 = str.find_first_of("\"",pos1+1); if(pos1 == string::npos || pos2 == string::npos){ ShowError(CurLine,"Include filenames need quote marks around them."); err=1; continue; } filename = str.substr(pos1+1,pos2-pos1-1); if(filename.find_first_of("/")!=string::npos){ ShowError(CurLine,"Only files in the src directory can be included."); err=1; continue; } sprintf(tmp,"%s/src/%s",game->dir.c_str(),filename.c_str()); FILE *fptr = fopen(tmp,"rb"); if(fptr==NULL){ sprintf(tmp,"Can't open include file: %s/src/%s",game->dir.c_str(),filename.c_str()); ShowError(CurLine,tmp); err=1; continue; } IncludeLines.lfree(); while(fgets(tmp,MAX_TMP,fptr)!=NULL){ if((ptr=strchr(tmp,0x0a)))*ptr=0; if((ptr=strchr(tmp,0x0d)))*ptr=0; IncludeLines.add(tmp); } fclose(fptr); if(IncludeLines.num==0)continue; IncludeFilenames.add(filename); RemoveComments(IncludeLines); EditLines.replace(CurLine,empty_tmp); for(CurIncludeLine=0;CurIncludeLine<IncludeLines.num;CurIncludeLine++){ 
EditLines.add(IncludeLines.at(CurIncludeLine)); CurLine=EditLines.num-1; RealLineNum[CurLine] = CurIncludeLine; LineFile[CurLine] = IncludeFilenames.num; } } IncludeLines.lfree(); InputLines.lfree(); return err; } //*************************************************** int Logic::ReadDefines() { int err=0,i; string::size_type pos1,pos2; string ThisDefineName,ThisDefineValue; int CurLine; NumDefines = 0; for(CurLine = 0;CurLine<EditLines.num;CurLine++){ #ifdef _WIN32 if(_strnicmp(EditLines.at(CurLine).c_str(),"#define",7)){ #else if(strncasecmp(EditLines.at(CurLine).c_str(),"#define",7)){ #endif continue; } string str = EditLines.at(CurLine).substr(7); toLower(&str); if(str.length()<4){ ShowError(CurLine,"Missing define name !"); err=1; continue; } if(str[0] != ' '){ ShowError(CurLine,"' ' expected after #define."); err=1; continue; } if(NumDefines >= MaxDefines){ ShowError(CurLine,"Too many defines (max " + IntToStr(MaxDefines) + ")"); err=1; continue; } pos1 = str.find_first_not_of(" ",1); pos2 = str.find_first_of(" ",pos1); if(pos1 == string::npos||pos2 == string::npos){ ShowError(CurLine,"Missing define name !"); err=1; continue; } ThisDefineName = str.substr(pos1,pos2-1); if(ThisDefineName.find_first_not_of("qwertyuiopasdfghjklzxcvbnm1234567890._") != string::npos){ ShowError(CurLine,"Define name can contain only characters from [a-z],'.' 
and '_'."); err=1; continue; } for(i=0;i<NumDefines;i++){ if(ThisDefineName == DefineNames[i]){ ShowError(CurLine,ThisDefineName + " already defined !"); err=1; break; } } if(err)continue; for(i=0;i<=NumAGICommands;i++){ if(ThisDefineName == AGICommand[i].Name){ ShowError(CurLine,"Define name can not be a command name."); err=1; break; } } if(err)continue; for(i=1;i<=NumTestCommands;i++){ if(ThisDefineName == TestCommand[i].Name){ ShowError(CurLine,"Define name can not be a command name."); err=1; break; } } if(err)continue; if(ThisDefineName == "if" || ThisDefineName == "else" || ThisDefineName == "goto"){ ShowError(CurLine,"Invalid define name (" + ThisDefineName + ")"); err=1; continue; } pos1 = str.find_first_not_of(" ",pos2+1); if(pos1 == string::npos){ ShowError(CurLine,"Missing define value !"); err=1; continue; } if(str[pos1] == '"'){ ThisDefineValue = "\"" + ReadString(&pos1,str) + "\""; if(ErrorOccured)continue; if(str.find_first_not_of(" ",pos1) != string::npos){ ShowError(CurLine,"Nothing allowed on line after define value."); err=1; continue; } } else{ pos2 = str.find_first_of(" ",pos1+1); if(pos2 == string::npos){ ThisDefineValue = str.substr(pos1); } else{ ThisDefineValue = str.substr(pos1,pos2-pos1); if(str.find_first_not_of(" ",pos2) != string::npos){ ShowError(CurLine,"Nothing allowed on line after define value."); err=1; continue; } } if(ThisDefineValue.find_first_not_of("qwertyuiopasdfghjklzxcvbnm1234567890._") != string::npos){ ShowError(CurLine,"Non-string define value can contain only characters from [a-z],'.' 
and '_'."); err=1; continue; } } DefineNames[NumDefines]=ThisDefineName; DefineValues[NumDefines]=ThisDefineValue; DefineNameLength[NumDefines] = ThisDefineName.length(); NumDefines++; EditLines.replace(CurLine,empty_tmp); } return err; } //*************************************************** int Logic::ReadPredefinedMessages() { int err=0,i; string::size_type pos1; int MessageNum; for(i=0;i<MaxMessages;i++){ Messages[i]=""; MessageExists[i]=false; } for(CurLine = 0;CurLine<EditLines.num;CurLine++){ #ifdef _WIN32 if(_strnicmp(EditLines.at(CurLine).c_str(),"#message",8)){ #else if(strncasecmp(EditLines.at(CurLine).c_str(),"#message",8)){ #endif continue; } string str = EditLines.at(CurLine).substr(8); if(str[0] != ' '){ ShowError(CurLine,"' ' expected after #message."); err=1; continue; } MessageNum = atoi(str.c_str()); if(MessageNum==0){ ShowError(CurLine,"Invalid message number (must be 1-255)."); err=1; continue; } pos1 = str.find_first_of("\""); if(pos1 == string::npos){ ShowError(CurLine,"\" required at start of string."); err=1; continue; } Messages[MessageNum]=ReadString(&pos1,str); if(ErrorOccured)continue; if(Messages[MessageNum].find_first_not_of(" ",pos1) != string::npos){ sprintf(tmp,"Nothing allowed on line after message. 
"); ShowError(CurLine,tmp); err=1; continue; } MessageExists[MessageNum] =true; EditLines.replace(CurLine,empty_tmp); } return err; } //*************************************************** int Logic::ReadLabels() { int err=0,i; string::size_type pos1,pos2; string LabelName; int CurLine; NumLabels = 0; for(CurLine = 0;CurLine<EditLines.num;CurLine++){ string str = EditLines.at(CurLine); toLower(&str); pos1 = str.find_first_not_of(" "); if(pos1 == string::npos)continue; pos2 = str.find_first_not_of("qwertyuiopasdfghjklzxcvbnm1234567890._",pos1); if(pos2 == string::npos)continue; if((pos1 == pos2) || (str[pos2]!=':'))continue; LabelName = str.substr(pos1,pos2-pos1); for(i=1;i<=NumLabels;i++){ if(LabelName == Labels[i].Name){ ShowError(CurLine,"Label "+LabelName+" already defined."); err=1;break; } } if(err)continue; if(NumLabels > MaxLabels){ ShowError(CurLine,"Too many labels (max "+IntToStr(MaxLabels)+")"); err=1;continue; } if(LabelName == "if" || LabelName == "else" || LabelName == "goto"){ ShowError(CurLine,"Invalid label name ("+LabelName+")"); err=1;continue; } for(i=0;i<NumDefines;i++){ if((LabelName == DefineNames[i]) || (LabelName+":" == DefineNames[i])){ ShowError(CurLine,"Can't have a label with the same name a a define."); err=1;break; } } if(err)continue; NumLabels++; Labels[NumLabels].Name = LabelName; Labels[NumLabels].Loc = 0; } return err; }