void TCartesianClassifier::domainHasChanged()
{
  TEnumVariable *classV = mlnew TEnumVariable("new");
  classVar = classV;

  mults = vector<int>(domain->attributes->size(), 0);
  TLimitsCounter counter(vector<int>(domain->attributes->size(), 0));

  int mult = 1;
  vector<int>::reverse_iterator li(counter.limits.rbegin());
  vector<int>::reverse_iterator mi(mults.rbegin());
  for(TVarList::reverse_iterator vi(domain->attributes->rbegin()), ve(domain->attributes->rend()); vi != ve; vi++) {
    if ((*vi)->varType != TValue::INTVAR)
      raiseError("invalid attribute '%s' (discrete attributes expected)", (*vi)->get_name().c_str());

    *li = (*vi)->noOfValues();
    if (!*li)
      raiseError("invalid attribute '%s' (no values)", (*vi)->get_name().c_str());

    (*(mi++)) = mult;
    mult *= *(li++);
  }

  counter.reset();
  do {
    string val;
    TVarList::iterator vi(domain->attributes->begin());
    ITERATE(TLimitsCounter, ci, counter) {
      if (val.length())
        val += "_";
      val += (*(vi++)).AS(TEnumVariable)->values->at(*ci);
    }
    classV->addValue(val);
  } while (counter.next());
}
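
/* Illustrative sketch (not part of the library): the loop above enumerates the Cartesian
   product of the attributes' values and joins the value names with '_', so attributes with
   values {a, b} and {x, y} produce the derived class values "a_x", "a_y", "b_x", "b_y".
   The stand-alone equivalent below uses only std containers and assumes, as the multiplier
   computation above implies, that TLimitsCounter::next advances the last position fastest. */
#if 0
#include <iostream>
#include <string>
#include <vector>

int main()
{
  std::vector<std::vector<std::string> > values;
  values.push_back(std::vector<std::string>());
  values.back().push_back("a"); values.back().push_back("b");
  values.push_back(std::vector<std::string>());
  values.back().push_back("x"); values.back().push_back("y");

  std::vector<int> counter(values.size(), 0);
  for(;;) {
    // build the composite value name for the current counter state
    std::string val;
    for(size_t i = 0; i < counter.size(); i++) {
      if (val.length())
        val += "_";
      val += values[i][counter[i]];
    }
    std::cout << val << std::endl;            // prints a_x, a_y, b_x, b_y

    // odometer-style advance, rightmost position fastest
    int i = int(counter.size()) - 1;
    while (i >= 0 && ++counter[i] == int(values[i].size()))
      counter[i--] = 0;
    if (i < 0)
      break;
  }
  return 0;
}
#endif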
TCostMatrix::TCostMatrix(PVariable acv, const float &inside)
: classVar(acv),
  dimension(0),
  costs(NULL)
{
  TEnumVariable *dcv = classVar.AS(TEnumVariable);
  if (!dcv)
    raiseError("attribute '%s' is not discrete", classVar->get_name().c_str());

  dimension = dcv->noOfValues();
  if (!dimension)
    raiseError("attribute '%s' has no values", classVar->get_name().c_str());

  init(inside);
}
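
/* Illustrative sketch (not part of the library): the constructor above only validates that
   the class variable is discrete and non-empty and then delegates to init(inside).  Assuming
   init() fills a dimension x dimension matrix with `inside` and zeroes the diagonal (the usual
   default for misclassification costs), a three-valued class with inside = 1 would give
        0 1 1
        1 0 1
        1 1 0
   A stand-alone equivalent of that assumed layout: */
#if 0
#include <vector>

std::vector<float> defaultCosts(const int dimension, const float inside)
{
  std::vector<float> costs(dimension * dimension, inside);   // every misclassification costs `inside`
  for(int i = 0; i < dimension; i++)
    costs[i * dimension + i] = 0.0f;                         // correct predictions cost nothing
  return costs;
}
#endif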
PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,
                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates, PClassifier
                            )
{
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  PIntList bestMapping;
  int wins, bestAttr;
  PVariable bvar;

  if (measure->needs == TMeasureAttribute::Generator) {
    bool cse = candidates.size() == 0;
    bool haveCandidates = false;
    vector<bool> myCandidates;
    myCandidates.reserve(gen->domain->attributes->size());
    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    TVarList::const_iterator vi, ve(gen->domain->attributes->end());
    for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
      // A discrete attribute is a candidate if no candidate list was given or its flag is set
      bool co = ((*vi)->varType == TValue::INTVAR) && (cse || ((ci != ce) && *ci));
      if (ci != ce)
        ci++;
      myCandidates.push_back(co);
      haveCandidates = haveCandidates || co;
    }

    if (!haveCandidates)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    PDistribution thisSubsets;
    float thisQuality;
    wins = 0;
    int thisAttr = 0;

    int N = gen->numberOfExamples();
    TSimpleRandomGenerator rgen(N);

    ci = myCandidates.begin();
    for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
      if (*ci) {
        thisSubsets = NULL;
        PIntList thisMapping =
           /*throughCont ? measure->bestBinarization(thisSubsets, thisQuality, *dci, dcont->classes, apriorClass, minSubset)
                        : */measure->bestBinarization(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
        if (thisMapping
              && (   ((!wins || (thisQuality > quality)) && ((wins = 1) == 1))
                  || ((thisQuality == quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = thisQuality;
          subsetSizes = thisSubsets;
          bestMapping = thisMapping;
        }
      }
      /*if (throughCont)
          dci++; */
    }

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (quality < worstAcceptable)
      return returnNothing(descriptions, subsetSizes, spentAttribute);

    if (subsetSizes && subsetSizes->variable)
      bvar = subsetSizes->variable;
    else {
      TEnumVariable *evar = mlnew TEnumVariable("");
      evar->addValue("0");
      evar->addValue("1");
      bvar = evar;
    }
  }

  else {
    bool cse = candidates.size() == 0;
    if (!cse && noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (!dcont || dcont->classIsOuter) {
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
      // raiseWarningWho("TreeSplitConstructor_ExhaustiveBinary", "this class is not optimized for 'candidates' list and can be very slow");
    }

    int N = gen ? gen->numberOfExamples() : -1;
    if (N < 0)
      N = dcont->classes->cases;

    TSimpleRandomGenerator rgen(N);

    PDistribution classDistribution = dcont->classes;

    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());

    TDiscDistribution *dis0, *dis1;
    TContDistribution *con0, *con1;

    int thisAttr = 0;
    bestAttr = -1;
    wins = 0;
    quality = 0.0;
    float leftExamples, rightExamples;

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci != ce)) && (dci != dce); dci++, thisAttr++) {

      // We consider the attribute only if it is a candidate, discrete and has at least two values
      if ((cse || *(ci++)) && ((*dci)->outerVariable->varType == TValue::INTVAR) && ((*dci)->discrete->size() >= 2)) {

        const TDistributionVector &distr = *(*dci)->discrete;

        if (distr.size() > 16)
          raiseError("'%s' has more than 16 values, cannot exhaustively binarize", gen->domain->attributes->at(thisAttr)->get_name().c_str());

        // If the attribute is binary, we check subsetSizes and assess the quality if they are OK
        if (distr.size() == 2) {
          if ((distr.front()->abs < minSubset) || (distr.back()->abs < minSubset))
            continue; // next attribute
          else {
            float thisMeas = measure->call(thisAttr, dcont, apriorClass);
            if (   ((!wins || (thisMeas > quality)) && ((wins = 1) == 1))
                || ((thisMeas == quality) && rgen.randbool(++wins))) {
              bestAttr = thisAttr;
              quality = thisMeas;
              leftExamples = distr.front()->abs;
              rightExamples = distr.back()->abs;
              bestMapping = mlnew TIntList(2, 0);
              bestMapping->at(1) = 1;
            }
            continue;
          }
        }

        vector<int> valueIndices;
        int ind = 0;
        for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi != dve); dvi++, ind++)
          if ((*dvi)->abs > 0)
            valueIndices.push_back(ind);

        if (valueIndices.size() < 2)
          continue;

        PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);

        // A real job: go through all splits
        int binWins = 0;
        float binQuality = -1.0;
        float binLeftExamples = -1.0, binRightExamples = -1.0;

        // Selection: each element corresponds to a value of the original attribute and is 1 if the value goes right.
        // The first value always goes left (and has no corresponding bit in selection).
        TBoolCount selection(valueIndices.size()-1), bestSelection(0);

        // First for discrete classes
        if (dis0) {
          do {
            *dis0 = CAST_TO_DISCDISTRIBUTION(distr[valueIndices[0]]);
            *dis1 *= 0;
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi != be; bi++, ii++)
              *(*bi ? dis1 : dis0) += distr[*ii];

            cont->outerDistribution->setint(0, dis0->abs);
            cont->outerDistribution->setint(1, dis1->abs);

            if ((dis0->abs < minSubset) || (dis1->abs < minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas > binQuality)) && ((binWins = 1) == 1))
                || ((thisMeas == binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection;
              binQuality = thisMeas;
              binLeftExamples = dis0->abs;
              binRightExamples = dis1->abs;
            }
          } while (selection.next());
        }

        // And then exactly the same for continuous classes
        else {
          do {
            *con0 = CAST_TO_CONTDISTRIBUTION(distr[valueIndices[0]]);
            *con1 = TContDistribution();
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi != be; bi++, ii++)
              *(*bi ? con1 : con0) += distr[*ii];

            if ((con0->abs < minSubset) || (con1->abs < minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas > binQuality)) && ((binWins = 1) == 1))
                || ((thisMeas == binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection;
              binQuality = thisMeas;
              binLeftExamples = con0->abs;
              binRightExamples = con1->abs;
            }
          } while (selection.next());
        }

        if (   binWins
            && (   ((!wins || (binQuality > quality)) && ((wins = 1) == 1))
                || ((binQuality == quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = binQuality;
          leftExamples = binLeftExamples;
          rightExamples = binRightExamples;
          bestMapping = mlnew TIntList(distr.size(), -1);
          vector<int>::const_iterator ii = valueIndices.begin();
          bestMapping->at(*(ii++)) = 0;
          ITERATE(TBoolCount, bi, bestSelection)
            bestMapping->at(*(ii++)) = *bi ? 1 : 0;
        }
      }
    }

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    subsetSizes = mlnew TDiscDistribution();
    subsetSizes->addint(0, leftExamples);
    subsetSizes->addint(1, rightExamples);
  }

  PVariable attribute = gen->domain->attributes->at(bestAttr);

  if (attribute->noOfValues() == 2) {
    spentAttribute = bestAttr;
    descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
    TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
    cfv->transformUnknowns = false;
    return cfv;
  }

  string s0, s1;
  int ns0 = 0, ns1 = 0;
  TValue ev;
  attribute->firstValue(ev);

  PITERATE(TIntList, mi, bestMapping) {
    string str;
    attribute->val2str(ev, str);
    if (*mi == 1) {
      s1 += string(ns1 ? ", " : "") + str;
      ns1++;
    }
    else if (*mi == 0) {
      s0 += string(ns0 ? ", " : "") + str;
      ns0++;
    }
    attribute->nextValue(ev);
  }
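
  /* Illustrative sketch (not part of the library): the TBoolCount `selection` used above
     enumerates every split of the attribute's non-empty values into a left and a right
     branch.  The first value is pinned to the left branch, so k values give 2^(k-1)
     candidate splits (including the one with an empty right branch, which the minSubset
     test rejects whenever minSubset is positive).  A stand-alone equivalent of that
     enumeration using a plain bitmask, with hypothetical value indices 0..nValues-1: */
#if 0
#include <cstdio>

void enumerateBinarySplits(const int nValues)
{
  // bit i of `mask` tells whether value i+1 goes right; value 0 always goes left
  for(unsigned mask = 0; mask < (1u << (nValues - 1)); mask++) {
    printf("left: 0");
    for(int i = 1; i < nValues; i++)
      if (!((mask >> (i - 1)) & 1u))
        printf(" %d", i);
    printf("  right:");
    for(int i = 1; i < nValues; i++)
      if ((mask >> (i - 1)) & 1u)
        printf(" %d", i);
    printf("\n");
  }
}
#endif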
bool convertFromPythonExisting(PyObject *lst, TExample &example)
{
  PDomain dom = example.domain;

  if (PyOrExample_Check(lst)) {
    const TExample &orex = PyExample_AS_ExampleReference(lst);
    if (orex.domain != dom)
      dom->convert(example, orex);
    else
      example = orex;
    return true;
  }

  if (!PyList_Check(lst)) {
    PyErr_Format(PyExc_TypeError, "invalid argument type (expected list, got '%s')", lst ? lst->ob_type->tp_name : "None");
    return false;
  }

  int const nvars = dom->variables->size() + dom->classVars->size();
  if (Py_ssize_t(nvars) != PyList_Size(lst)) {
    PyErr_Format(PyExc_IndexError, "invalid list size (got %i, expected %i items)", PyList_Size(lst), nvars);
    return false;
  }

  Py_ssize_t pos = 0;
  TExample::iterator ei(example.begin());
  TVarList::iterator vi(dom->variables->begin());
  TVarList::const_iterator const ve(dom->variables->end());
  TVarList::const_iterator const ce(dom->classVars->end());

  while(vi != ce && vi != ve) {
    PyObject *li = PyList_GetItem(lst, pos++);
    if (!li)
      PYERROR(PyExc_SystemError, "can't read the list", false);

    if (PyOrValue_Check(li)) {
      if (PyValue_AS_Variable(li) ? (PyValue_AS_Variable(li) != *vi)
                                  : (PyValue_AS_Value(li).varType != (*vi)->varType)) {
        PyErr_Format(PyExc_TypeError, "wrong value type for attribute no. %i (%s)", pos, (*vi)->get_name().c_str());
        return false;
      }
      else
        *(ei++) = PyValue_AS_Value(li);
    }

    else {
      if (li == Py_None) {
        *(ei++) = (*vi)->DK();
      }
      else if (PyString_Check(li)) {
        (*vi)->str2val(string(PyString_AsString(li)), *(ei++));
      }
      else if ((*vi)->varType == TValue::INTVAR) {
        if (PyInt_Check(li)) {
          TEnumVariable *enumvar = dynamic_cast<TEnumVariable *>(vi->getUnwrappedPtr());
          int value = int(PyInt_AsLong(li));
          if (value < 0 || value >= enumvar->noOfValues()) {
            PyErr_Format(PyExc_ValueError, "value index %i out of range (0 - %i) at attribute no %i (%s)",
                         value, enumvar->noOfValues() - 1, pos, enumvar->get_name().c_str());
            return false;
          }
          *(ei++) = TValue(value);
        }
        else {
          PyErr_Format(PyExc_TypeError, "attribute no. %i (%s) is ordinal, string or int value expected", pos, (*vi)->get_name().c_str());
          return false;
        }
      }
      else if ((*vi)->varType == TValue::FLOATVAR) {
        float f;
        if (PyNumber_ToFloat(li, f))
          *(ei++) = TValue(f);
        else {
          PyErr_Format(PyExc_TypeError, "attribute no. %i (%s) is continuous, float value expected", pos, (*vi)->get_name().c_str());
          return false;
        }
      }
      else
        ei++;
    }

    if (++vi == ve) {
      vi = dom->classVars->begin();
    }
  }
  return true;
}
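
/* Illustrative sketch (not part of the library): convertFromPythonExisting accepts either an
   Example or a Python list with exactly one item per variable (dom->variables followed by
   dom->classVars).  Each item may be a Value, None (unknown), a string, an int index for a
   discrete variable, or a number for a continuous one.  A minimal hypothetical caller,
   assuming `example` was already constructed over a domain with a discrete attribute, a
   continuous attribute and a discrete class, in that order, and no extra class variables: */
#if 0
bool fillExampleFromList(TExample &example)
{
  PyObject *lst = PyList_New(3);
  PyList_SetItem(lst, 0, PyInt_FromLong(1));          // discrete attribute, value index 1
  PyList_SetItem(lst, 1, PyFloat_FromDouble(3.14));   // continuous attribute
  Py_INCREF(Py_None);
  PyList_SetItem(lst, 2, Py_None);                    // class value left unknown
  const bool ok = convertFromPythonExisting(lst, example);
  Py_DECREF(lst);
  return ok;
}
#endif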