/// Distribute Elements (plus an optional Grow slot) evenly across Nodes,
/// leaning left: the first nodes absorb the remainder elements. Fills
/// NewSize[n] with the element count assigned to node n and returns the
/// (node, offset) coordinates of Position under the new distribution.
/// CurSize is accepted for interface compatibility but not consulted by
/// this trivial algorithm.
IdxPair distribute(unsigned Nodes, unsigned Elements, unsigned Capacity,
                   const unsigned *CurSize, unsigned NewSize[],
                   unsigned Position, bool Grow) {
  assert(Elements + Grow <= Nodes * Capacity && "Not enough room for elements");
  assert(Position <= Elements && "Invalid position");
  if (!Nodes)
    return IdxPair();

  // Trivial algorithm: left-leaning even distribution. The first `Remainder`
  // nodes each hold one element more than the rest.
  const unsigned Base = (Elements + Grow) / Nodes;
  const unsigned Remainder = (Elements + Grow) % Nodes;

  // Result.first == Nodes acts as a "not found yet" sentinel while scanning.
  IdxPair Result(Nodes, 0);
  unsigned Running = 0;
  for (unsigned Idx = 0; Idx != Nodes; ++Idx) {
    NewSize[Idx] = Base + (Idx < Remainder);
    Running += NewSize[Idx];
    // Capture the node/offset the first time the running total passes Position.
    if (Result.first == Nodes && Running > Position)
      Result = IdxPair(Idx, Position - (Running - NewSize[Idx]));
  }
  assert(Running == Elements + Grow && "Bad distribution sum");

  // The Grow slot was counted during distribution; take it back out of the
  // node that will receive the insertion.
  if (Grow) {
    assert(Result.first < Nodes && "Bad algebra");
    assert(NewSize[Result.first] && "Too few elements to need Grow");
    --NewSize[Result.first];
  }

#ifndef NDEBUG
  // Verify the distribution respects Capacity and accounts for every element.
  Running = 0;
  for (unsigned Idx = 0; Idx != Nodes; ++Idx) {
    assert(NewSize[Idx] <= Capacity && "Overallocated node");
    Running += NewSize[Idx];
  }
  assert(Running == Elements && "Bad distribution sum");
#endif

  return Result;
}
// chooses split operators that can be usesful in separating the classes // this is done before an intial model is created // This function does not create the appropriate features, and should be // called on a temporary list. // verbose = 0, minimal chatter // verbose = 1, one line summary for each feature // verbose > 1, more details void MlOperatorList::selectInitialSplitOperators(MlTrainingContainer& mtc, const MlFeatureSet& featureSet, bool addBooleanIndicators, size_t verboseLevel) { MlOperatorSearchData* auxilaryData = mtc.getAuxilarySearchData(); const size_t featureGenerationType = mtc.getFeatureGenerationType(); const double minRelativeInformationGain = mtc.getMinRelativeInformationGain(); const size_t maxNumSplits = mtc.getMaxNumSplits(); const vector<size_t>& numFeatureValues = auxilaryData->numFeatureValues_; const vector<MlFeature>& features = featureSet.getFeatures(); assert( auxilaryData->getNumBasicFeatures() == features.size()); if (operatorFeatureSpaceSize_ < features.size()) operatorFeatureSpaceSize_ = features.size(); for (size_t i=0; i<features.size(); i++) { if (numFeatureValues[i] < 2) continue; if (verboseLevel>1) cout << endl << "FEATURE " << i << " - " << features[i].getName() << endl; SplitOperator sop; double gain = auxilaryData->findOptimalSplit(i, sop.thresholds, -0.333, minRelativeInformationGain, maxNumSplits, 300, (verboseLevel>1) ); if (verboseLevel == 1) { cout << "F" << i << "\t" << features[i].getName() << "\tgain:" << fixed << setprecision(4) << gain; cout << " ["; for (size_t i=0; i<sop.thresholds.size(); i++) cout << " " << sop.thresholds[i]; cout << " ]" << endl; } if (sop.thresholds.size()>0) { // add for now sop.sourceIdx = i; sop.indexesForBinValues.resize(sop.thresholds.size()+1,MAX_UINT); for (size_t i=0; i<=sop.thresholds.size(); i++) sop.indexesForBinValues[i]=operatorFeatureSpaceSize_++; if (addBooleanIndicators) { sop.indexesForBinIndicators.resize(sop.thresholds.size()+1,MAX_UINT); for (size_t i=0; 
i<=sop.thresholds.size(); i++) sop.indexesForBinIndicators[i]=operatorFeatureSpaceSize_++; } executionOrder_.push_back(IdxPair(OT_SPLIT,splits_.size())); splits_.push_back(sop); if (verboseLevel>1) { cout << endl << "[" << setprecision(4); for (size_t j=0; j<sop.thresholds.size(); j++) cout << " " << sop.thresholds[j]; cout << "]" << endl; } } } }
// Reads the operator list from an open model-file stream.
// Expected format: optional leading '#' comment lines, then a line holding
// the number of operators, then one line per operator tagged by a single
// type character: D=drop, I=indicator, F=function, N=normalization,
// S=split, C=conditional.
// Returns false if the stream ends prematurely; calls error() (which does
// not return) on malformed lines.
bool MlOperatorList::readOperatorList(ifstream& ifs)
{
	char buffer[1024];
	buffer[0]='\0'; // avoid reading uninitialized data if the stream is already bad

	// Skip comment lines until the operator-count line.
	while (ifs.good() && ifs.getline(buffer,1024))
		if (ifs.gcount()>0 && buffer[0] != '#')
			break;

	// BUG FIX: the original passed a size_t* to sscanf's %d (undefined
	// behavior); parse through an unsigned int instead.
	unsigned int numOperators=0;
	if (sscanf(buffer,"%u",&numOperators) != 1)
		error("expected line with number of operators");
	const size_t n = numOperators;

	executionOrder_.clear();
	executionOrder_.reserve(n);
	drops_.clear();
	indicators_.clear();
	normalizations_.clear();
	functions_.clear();
	splits_.clear();
	conditionals_.clear();

	for (size_t i=0; i<n; i++)
	{
		if (! ifs.good())
			return false;
		ifs.getline(buffer,1024);
		if (ifs.gcount()<=0)
			return false;

		istringstream iss(buffer);
		char type=' ';
		iss >> type;
		switch (type)
		{
			case 'D':
			{
				size_t dropIdx=0;
				iss >> dropIdx;
				if (iss.fail())
					error("Error reading line:",buffer);
				executionOrder_.push_back(IdxPair(OT_DROP,drops_.size()));
				drops_.push_back(dropIdx);
			}
			break;

			case 'I':
			{
				IndicatorOperator iop;
				iss >> iop.sourceIdx >> iop.targetIdx;
				if (iss.fail())
					error("Error reading line:",buffer);
				executionOrder_.push_back(IdxPair(OT_INDICATOR,indicators_.size()));
				indicators_.push_back(iop);
			}
			break;

			case 'F':
			{
				FunctionOperator fop;
				iss >> fop.sourceIdx >> fop.targetIdx;
				string typeStr;
				iss >> typeStr;
				if (iss.fail())
					error("Error reading line:",buffer);
				// NOTE(review): function types are resolved against the
				// conditional *value* label table, as in the original code;
				// confirm the tables are intentionally shared.
				size_t labelIdx;
				for (labelIdx=0; labelIdx<numConditionalValueLabels; labelIdx++)
					if (! strcmp(typeStr.c_str(),conditionalValueLabels[labelIdx]))
						break;
				if (labelIdx == numConditionalValueLabels)
					error("Error reading line:",buffer);
				fop.type = labelIdx;
				executionOrder_.push_back(IdxPair(OT_FUNCTION,functions_.size()));
				functions_.push_back(fop);
			}
			break; // BUG FIX: the original fell through into case 'N', making
			       // every 'F' line also attempt (and fail) a normalization parse

			case 'N':
			{
				NormalizationOperator nop;
				iss >> nop.sourceIdx >> nop.targetIdx >> nop.mu >> nop.sigma;
				if (iss.fail())
					error("Error reading line:",buffer);
				executionOrder_.push_back(IdxPair(OT_NORMALIZATION,normalizations_.size()));
				normalizations_.push_back(nop);
			}
			break;

			case 'S':
			{
				SplitOperator sop;
				iss >> sop.sourceIdx;
				size_t numThresholds=0;
				iss >> numThresholds;
				if (iss.fail())
					error("Error reading line:",buffer);
				// numThresholds thresholds define numThresholds+1 bins.
				sop.thresholds.resize(numThresholds);
				sop.indexesForBinValues.resize(numThresholds+1);
				sop.indexesForBinIndicators.resize(numThresholds+1);
				for (size_t t=0; t<numThresholds; t++)
					iss >> sop.thresholds[t];
				for (size_t b=0; b<=numThresholds; b++)
				{
					// a fix so the model file can use -1 for MAX_UINT (so it won't hurt my eyes...)
					int index;
					iss >> index;
					sop.indexesForBinValues[b] = (index>=0 ? static_cast<size_t>(index) : MAX_UINT);
				}
				for (size_t b=0; b<=numThresholds; b++)
				{
					// a fix so the model file can use -1 for MAX_UINT (so it won't hurt my eyes...)
					int index;
					iss >> index;
					sop.indexesForBinIndicators[b] = (index>=0 ? static_cast<size_t>(index) : MAX_UINT);
				}
				if (iss.fail())
					error("Error reading line:",buffer);
				executionOrder_.push_back(IdxPair(OT_SPLIT,splits_.size()));
				splits_.push_back(sop);
			}
			break;

			case 'C':
			{
				ConditionalOperator cod;
				size_t numSources=0;
				iss >> numSources;
				cod.sourceIdxs.resize(numSources);
				for (size_t s=0; s<numSources; s++)
					iss >> cod.sourceIdxs[s];
				iss >> cod.targetIdx;

				// a fix so the model file can use -1 for MAX_UINT (so it won't hurt my eyes...)
				int indexForBool;
				iss >> indexForBool;
				cod.indexForBool = (indexForBool>=0 ? static_cast<size_t>(indexForBool) : MAX_UINT);

				string resultStr, conditionStr;
				iss >> conditionStr >> resultStr;
				if (iss.fail())
					error("Error reading line:",buffer);

				size_t idxType, idxResult;
				for (idxType=0; idxType<numConditionalOperatorLabels; idxType++)
					if (! strcmp(conditionStr.c_str(),conditionalOperatorLabels[idxType]))
						break;
				if (idxType == numConditionalOperatorLabels)
					error("Error reading line:",buffer);
				for (idxResult=0; idxResult<numConditionalValueLabels; idxResult++)
					if (! strcmp(resultStr.c_str(),conditionalValueLabels[idxResult]))
						break;
				if (idxResult == numConditionalValueLabels)
					error("Error reading line:",buffer);

				cod.conditionType = idxType;
				cod.resultType = idxResult;
				executionOrder_.push_back(IdxPair(OT_CONDITIONAL,conditionals_.size()));
				conditionals_.push_back(cod);
			}
			break;

			// NOTE(review): an unrecognized type character is silently
			// skipped, matching the original behavior; consider calling
			// error() here instead.
		};
	}
	return true;
}