void
GffFileWriter::WriteLabelEntries(
    __inout GFF_HEADER & Header,
    __in GffWriteContext * Context
    )
/*++

Routine Description:

    This routine writes the contents of each label out to the writer context.

Arguments:

    Header - Receives the constructed file header.  The header is updated as
             the write operation progresses.

    Context - Supplies the write context that receives the contents of the
              formatted GFF file.

Return Value:

    None.  The routine raises an std::exception on failure.

Environment:

    User mode.

--*/
{
    STRUCT_INDEX  StructIndex;
    LabelIndexMap AssignedLabels;

#if !GFFFILEWRITER_PRETRACK_STRUCTS
    //
    // If we are not pre-tracking structures, clear out any lingering state
    // from a previous write attempt and then index each structure in the
    // tree.
    //
    // Note that the index takes weak references as the data tree does not
    // change during the lifetime of the index (m_Structs).
    //

    m_Structs.clear( );
    AddStructRecursive( m_RootStruct.get( ) );
#endif

    //
    // Write the label of each field to disk.  Also, take the opportunity to
    // assign struct indices now, as the struct array is frozen for writing
    // and this is our first pass.
    //

    StructIndex = 0;

    for (FieldStructIdxVec::iterator it = m_Structs.begin( );
         it != m_Structs.end( );
         ++it)
    {
        (*it)->StructIndex = StructIndex++;

        for (FieldEntryVec::iterator fit = (*it)->StructFields.begin( );
             fit != (*it)->StructFields.end( );
             ++fit)
        {
            LabelIndexMap::iterator lit;

            //
            // If we have not already stored this label, assign a new label
            // index and write it.
            //

            lit = AssignedLabels.find( fit->FieldLabelEntry );

            if (lit == AssignedLabels.end( ))
            {
                Context->Write( fit->FieldLabel, sizeof( fit->FieldLabel ) );

                AssignedLabels.insert(
                    LabelIndexMap::value_type(
                        fit->FieldLabelEntry,
                        Header.LabelCount ) );

                fit->FieldLabelIndex = (LABEL_INDEX) Header.LabelCount;

                Header.LabelCount += 1;
            }
            else
            {
                fit->FieldLabelIndex = lit->second;
            }
        }
    }
}
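//
// A minimal, self-contained sketch (not part of GffFileWriter) of the label
// deduplication pattern used above: each distinct label is written to the
// output exactly once, and every later field that carries the same label
// reuses the previously assigned label index.  The names below
// (WriteUniqueLabel, the std::vector standing in for Context->Write) are
// hypothetical and exist only for this example.
//

#include <cstddef>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

typedef std::map< std::string, unsigned long > ExampleLabelIndexMap;

static
unsigned long
WriteUniqueLabel(
    const std::string & Label,
    ExampleLabelIndexMap & AssignedLabels,
    std::vector< std::string > & WrittenLabels
    )
{
    ExampleLabelIndexMap::iterator it = AssignedLabels.find( Label );

    if (it != AssignedLabels.end( ))
        return it->second;               // Already written; reuse its index.

    unsigned long Index = (unsigned long) WrittenLabels.size( );

    WrittenLabels.push_back( Label );    // Stand-in for Context->Write( ... ).
    AssignedLabels.insert( ExampleLabelIndexMap::value_type( Label, Index ) );

    return Index;
}

int
main(
    )
{
    ExampleLabelIndexMap       AssignedLabels;
    std::vector< std::string > WrittenLabels;

    const char * FieldLabels[ ] = { "Tag", "LocalizedName", "Tag", "Comment" };

    for (size_t i = 0; i < sizeof( FieldLabels ) / sizeof( FieldLabels[ 0 ] ); i += 1)
    {
        unsigned long Index;

        Index = WriteUniqueLabel( FieldLabels[ i ], AssignedLabels, WrittenLabels );

        printf( "Field '%s' -> label index %lu\n", FieldLabels[ i ], Index );
    }

    //
    // Only three labels ("Tag", "LocalizedName", "Comment") end up written,
    // even though four fields referenced them.
    //

    return 0;
}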
bool testJointBoostFile(string const & path)
{
  ifstream in(path.c_str());
  if (!in)
  {
    cerr << "Couldn't open file " << path << endl;
    return false;
  }

  cout << "======================================" << endl;
  cout << "Reading features from file" << endl;
  cout << "======================================" << endl << endl;

  ExampleSet::Ptr all_training;
  ExampleSet::Ptr training_subset;
  ExampleSet::Ptr holdout_subset;

  typedef UnorderedMap<string, long> LabelIndexMap;
  LabelIndexMap labels;

  vector<double> features;
  double feature;
  string label;
  string line;
  while (getline(in, line))
  {
    line = trimWhitespace(line);
    if (line.empty())
      continue;

    vector<string> fields;
    stringSplit(line, ",\t", fields, /* skip_empty_fields = */ false);
    if (fields.size() < 2)
    {
      cerr << "Data has too few features per example" << endl;
      return false;
    }

    long nfeat = (long)fields.size() - 1;
    if (!all_training)
    {
      cout << "Data has " << nfeat << " features per example" << endl;
      all_training = ExampleSet::Ptr(new ExampleSet(nfeat));
    }
    else
    {
      if (nfeat != all_training->numFeatures())
      {
        cout << "Inconsistent number of features for example " << all_training->numExamples() << endl;
        return false;
      }
    }

    features.clear();
    for (long i = 0; i < nfeat; ++i)
    {
      istringstream iss(fields[(size_t)i]);
      iss >> feature;
      features.push_back(feature);
    }

    label = fields.back();
    LabelIndexMap::const_iterator existing_label = labels.find(label);
    long index;
    if (existing_label == labels.end())
    {
      index = (long)labels.size();
      labels[label] = index;

      cout << "Added class with label '" << label << "' and index " << index << endl;
    }
    else
      index = existing_label->second;

    all_training->addExample(features, index);

    if (rand() % 2)  // holdout half the input set for testing
    {
      if (!training_subset)
        training_subset = ExampleSet::Ptr(new ExampleSet(nfeat));

      training_subset->addExample(features, index);
    }
    else
    {
      if (!holdout_subset)
        holdout_subset = ExampleSet::Ptr(new ExampleSet(nfeat));

      holdout_subset->addExample(features, index);
    }
  }

  if (!all_training)
  {
    cout << "Could not read any lines from file" << endl;
    return false;
  }

  long num_classes = (long)labels.size();
  cout << "Read " << all_training->numExamples() << " examples from " << num_classes << " classes from file" << endl;

  JointBoost::Options opts;

#ifdef JB_FAST
  opts.setMinBoostingRounds(num_classes)
      .setMaxBoostingRounds(4 * num_classes)
      .setMinFractionalErrorReduction(-1)
      .setFeatureSamplingFraction(3.0 / all_training->numFeatures())
      .setMaxThresholdsFraction(0.25)
      .setForceGreedy(true)
      .setVerbose(false);

  // Options for bupa
  // opts.setMinBoostingRounds(10 * num_classes)
  //     .setMaxBoostingRounds(40 * num_classes)
  //     .setMinFractionalErrorReduction(0.00001)
  //     .setFeatureSamplingFraction(1)
  //     .setMaxThresholdsFraction(0.25)
  //     .setForceGreedy(true)
  //     .setVerbose(false);

  // Options for pendigits
  // opts.setMinBoostingRounds(min(num_classes, 3L))
  //     .setMaxBoostingRounds(4 * num_classes)
  //     .setMinFractionalErrorReduction(0.0000001)
  //     .setFeatureSamplingFraction(0.25)
  //     .setMaxThresholdsFraction(0.001)
  //     .setForceGreedy(true)
  //     .setVerbose(true);
#else
  opts.setMinBoostingRounds(10 * num_classes)
      .setMaxBoostingRounds(40 * num_classes)
      .setMinFractionalErrorReduction(0.00001)
      .setFeatureSamplingFraction(1)
      .setMaxThresholdsFraction(1)
      .setForceExhaustive(true)
      .setVerbose(true);
#endif

  JointBoost jb(num_classes, all_training->numFeatures(), opts);

  // Self-testing
  {
    cout << endl;
    cout << "======================================" << endl;
    cout << "Self-testing" << endl;
    cout << "======================================" << endl << endl;

    jb.train(*all_training);
    jb.dumpToConsole();

    if (!test(jb, *all_training, true))
      return false;
  }

  jb.clear();

  // Holdout-testing
  if (training_subset && holdout_subset)
  {
    cout << endl;
    cout << "======================================" << endl;
    cout << "Holdout-testing" << endl;
    cout << "======================================" << endl << endl;

    jb.train(*training_subset);
    jb.dumpToConsole();

    if (!test(jb, *holdout_subset, true))
      return false;
  }
  else
  {
    cerr << "Not enough examples to do holdout testing" << endl;
    return false;
  }

  return true;
}