Пример #1
0
void
GffFileWriter::WriteLabelEntries(
	__inout GFF_HEADER & Header,
	__in GffWriteContext * Context
	)
/*++

Routine Description:

	This routine emits each distinct field label to the writer context and
	records, on every field, the index of the label that it uses.  While
	walking the tracked structures it also assigns each one its sequential
	struct index.

Arguments:

	Header - Supplies the file header under construction.  Its LabelCount
	         member is incremented once for every distinct label written.

	Context - Supplies the write context that receives the label contents.

Return Value:

	None.  The routine raises an std::exception on failure.

Environment:

	User mode.

--*/
{
	STRUCT_INDEX  NextStructIndex;
	LabelIndexMap SeenLabels;

#if !GFFFILEWRITER_PRETRACK_STRUCTS
	//
	// Structures were not tracked ahead of time, so discard whatever a prior
	// write attempt left behind and rebuild the index from the root.
	//
	// The index holds weak references only; the data tree is not modified
	// while the index (m_Structs) is alive.
	//

	m_Structs.clear( );

	AddStructRecursive( m_RootStruct.get( ) );
#endif

	//
	// First pass over the (now frozen) struct array: number each struct and
	// write out the label of every field that has not been seen before.
	//

	NextStructIndex = 0;

	for (FieldStructIdxVec::iterator StructIt = m_Structs.begin( );
	     StructIt != m_Structs.end( );
	     ++StructIt)
	{
		(*StructIt)->StructIndex = NextStructIndex;
		NextStructIndex++;

		FieldEntryVec & Fields = (*StructIt)->StructFields;

		for (FieldEntryVec::iterator FieldIt = Fields.begin( );
		     FieldIt != Fields.end( );
		     ++FieldIt)
		{
			LabelIndexMap::iterator Known;

			Known = SeenLabels.find( FieldIt->FieldLabelEntry );

			//
			// A label that was already written just reuses its index.
			//

			if (Known != SeenLabels.end( ))
			{
				FieldIt->FieldLabelIndex = Known->second;
				continue;
			}

			//
			// New label: write it first, then remember the index that it
			// was given (the current label count) and bump the count.
			//

			Context->Write( FieldIt->FieldLabel, sizeof( FieldIt->FieldLabel ) );

			SeenLabels.insert(
				LabelIndexMap::value_type(
					FieldIt->FieldLabelEntry,
					Header.LabelCount
					)
				);

			FieldIt->FieldLabelIndex = (LABEL_INDEX) Header.LabelCount;

			Header.LabelCount += 1;
		}
	}
}
Пример #2
0
bool
testJointBoostFile(string const & path)
{
  ifstream in(path.c_str());
  if (!in)
  {
    cerr << "Couldn't open file " << path << endl;
    return false;
  }

  cout << "======================================" << endl;
  cout << "Reading features from file" << endl;
  cout << "======================================" << endl << endl;

  ExampleSet::Ptr all_training;
  ExampleSet::Ptr training_subset;
  ExampleSet::Ptr holdout_subset;

  typedef UnorderedMap<string, long> LabelIndexMap;
  LabelIndexMap labels;

  vector<double> features;
  double feature;
  string label;

  string line;
  while (getline(in, line))
  {
    line = trimWhitespace(line);
    if (line.empty())
      continue;

    vector<string> fields;
    stringSplit(line, ",\t", fields, /* skip_empty_fields = */ false);

    if (fields.size() < 2)
    {
      cerr << "Data has too few features per example" << endl;
      return false;
    }

    long nfeat = (long)fields.size() - 1;
    if (!all_training)
    {
      cout << "Data has " << nfeat << " features per example" << endl;
      all_training = ExampleSet::Ptr(new ExampleSet(nfeat));
    }
    else
    {
      if (nfeat != all_training->numFeatures())
      {
        cout << "Inconsistent number of features for example " << all_training->numExamples() << endl;
        return false;
      }
    }

    features.clear();
    for (long i = 0; i < nfeat; ++i)
    {
      istringstream iss(fields[(size_t)i]);
      iss >> feature;
      features.push_back(feature);
    }

    label = fields.back();

    LabelIndexMap::const_iterator existing_label = labels.find(label);
    long index;
    if (existing_label == labels.end())
    {
      index = (long)labels.size();
      labels[label] = index;
      cout << "Added class with label '" << label << "' and index " << index << endl;
    }
    else
      index = existing_label->second;

    all_training->addExample(features, index);

    if (rand() % 2)  // holdout half the input set for testing
    {
      if (!training_subset) training_subset = ExampleSet::Ptr(new ExampleSet(nfeat));
      training_subset->addExample(features, index);
    }
    else
    {
      if (!holdout_subset) holdout_subset = ExampleSet::Ptr(new ExampleSet(nfeat));
      holdout_subset->addExample(features, index);
    }
  }

  if (!all_training)
  {
    cout << "Could not read any lines from file" << endl;
    return false;
  }

  long num_classes = (long)labels.size();

  cout << "Read " << all_training->numExamples() << " examples from " << num_classes << " classes from file" << endl;

  JointBoost::Options opts;

#ifdef JB_FAST
  opts.setMinBoostingRounds(num_classes)
      .setMaxBoostingRounds(4 * num_classes)
      .setMinFractionalErrorReduction(-1)
      .setFeatureSamplingFraction(3.0 / all_training->numFeatures())
      .setMaxThresholdsFraction(0.25)
      .setForceGreedy(true)
      .setVerbose(false);

  // Options for bupa
  // opts.setMinBoostingRounds(10 * num_classes)
  //     .setMaxBoostingRounds(40 * num_classes)
  //     .setMinFractionalErrorReduction(0.00001)
  //     .setFeatureSamplingFraction(1)
  //     .setMaxThresholdsFraction(0.25)
  //     .setForceGreedy(true)
  //     .setVerbose(false);

  // Options for pendigits
  // opts.setMinBoostingRounds(min(num_classes, 3L))
  //     .setMaxBoostingRounds(4 * num_classes)
  //     .setMinFractionalErrorReduction(0.0000001)
  //     .setFeatureSamplingFraction(0.25)
  //     .setMaxThresholdsFraction(0.001)
  //     .setForceGreedy(true)
  //     .setVerbose(true);
#else
  opts.setMinBoostingRounds(10 * num_classes)
      .setMaxBoostingRounds(40 * num_classes)
      .setMinFractionalErrorReduction(0.00001)
      .setFeatureSamplingFraction(1)
      .setMaxThresholdsFraction(1)
      .setForceExhaustive(true)
      .setVerbose(true);
#endif

  JointBoost jb(num_classes, all_training->numFeatures(), opts);

  // Self-testing
  {
    cout << endl;
    cout << "======================================" << endl;
    cout << "Self-testing" << endl;
    cout << "======================================" << endl << endl;

    jb.train(*all_training);
    jb.dumpToConsole();

    if (!test(jb, *all_training, true))
      return false;
  }

  jb.clear();

  // Holdout-testing
  if (training_subset && holdout_subset)
  {
    cout << endl;
    cout << "======================================" << endl;
    cout << "Holdout-testing" << endl;
    cout << "======================================" << endl << endl;

    jb.train(*training_subset);
    jb.dumpToConsole();

    if (!test(jb, *holdout_subset, true))
      return false;
  }
  else
  {
    cerr << "Not enough examples to do holdout testing" << endl;
    return false;
  }

  return true;
}