Ejemplo n.º 1
0
void Alignment::Assign(Alignment &rhs) {
    ((AlignmentStats*)this)->Assign(rhs);
    qPos = rhs.qPos; tPos = rhs.tPos;
    qAlignLength = rhs.qAlignLength;
    tAlignLength = rhs.tAlignLength;
    qLength = rhs.qLength;
    zScore  = rhs.zScore;
    qName   = rhs.qName;
    tName   = rhs.tName;
    qStrand = rhs.qStrand;
    tStrand = rhs.tStrand;
    nCells  = rhs.nCells;
    std::vector<Block> empty;
    blocks.swap(empty);
    blocks.resize(rhs.size());
    int b;
    for (b = 0; b < rhs.blocks.size(); b++) {
        blocks[b].Assign(rhs.blocks[b]);
    }
}
Ejemplo n.º 2
0
int ExtractGHKM::Main(int argc, char *argv[])
{
  // Process command-line options.
  Options options;
  ProcessOptions(argc, argv, options);

  // Open input files.
  InputFileStream targetStream(options.targetFile);
  InputFileStream sourceStream(options.sourceFile);
  InputFileStream alignmentStream(options.alignmentFile);

  // Open output files.
  OutputFileStream fwdExtractStream;
  OutputFileStream invExtractStream;
  std::ofstream glueGrammarStream;
  std::ofstream unknownWordStream;
  std::string fwdFileName = options.extractFile;
  std::string invFileName = options.extractFile + std::string(".inv");
  if (options.gzOutput) {
    fwdFileName += ".gz";
    invFileName += ".gz";
  }
  OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
  OpenOutputFileOrDie(invFileName, invExtractStream);
  if (!options.glueGrammarFile.empty()) {
    OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
  }
  if (!options.unknownWordFile.empty()) {
    OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
  }

  // Target label sets for producing glue grammar.
  std::set<std::string> labelSet;
  std::map<std::string, int> topLabelSet;

  // Word count statistics for producing unknown word labels.
  std::map<std::string, int> wordCount;
  std::map<std::string, std::string> wordLabel;

  std::string targetLine;
  std::string sourceLine;
  std::string alignmentLine;
  XmlTreeParser xmlTreeParser(labelSet, topLabelSet);
  ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
  size_t lineNum = options.sentenceOffset;
  while (true) {
    std::getline(targetStream, targetLine);
    std::getline(sourceStream, sourceLine);
    std::getline(alignmentStream, alignmentLine);

    if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) {
      break;
    }

    if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) {
      Error("Files must contain same number of lines");
    }

    ++lineNum;

    // Parse target tree.
    if (targetLine.size() == 0) {
      std::cerr << "skipping line " << lineNum << " with empty target tree\n";
      continue;
    }
    std::auto_ptr<ParseTree> t;
    try {
      t = xmlTreeParser.Parse(targetLine);
      assert(t.get());
    } catch (const Exception &e) {
      std::ostringstream s;
      s << "Failed to parse XML tree at line " << lineNum;
      if (!e.GetMsg().empty()) {
        s << ": " << e.GetMsg();
      }
      Error(s.str());
    }

    // Read source tokens.
    std::vector<std::string> sourceTokens(ReadTokens(sourceLine));

    // Read word alignments.
    Alignment alignment;
    try {
      alignment = ReadAlignment(alignmentLine);
    } catch (const Exception &e) {
      std::ostringstream s;
      s << "Failed to read alignment at line " << lineNum << ": ";
      s << e.GetMsg();
      Error(s.str());
    }
    if (alignment.size() == 0) {
      std::cerr << "skipping line " << lineNum << " without alignment points\n";
      continue;
    }

    // Record word counts.
    if (!options.unknownWordFile.empty()) {
      CollectWordLabelCounts(*t, options, wordCount, wordLabel);
    }

    // Form an alignment graph from the target tree, source words, and
    // alignment.
    AlignmentGraph graph(t.get(), sourceTokens, alignment);

    // Extract minimal rules, adding each rule to its root node's rule set.
    graph.ExtractMinimalRules(options);

    // Extract composed rules.
    if (!options.minimal) {
      graph.ExtractComposedRules(options);
    }

    // Write the rules, subject to scope pruning.
    const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
    for (std::vector<Node *>::const_iterator p = targetNodes.begin();
         p != targetNodes.end(); ++p) {
      const std::vector<const Subgraph *> &rules = (*p)->GetRules();
      for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
           q != rules.end(); ++q) {
        ScfgRule r(**q);
        // TODO Can scope pruning be done earlier?
        if (r.Scope() <= options.maxScope) {
          writer.Write(r);
        }
      }
    }
  }

  if (!options.glueGrammarFile.empty()) {
    WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
  }

  if (!options.unknownWordFile.empty()) {
    WriteUnknownWordLabel(wordCount, wordLabel, options, unknownWordStream);
  }

  return 0;
}