// Appends the alignment points of a hierarchical rule to rule.alignment and,
// unless m_options.onlyDirectFlag is set, the mirrored points to
// rule.alignmentInv. Every point is written as "<a>-<b> " (trailing space).
//
//   startT, endT   inclusive target span scanned for aligned words
//   startS, endS   source span; not referenced in the lines visible here
//   indexS, indexT sentence position -> symbol position lookups; a target
//                  position missing from indexT is skipped
//   holeColl       holes of the rule; GetPos(0)/GetPos(1) supply the
//                  non-terminal alignment points
//   rule           output; only its alignment strings are appended to
//
// NOTE(review): this listing appears to have lost lines - the opening brace of
// the body, every closing brace, and whatever followed the final `if` below.
// Restore them from the upstream file before compiling.
void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
                                      , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
  // print alignment of words
  for(int ti=startT; ti<=endT; ti++) {
    WordIndex::const_iterator p = indexT.find(ti);
    if (p != indexT.end()) { // does word still exist?
      // one alignment point per source word aligned to target word ti
      for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) {
        int si = m_sentence.alignedToT[ti][i];
        // NOTE(review): the find() result is dereferenced without an end()
        // check - presumably every such source word has an entry in indexS;
        // confirm against the caller.
        std::string sourceSymbolIndex = IntToString(indexS.find(si)->second);
        std::string targetSymbolIndex = IntToString(p->second);
        rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
        if (! m_options.onlyDirectFlag)
          rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";

  // print alignment of non terminals
  HoleList::const_iterator iterHole;
  for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
    const Hole &hole = *iterHole;

    // GetPos(0) is used as the source-side index, GetPos(1) as the target-side index
    std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
    std::string targetSymbolIndex = IntToString(hole.GetPos(1));
    rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
    if (!m_options.onlyDirectFlag)
      rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";

  // NOTE(review): the body of this `if` is missing from the listing.
  if (!m_options.onlyDirectFlag) {
// Rebuilds rule.targetSyntacticPreference as a space-terminated list with one
// entry per hole in holeColl: the label of the target-tree node chosen through
// labelIndex when the target tree has a node over the hole's target span, and
// "X" otherwise.
//
// NOTE(review): this listing appears to have lost lines - the opening brace of
// the body, the closing braces, and any statement that advances holeCount
// (it stays 0 in the visible code). Restore from upstream before compiling.
void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule)
  rule.targetSyntacticPreference = "";
  int holeCount = 0;
  for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
       iterHoleList != holeColl.GetHoles().end();
       ++iterHoleList) {

    const Hole &hole = *iterHoleList;

    // offset 2: saveHieroPhrase in this file reads labelIndex[0] and
    // labelIndex[1] for the phrase-level target/source labels
    int labelI = labelIndex[ 2+holeCount ];
    // NOTE(review): targetLabel is not used in the visible lines
    string targetLabel = "X";
    int startT = hole.GetStart(1);
    int endT = hole.GetEnd(1);
    if (m_sentence.targetTree.HasNode(startT,endT)) {
      rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label;
      rule.targetSyntacticPreference += " ";
    } else {
      // no syntactic node over this span
      rule.targetSyntacticPreference += "X ";

// Builds the target-side string of a hierarchical rule by walking target
// positions startT..endT (inclusive). A position where the current hole starts
// emits a bracketed non-terminal and jumps to the hole's end; any other
// position emits the target word and records its output position in indexT.
//
//   startS, endS, countS  only used for the boundary-rule test that selects "S"
//   indexT                output: target sentence position -> rule position
//   holeColl              holes; each gets its target label and position set
//   labelIndex            selects which tree node label to use per hole
//   logPCFGScore          in/out: reduced by getPcfgScore() of each hole's
//                         node when m_options.pcfgScore is set
// Returns the assembled string with its last character erased.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, and any statements advancing iterHoleList, holeCount and
// outPos. The final assert expects iterHoleList to have reached end(), so the
// increments presumably exist upstream. Restore before compiling.
string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int endS
    , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
    , int countS)
  HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
  // callers must pass at least one hole
  assert(iterHoleList != holeColl.GetHoles().end());

  string out = "";
  int outPos = 0;
  int holeCount = 0;
  for(int currPos = startT; currPos <= endT; currPos++) {
    // does the current hole start at this target position?
    bool isHole = false;
    if (iterHoleList != holeColl.GetHoles().end()) {
      const Hole &hole = *iterHoleList;
      isHole = hole.GetStart(1) == currPos;

    if (isHole) {
      Hole &hole = *iterHoleList;

      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      // offset 2 skips the two phrase-level entries at the front of labelIndex
      int labelI = labelIndex[ 2+holeCount ];
      string targetLabel;
      if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
        targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->label;
      } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
        // the phrase touches the first or last source position
        targetLabel = "S";
      } else {
        targetLabel = "X";

      hole.SetLabel(targetLabel, 1);

      // unpaired format prints only the target label; otherwise "[src][tgt]"
      if (m_options.unpairedExtractFormat) {
        out += "[" + targetLabel + "] ";
      } else {
        out += "[" + sourceLabel + "][" + targetLabel + "] ";

      if (m_options.pcfgScore) {
        logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]);

      // skip the words covered by the hole (the loop's currPos++ moves past its end)
      currPos = hole.GetEnd(1);
      hole.SetPos(outPos, 1);
    } else {
      indexT[currPos] = outPos;
      out += m_sentence.target[currPos] + " ";


  assert(iterHoleList == holeColl.GetHoles().end());
  // drop the last character (the trailing space)
  // NOTE(review): out.size()-1 wraps around if out is empty (startT > endT) - confirm callers never pass an empty span
  return out.erase(out.size()-1);
// Example #4
// Free-function variant that builds the target-side string of a hierarchical
// rule. Walks target positions startT..endT (inclusive): a position where the
// current hole starts emits a bracketed non-terminal and jumps to the hole's
// end; any other position emits sentence.target[currPos] and records its
// output position in indexT.
//
//   sentence      supplies target words and the target tree
//   startS, endS  not referenced in the visible lines
//   indexT        output: target sentence position -> rule position
//   holeColl      holes; each gets its target label and position set
//   labelIndex    selects which tree node to use per hole
//   logPCFGScore  in/out: reduced by each hole node's GetScore() when
//                 options.pcfgScore is set
// Returns the assembled string with its last character erased.
//
// `options` is not declared in the visible lines - presumably a file-level
// object; verify.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, an `else`, and any statements advancing iterHoleList,
// holeCount and outPos (the final assert expects the iterator at end()).
// Restore from upstream before compiling.
string printTargetHieroPhrase(SentenceAlignmentWithSyntax &sentence
                              , int startT, int endT, int startS, int endS
                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
  HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
  // callers must pass at least one hole
  assert(iterHoleList != holeColl.GetHoles().end());

  // target syntax without source syntax
  bool stringToTree = !options.sourceSyntax && options.targetSyntax;

  string out = "";
  int outPos = 0;
  int holeCount = 0;
  for(int currPos = startT; currPos <= endT; currPos++) {
    // does the current hole start at this target position?
    bool isHole = false;
    if (iterHoleList != holeColl.GetHoles().end()) {
      const Hole &hole = *iterHoleList;
      isHole = hole.GetStart(1) == currPos;

    if (isHole) {
      Hole &hole = *iterHoleList;

      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      // offset 2 skips the two phrase-level entries at the front of labelIndex
      int labelI = labelIndex[ 2+holeCount ];
      string targetLabel = options.targetSyntax ?
                           sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
      hole.SetLabel(targetLabel, 1);

      if (stringToTree)
        out += "[" + targetLabel + "] ";
        // NOTE(review): an `else` appears to be missing before the next line;
        // as listed, both forms would be appended when stringToTree is true.
        out += "[" + sourceLabel + "][" + targetLabel + "] ";

      if (options.pcfgScore) {
        double score = sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetScore();
        logPCFGScore -= score;

      // skip the words covered by the hole (the loop's currPos++ moves past its end)
      currPos = hole.GetEnd(1);
      hole.SetPos(outPos, 1);
    } else {
      indexT[currPos] = outPos;
      out += sentence.target[currPos] + " ";


  assert(iterHoleList == holeColl.GetHoles().end());
  // drop the last character (the trailing space)
  // NOTE(review): out.size()-1 wraps around if out is empty (startT > endT) - confirm callers never pass an empty span
  return out.erase(out.size()-1);
// Calls saveHieroPhrase repeatedly for the phrase pair [startT,endT] x
// [startS,endS] with the holes in holeColl, stepping labelIndex between calls
// until `done` is set. Label counts are taken from the target/source trees
// when the corresponding syntax option is on, and are 1 otherwise.
//
//   countS  forwarded unchanged to saveHieroPhrase
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, the bodies of the `targetSyntacticPreferences` ifs, the
// statements that fill labelCount/labelIndex from each numLabels, and the
// increment/else body of the stepping loop. As listed, labelIndex and
// labelCount are never populated. Restore from upstream before compiling.
void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
  LabelIndex labelIndex,labelCount;

  // number of target head labels
  int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
  // special case: preferences enabled but no node over the span (body missing from listing)
  if (m_options.targetSyntacticPreferences && !numLabels) {

  // number of source head labels
  numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;

  // number of target hole labels
  for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
       hole != holeColl.GetHoles().end(); hole++ ) {
    // shadows the outer numLabels
    int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
    if (m_options.targetSyntacticPreferences && !numLabels) {

  // number of source hole labels
  for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
       i != holeColl.GetSortedSourceHoles().end(); i++ ) {
    const Hole &hole = **i;
    // shadows the outer numLabels
    int numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;

  // loop through the holes
  bool done = false;
  while(!done) {
    saveHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
    // step to the next combination: a slot that has reached its count is
    // reset to 0, and the enumeration ends when the last slot wraps
    // (presumably a carry scheme; the increment itself is not visible here)
    for(unsigned int i=0; i<labelIndex.size(); i++) {
      if(labelIndex[i] == labelCount[i]) {
        labelIndex[i] = 0;
        if (i == labelIndex.size()-1)
          done = true;
      } else {
// Example #6
// Free-function variant: calls printHieroPhrase repeatedly for the phrase pair
// [startT,endT] x [startS,endS] with the holes in holeColl, stepping
// labelIndex between calls until `done` is set. Label counts come from the
// sentence's target/source trees when the corresponding syntax option is on,
// and are 1 otherwise.
//
// `options` is not declared in the visible lines - presumably a file-level
// object; verify.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, the statements that fill labelCount/labelIndex from each
// numLabels, and the increment/else body of the stepping loop. As listed,
// labelIndex and labelCount are never populated. Restore from upstream before
// compiling.
void printAllHieroPhrases( SentenceAlignmentWithSyntax &sentence
                           , int startT, int endT, int startS, int endS
                           , HoleCollection &holeColl)
  LabelIndex labelIndex,labelCount;

  // number of target head labels
  int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1;

  // number of source head labels
  numLabels =  options.sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1;

  // number of target hole labels
  for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
       hole != holeColl.GetHoles().end(); hole++ ) {
    // shadows the outer numLabels
    int numLabels =  options.targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;

  // number of source hole labels
  for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
       i != holeColl.GetSortedSourceHoles().end(); i++ ) {
    const Hole &hole = **i;
    // shadows the outer numLabels
    int numLabels =  options.sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;

  // loop through the holes
  bool done = false;
  while(!done) {
    printHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex );
    // step to the next combination: a slot that has reached its count is
    // reset to 0, and the enumeration ends when the last slot wraps
    // (presumably a carry scheme; the increment itself is not visible here)
    // NOTE(review): `int i` is compared against labelIndex.size() - signed/unsigned mismatch
    for(int i=0; i<labelIndex.size(); i++) {
      if(labelIndex[i] == labelCount[i]) {
        labelIndex[i] = 0;
        if (i == labelIndex.size()-1)
          done = true;
      } else {
// Example #7
// Member variant that builds the target-side string of a hierarchical rule.
// Walks target positions startT..endT (inclusive): a position where the
// current hole starts emits "[source][target] " and jumps to the hole's end;
// any other position emits the target word and records its output position
// in indexT.
//
//   startS, endS  not referenced in the visible lines
//   indexT        output: target sentence position -> rule position
//   holeColl      holes; each gets its target label and position set
//   labelIndex    selects which tree node label to use per hole
// Returns the assembled string with its last character erased.
//
// Unlike the other ExtractTask methods in this listing, m_sentence is
// dereferenced with `->` here - presumably this snippet comes from a different
// revision of the class; verify.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, and any statements advancing iterHoleList, holeCount and
// outPos (the final assert expects the iterator at end()). Restore from
// upstream before compiling.
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
  HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
  // callers must pass at least one hole
  assert(iterHoleList != holeColl.GetHoles().end());

  string out = "";
  int outPos = 0;
  int holeCount = 0;
  for(int currPos = startT; currPos <= endT; currPos++) {
    // does the current hole start at this target position?
    bool isHole = false;
    if (iterHoleList != holeColl.GetHoles().end()) {
      const Hole &hole = *iterHoleList;
      isHole = hole.GetStart(1) == currPos;

    if (isHole) {
      Hole &hole = *iterHoleList;

      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      // offset 2 skips the two phrase-level entries at the front of labelIndex
      int labelI = labelIndex[ 2+holeCount ];
      string targetLabel = m_options.targetSyntax ?
                           m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
      hole.SetLabel(targetLabel, 1);

      out += "[" + sourceLabel + "][" + targetLabel + "] ";

      // skip the words covered by the hole (the loop's currPos++ moves past its end)
      currPos = hole.GetEnd(1);
      hole.SetPos(outPos, 1);
    } else {
      indexT[currPos] = outPos;
      out += m_sentence->target[currPos] + " ";


  assert(iterHoleList == holeColl.GetHoles().end());
  // drop the last character (the trailing space)
  // NOTE(review): out.size()-1 wraps around if out is empty (startT > endT) - confirm callers never pass an empty span
  return out.erase(out.size()-1);
// Example #8
// Prepares the source side of a hierarchical rule without producing a string.
// Walks source positions startS..endS (inclusive) against the holes in
// source order (GetSortedSourceHoles): a position where the current hole
// starts gets the hole's source label and position set and the walk jumps to
// the hole's end; any other position is recorded in indexS.
//
//   sentence      supplies the source tree
//   startT, endT  not referenced in the visible lines
//   indexS        output: source sentence position -> rule position
//   holeColl      holes; each gets its source label and position set
//   labelIndex    selects which tree node label to use per hole
//
// `options` is not declared in the visible lines - presumably a file-level
// object; verify.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, and any statements advancing iterHoleList, holeCount and
// outPos (the final assert expects the iterator at end()). Restore from
// upstream before compiling.
void preprocessSourceHieroPhrase( SentenceAlignmentWithSyntax &sentence
                                  , int startT, int endT, int startS, int endS
                                  , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
  vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
  // callers must pass at least one hole
  assert(iterHoleList != holeColl.GetSortedSourceHoles().end());

  int outPos = 0;
  int holeCount = 0;
  int holeTotal = holeColl.GetHoles().size();
  for(int currPos = startS; currPos <= endS; currPos++) {
    // does the current hole start at this source position?
    bool isHole = false;
    if (iterHoleList != holeColl.GetSortedSourceHoles().end()) {
      const Hole &hole = **iterHoleList;
      isHole = hole.GetStart(0) == currPos;

    if (isHole) {
      Hole &hole = **iterHoleList;

      // the target-side functions in this file index hole labels at
      // 2+holeCount; the extra holeTotal offset presumably places source hole
      // labels after all target hole labels - confirm
      int labelI = labelIndex[ 2+holeCount+holeTotal ];
      string label = options.sourceSyntax ?
                     sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X";
      hole.SetLabel(label, 0);

      // skip the words covered by the hole (the loop's currPos++ moves past its end)
      currPos = hole.GetEnd(0);
      hole.SetPos(outPos, 0);
    } else {
      indexS[currPos] = outPos;


  assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
// Example #9
// this function is called recursively
// it pokes a new hole into the phrase pair, and then calls itself for more holes
//
//   sentence      word alignment (alignedToT) used for the aligned-word check
//   startT, endT  inclusive target span of the phrase pair
//   startS, endS  inclusive source span of the phrase pair
//   ruleExist     lookup: target span -> list of candidate source holes
//   holeColl      holes already placed; copied, never modified here
//   numHoles      number of holes already placed
//   initStartT    first target position at which a new hole may start
//   wordCountT/S  words currently remaining on the target/source side
//
// `options` is not declared in the visible lines - presumably a file-level
// object; verify.
//
// NOTE(review): this listing appears to have lost lines - the opening brace,
// the closing braces, and the body of every guard `if` below (presumably
// `return;` for the first one and `continue;` for the ones inside the loops).
// Restore from upstream before compiling.
void addHieroRule( SentenceAlignmentWithSyntax &sentence
                   , int startT, int endT, int startS, int endS
                   , RuleExist &ruleExist, const HoleCollection &holeColl
                   , int numHoles, int initStartT, int wordCountT, int wordCountS)
  // done, if already the maximum number of non-terminals in phrase pair
  if (numHoles >= options.maxNonTerm)

  // find a hole...
  for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) {
    for (int endHoleT = startHoleT+(options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) {
      // if last non-terminal, enforce word count limit
      // NOTE(review): this subtracts the span measured from startT, whereas
      // newWordCountT below measures from startHoleT - confirm this is intended
      if (numHoles == options.maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > options.maxSymbolsTarget)

      // determine the number of remaining target words
      const int newWordCountT = wordCountT - (endHoleT-startHoleT+1);

      // always enforce min word count limit
      if (newWordCountT < options.minWords)

      // except the whole span
      if (startHoleT == startT && endHoleT == endT)

      // does a phrase cover this target span?
      // if it does, then there should be a list of mapped source phrases
      // (multiple possible due to unaligned words)
      const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT);

      // loop over sub phrase pairs
      HoleList::const_iterator iterSourceHoles;
      for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) {
        const Hole &sourceHole = *iterSourceHoles;

        const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1;

        // enforce minimum hole size
        if (sourceHoleSize < options.minHoleSource)

        // determine the number of remaining source words
        const int newWordCountS = wordCountS - sourceHoleSize;

        // if last non-terminal, enforce word count limit
        if (numHoles == options.maxNonTerm-1 && newWordCountS + (numHoles+1) > options.maxSymbolsSource)

        // enforce min word count limit
        if (newWordCountS < options.minWords)

        // hole must be subphrase of the source phrase
        // (may be violated if subphrase contains additional unaligned source word)
        if (startS > sourceHole.GetStart(0) || endS <  sourceHole.GetEnd(0))

        // make sure the source side does not overlap with another hole
        // (the check is OverlapSource on the candidate source hole)
        if (holeColl.OverlapSource(sourceHole))

        // if consecutive non-terminals are not allowed, also check for source
        if (!options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) )

        // check that rule scope would not exceed limit if sourceHole
        // were added
        if (holeColl.Scope(sourceHole) > options.maxScope)

        // require that at least one aligned word is left (unless there are no words at all)
        if (options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) {
          HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
          bool foundAlignedWord = false;
          // loop through all word positions
          for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) {
            // new hole? moving on...
            if (pos == startHoleT) {
              pos = endHoleT;
            // covered by hole? moving on...
            // NOTE(review): no statement advancing iterHoleList is visible in this branch
            else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) {
              pos = iterHoleList->GetEnd(1);
            // covered by word? check if it is aligned
            else {
              if (sentence.alignedToT[pos].size() > 0)
                foundAlignedWord = true;
          if (!foundAlignedWord)

        // update list of holes in this phrase pair
        // (work on a copy so the caller's collection is untouched)
        HoleCollection copyHoleColl(holeColl);
        copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0));

        // now some checks that disallow this phrase pair, but not further recursion
        bool allowablePhrase = true;

        // maximum words count violation?
        if (newWordCountS + (numHoles+1) > options.maxSymbolsSource)
          allowablePhrase = false;

        if (newWordCountT + (numHoles+1) > options.maxSymbolsTarget)
          allowablePhrase = false;

        // passed all checks...
        if (allowablePhrase)
          printAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl);

        // recursively search for next hole
        // next hole may start right after this one only if consecutive target
        // non-terminals are allowed; otherwise one position is left between them
        int nextInitStartT = options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
        addHieroRule(sentence, startT, endT, startS, endS
                     , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT
                     , newWordCountT, newWordCountS);
// Assembles one ExtractedRule for the phrase pair [startT,endT] x
// [startS,endS] with the holes in holeColl and the label choice in labelIndex,
// then hands it to addRuleToCollection. Steps, in order: pick the phrase-level
// labels, label the source holes, build the target string (with PCFG score
// when enabled), build the source string, add the alignment, and - when
// m_options.flexScoreFlag is set - fill in the context words and hole strings.
//
//   labelIndex  slot 0 selects the target phrase label, slot 1 the source
//               phrase label; it is passed on to the per-hole helpers
//   countS      used for the boundary-rule test (endS == countS - 1) and
//               forwarded to saveTargetHieroPhrase
//
// NOTE(review): this listing appears to have lost lines - the opening brace
// and the closing braces of the function and of several if/else/for bodies.
// Restore from upstream before compiling.
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS
                                   , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
  WordIndex indexS, indexT; // to keep track of word positions in rule

  ExtractedRule rule( startT, endT, startS, endS );

  // phrase labels
  string targetLabel;
  if (m_options.targetSyntax) {
    targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
  } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
    // the phrase touches the first or last source position
    targetLabel = "S";
  } else {
    targetLabel = "X";

  string sourceLabel = m_options.sourceSyntax ?
                       m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";

  // create non-terms on the source side
  // (called without a sentence argument - presumably a member overload; verify)
  preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);

  // target
  if (m_options.pcfgScore) {
    // start from the phrase node's score; the helper updates it through the reference
    double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
    rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                  + " [" + targetLabel + "]";
    rule.pcfgScore = std::exp(logPCFGScore);
  } else {
    // score is required by the signature but not used afterwards in this branch
    double logPCFGScore = 0.0f;
    rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                  + " [" + targetLabel + "]";

  // source
  rule.source = saveSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex);
  // the source string ends with either the target or the source phrase label
  if (m_options.conditionOnTargetLhs) {
    rule.source += " [" + targetLabel + "]";
  } else {
    rule.source += " [" + sourceLabel + "]";

  // alignment
  saveHieroAlignment(startT, endT, startS, endS, indexS, indexT, holeColl, rule);

  // context (words to left and right)
  // NOTE(review): "<s>" is used for both the left and the right sentence
  // boundary - confirm the right side is not meant to be a different marker.
  // NOTE(review): endS+1 / endT+1 (int) are compared against size() (unsigned).
  if (m_options.flexScoreFlag) {
    rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1];
    rule.sourceContextRight = endS+1 == m_sentence.source.size() ? "<s>" : m_sentence.source[endS+1];
    rule.targetContextLeft = startT == 0 ? "<s>" : m_sentence.target[startT-1];
    rule.targetContextRight = endT+1 == m_sentence.target.size() ? "<s>" : m_sentence.target[endT+1];
    rule.sourceHoleString = "";
    rule.targetHoleString = "";

    HoleList::const_iterator iterHole;
    for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
      const Hole &hole = *iterHole;
      // each hole contributes "<label>: <words...> " to the hole strings
      rule.sourceHoleString += hole.GetLabel(0) + ": ";

      // rule starts with nonterminal; end of NT is considered left context
      if (hole.GetStart(0) == startS) {
        rule.sourceContextLeft = m_sentence.source[hole.GetEnd(0)];
      // rule ends with nonterminal; start of NT is considered right context
      else if (hole.GetEnd(0) == endS) {
        rule.sourceContextRight = m_sentence.source[hole.GetStart(0)];

      // same treatment for the target side
      if (hole.GetStart(1) == startT) {
        rule.targetContextLeft = m_sentence.target[hole.GetEnd(1)];
      } else if (hole.GetEnd(1) == endT) {
        rule.targetContextRight = m_sentence.target[hole.GetStart(1)];

      for (int i = hole.GetStart(0); i <= hole.GetEnd(0); ++i) {
        rule.sourceHoleString += m_sentence.source[i] + " ";
      rule.targetHoleString += hole.GetLabel(1) + ": ";
      for (int i = hole.GetStart(1); i <= hole.GetEnd(1); ++i) {
        rule.targetHoleString += m_sentence.target[i] + " ";

  addRuleToCollection( rule );