Exemple #1
0
bool sentenceHandler::readNextSentence(sentPair& sent)
  /* Reads the next sentence pair from the corpus file. Each pair is stored
     as a triple of lines: the first line is the number of times the pair
     occurred in the corpus, the second line is the source sentence and the
     third line is the target sentence. Sentences are represented as
     space-separated positive integer token ids.
     Only the handling of the first (count) line is visible in this excerpt:
     it fills sent.noOcc and derives sent.realCount from it. */
{

  string line;
  bool fail(false) ;  // not referenced in the visible part of this function
  
  sent.clear();
  if (getline(*inputFile, line)){
    // Parse the occurrence count from the first line of the triple.
    // NOTE(review): the result of the extraction is not checked here.
    istringstream buffer(line);
    buffer >> sent.noOcc;
    if( sent.noOcc<0 )
      {
	// A negative count is never copied into sent.realCount; the value is
	// taken from one of the sources below instead.
	if( realCount )
	  {
	    // -1.0 selects Manlexfactor1, but only when that factor is non-zero.
	    if( Manlexfactor1 && sent.noOcc==-1.0 )
	      sent.realCount=Manlexfactor1;
	    // -2.0 selects Manlexfactor2, but only when that factor is non-zero.
	    else if( Manlexfactor2 && sent.noOcc==-2.0 )
	      sent.realCount=Manlexfactor2;
	    else
	      {
		// Any other negative count: look up the entry for the current
		// pair index in the table that realCount points to.
		sent.realCount=(*realCount)[pair_no];
	      }
	  }
	else
	  // realCount is null: fall back to a count of 1.
	  sent.realCount=1.0;
      }
    else
      // Non-negative counts are used as they are.
      sent.realCount=sent.noOcc;
  }
Exemple #2
0
bool sentenceHandler::readNextSentence(sentPair& sent)
  /* Reads the next sentence pair from the corpus file. Each pair is stored
     as a triple of lines: the first line is the number of times the pair
     occurred in the corpus, the second line is the source sentence and the
     third line is the target sentence. Sentences are represented as
     space-separated positive integer token ids.
     In this variant the count line may also consist of three fields
     separated by '|', '#' or '*'; only the first field is then parsed as the
     count. Only the handling of the count line is visible in this excerpt:
     it fills sent.noOcc and derives sent.realCount from it. */
{

  string line;
  bool fail(false) ;
  
  sent.clear();
  vector<string> splits;
  if (getline(*inputFile, line)){

	  // Cut the count line at every '|', '#' or '*' character.
	  boost::algorithm::split(splits,line,boost::algorithm::is_any_of("|#*"));

	  if(splits.size() == 1 || splits.size() == 0){
		  // continue, no problem
		  // (no delimiter found: the whole line is parsed as the count)
		  
	  }else if(splits.size()==3){
		  // Three fields: keep only the first one for the count parsing below.
		  line = splits[0];
	  }else{
		  // Any other number of fields is rejected.
		  // NOTE(review): fail is a local, so setting it right before the
		  // return has no effect visible here.
		  fail = true;
		  return false;
	  }
	  
    // Parse the occurrence count from the (possibly shortened) line.
    // NOTE(review): istrstream belongs to the deprecated <strstream> header;
    // the extraction result is not checked here.
    istrstream buffer(line.c_str());
    buffer >> sent.noOcc;
    if( sent.noOcc<0 )
      {
	// A negative count is never copied into sent.realCount; the value is
	// taken from one of the sources below instead.
	if( realCount )
	  {
	    // -1.0 selects Manlexfactor1, but only when that factor is non-zero.
	    if( Manlexfactor1 && sent.noOcc==-1.0 )
	      sent.realCount=Manlexfactor1;
	    // -2.0 selects Manlexfactor2, but only when that factor is non-zero.
	    else if( Manlexfactor2 && sent.noOcc==-2.0 )
	      sent.realCount=Manlexfactor2;
	    else
	      {
		// Any other negative count: look up the entry for the current
		// pair index in the table that realCount points to.
		sent.realCount=(*realCount)[pair_no];
	      }
	  }
	else
	  // realCount is null: fall back to a count of 1.
	  sent.realCount=1.0;
      }
    else
      // Non-negative counts are used as they are.
      sent.realCount=sent.noOcc;
  }
bool bilCorpus::getNextSentPair (sentPair & sp, vocTable & svoc, vocTable & tvoc, int sInputType,int tInputType, vocTable & sTagVoc, vocTable & tTagVoc){
  sp.clear();
  ++sentPairNum;
  if (!untilEnd && sentPairNum>lastSentPairNum){
    return false;
  }
  string line;
  sp.nlevels=fsFile.size();
  sp.initPhToWdMaps();
  for (vector<ifstream*>::size_type nf=0; nf<fsFile.size();++nf){
    if (! fsFile[nf]->good()){
      return false;
    }else{
      // read source sentence and split it
      if (sInputType==0){
	if (nf==0 && getline(*fsFile[nf],line)) sp.loadSrcWords(line,svoc);
	else if (getline(*fsFile[nf],line)) sp.loadSrcPhrases(line,nf);
	else  return false;
      }else if (sInputType==1){
	if (nf==0){
	  vector<string> lines;
	  string buf;
	  int stop=-1;
	  int filepos;
	  while (getline(*fsFile[nf],line) && stop <=0){
	    buf=""; //if buf is not reset and "line" is empty, buf keeps its previous value
	    istringstream iss(line);
	    iss >> buf;
	    if (buf == "align"){lines.push_back(line);}
	    if (buf == "name"){++stop;}
	    else {filepos=fsFile[nf]->tellg();}
	  }
	  fsFile[nf]->seekg(filepos);
	  //	  for (vector<string>::const_iterator it=lines.begin();it!=lines.end();++it){cout<<"l:"<<*it<<endl;}
	  sp.loadSrcWords(lines,svoc);
	  //	  cout<<sp.src().print()<<endl;
	  //cout<<sp.srcAlUnits().print()<<endl;
	}else {sp.loadSrcPhrases(line,nf);}
      }
    }