Exemple #1
0
bool sentenceHandler::readNextSentence(sentPair& sent)
  /* Reads the next sentence pair from the corpus file into `sent`.
     Each pair is stored in the corpus file as a triple of lines: the first
     line holds the number of times the pair occurred in the corpus, the
     second line is the source sentence and the third is the target sentence.
     Sentences are represented as space-separated positive integer token ids.

     NOTE(review): this excerpt ends once the count line has been processed;
     `fail` is declared but not used in the visible part. */
{

  string line;
  bool fail(false) ;
  
  // Start from an empty pair so nothing from a previous read leaks through.
  sent.clear();
  if (getline(*inputFile, line)){
    // First line of the triple: the occurrence count of this pair.
    istringstream buffer(line);
    buffer >> sent.noOcc;
    // A negative count is a marker: the effective weight (realCount) is taken
    // from somewhere other than the count line itself.
    if( sent.noOcc<0 )
      {
	if( realCount )
	  {
	    // Markers -1 and -2 map to Manlexfactor1 / Manlexfactor2, but only
	    // when the corresponding factor is non-zero.
	    if( Manlexfactor1 && sent.noOcc==-1.0 )
	      sent.realCount=Manlexfactor1;
	    else if( Manlexfactor2 && sent.noOcc==-2.0 )
	      sent.realCount=Manlexfactor2;
	    else
	      {
		// Any other negative marker: look the weight up by pair_no.
		sent.realCount=(*realCount)[pair_no];
	      }
	  }
	else
	  // No realCount table available: fall back to a weight of 1.
	  sent.realCount=1.0;
      }
    else
      // Non-negative count: the count itself is the weight.
      sent.realCount=sent.noOcc;
  }
Exemple #2
0
bool sentenceHandler::readNextSentence(sentPair& sent)
  /* Reads the next sentence pair from the corpus file into `sent`.
     Each pair is stored in the corpus file as a triple of lines: the first
     line holds the number of times the pair occurred in the corpus, the
     second line is the source sentence and the third is the target sentence.
     Sentences are represented as space-separated positive integer token ids.

     In this variant the count line may carry extra fields separated by any
     of the characters '|', '#' or '*'; see the split handling below.

     NOTE(review): this excerpt ends once the count line has been processed. */
{

  string line;
  bool fail(false) ;
  
  // Start from an empty pair so nothing from a previous read leaks through.
  sent.clear();
  vector<string> splits;
  if (getline(*inputFile, line)){

	  // Break the count line on any of '|', '#', '*'.
	  boost::algorithm::split(splits,line,boost::algorithm::is_any_of("|#*"));

	  if(splits.size() == 1 || splits.size() == 0){
		  // continue, no problem
		  
	  }else if(splits.size()==3){
		  // Three fields: only the first one is parsed as the count below.
		  line = splits[0];
	  }else{
		  // Any other field count is rejected; `fail` is set but the
		  // function returns immediately, so the flag is not read here.
		  fail = true;
		  return false;
	  }
	  
    // NOTE(review): istrstream comes from the deprecated <strstream> header;
    // consider istringstream if this code is ever modernised.
    istrstream buffer(line.c_str());
    buffer >> sent.noOcc;
    // A negative count is a marker: the effective weight (realCount) is taken
    // from somewhere other than the count line itself.
    if( sent.noOcc<0 )
      {
	if( realCount )
	  {
	    // Markers -1 and -2 map to Manlexfactor1 / Manlexfactor2, but only
	    // when the corresponding factor is non-zero.
	    if( Manlexfactor1 && sent.noOcc==-1.0 )
	      sent.realCount=Manlexfactor1;
	    else if( Manlexfactor2 && sent.noOcc==-2.0 )
	      sent.realCount=Manlexfactor2;
	    else
	      {
		// Any other negative marker: look the weight up by pair_no.
		sent.realCount=(*realCount)[pair_no];
	      }
	  }
	else
	  // No realCount table available: fall back to a weight of 1.
	  sent.realCount=1.0;
      }
    else
      // Non-negative count: the count itself is the weight.
      sent.realCount=sent.noOcc;
  }
// Loads the next sentence pair into `sp`, reading one entry from each source
// stream in fsFile. Returns false when the configured last pair has been
// passed (unless untilEnd is set), when a stream is not good(), or when a
// line cannot be read in the sInputType==0 mode.
// NOTE(review): this excerpt ends inside the per-file loop; tvoc, tInputType,
// sTagVoc and tTagVoc are not used in the visible part.
bool bilCorpus::getNextSentPair (sentPair & sp, vocTable & svoc, vocTable & tvoc, int sInputType,int tInputType, vocTable & sTagVoc, vocTable & tTagVoc){
  sp.clear();
  ++sentPairNum;
  // Stop once the requested range is exhausted, unless reading to end of file.
  if (!untilEnd && sentPairNum>lastSentPairNum){
    return false;
  }
  string line;
  // One level per source stream.
  sp.nlevels=fsFile.size();
  sp.initPhToWdMaps();
  for (vector<ifstream*>::size_type nf=0; nf<fsFile.size();++nf){
    if (! fsFile[nf]->good()){
      return false;
    }else{
      // read source sentence and split it
      if (sInputType==0){
	// One line per sentence: the first stream is loaded through
	// loadSrcWords, every further stream through loadSrcPhrases.
	if (nf==0 && getline(*fsFile[nf],line)) sp.loadSrcWords(line,svoc);
	else if (getline(*fsFile[nf],line)) sp.loadSrcPhrases(line,nf);
	else  return false;
      }else if (sInputType==1){
	if (nf==0){
	  // Multi-line record: collect every line whose first token is
	  // "align". Lines whose first token is "name" bump `stop`
	  // (-1 -> 0 -> 1); the loop condition fails once stop > 0, i.e.
	  // after a second "name" line has been seen.
	  vector<string> lines;
	  string buf;
	  int stop=-1;
	  // NOTE(review): filepos stays uninitialised if no non-"name" line
	  // is read before the loop ends, yet it is passed to seekg below.
	  int filepos;
	  while (getline(*fsFile[nf],line) && stop <=0){
	    buf=""; //if buf is not reset and "line" is empty, buf keeps its previous value
	    istringstream iss(line);
	    iss >> buf;
	    if (buf == "align"){lines.push_back(line);}
	    if (buf == "name"){++stop;}
	    // Remember the position just after the last non-"name" line.
	    else {filepos=fsFile[nf]->tellg();}
	  }
	  // getline runs before the stop test, so the loop reads past the
	  // second "name" line; seek back to the remembered position so the
	  // next call starts from there.
	  fsFile[nf]->seekg(filepos);
	  //	  for (vector<string>::const_iterator it=lines.begin();it!=lines.end();++it){cout<<"l:"<<*it<<endl;}
	  sp.loadSrcWords(lines,svoc);
	  //	  cout<<sp.src().print()<<endl;
	  //cout<<sp.srcAlUnits().print()<<endl;
	// NOTE(review): no line is read from fsFile[nf] in this branch, so
	// `line` still holds whatever an earlier iteration left in it.
	}else {sp.loadSrcPhrases(line,nf);}
      }
    }
Exemple #4
0
/* Hands the next buffered sentence pair to the caller; the whole call runs
   under readsent_mutex.

   When the buffer has been consumed and the corpus is not fully held in
   memory, the buffer is refilled with up to TRAIN_BUFFER_SIZE pairs obtained
   from readNextSentence(). While refilling:
     - a pair whose target length exceeds (MAX_FERTILITY-1) times its source
       length is reported on cerr and both sentences are cut to the shorter
       of the two sizes;
     - if both elist and flist are given, word frequencies are accumulated
       into every list that is non-empty; a word id that is not below
       uniqTokens() terminates the program via exit(-1).

   sent  - receives a copy of the next buffered pair.
   elist - source vocabulary list to update, may be null.
   flist - target vocabulary list to update, may be null.

   Returns the value of `position` after it has been incremented for the
   delivered pair, or 0 when no pair is delivered (end of corpus already
   flagged, in-memory corpus fully consumed, or refill produced nothing). */
int sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
{
    // Scope guard for readsent_mutex. The lock is dropped on every way out of
    // the function, including an exception thrown while refilling, and only
    // after the return value has been formed, so `position` is never read
    // outside the critical section.
    struct ScopedReadLock {
        explicit ScopedReadLock(pthread_mutex_t* m) : held(m) { pthread_mutex_lock(held); }
        ~ScopedReadLock() { pthread_mutex_unlock(held); }
    private:
        pthread_mutex_t* held;
        ScopedReadLock(const ScopedReadLock&);            // non-copyable
        ScopedReadLock& operator=(const ScopedReadLock&); // non-assignable
    };
    ScopedReadLock readLock(&readsent_mutex);

    // A previous call already ran off the end of the corpus.
    if (readflag){
        cerr << "Attempting to read from the end of corpus, rewinding\n";
        //rewind();
        return 0;
    }

    if (currentSentence >= noSentInBuffer){
        // Everything is in memory and has been handed out: nothing to refill.
        if (allInMemory)
            return 0;

        /* no more sentences in buffer */
        noSentInBuffer = 0 ;
        currentSentence = 0 ;
        Buffer.clear();
        cout << "Reading more sentence pairs into memory ... \n";

        sentPair s ;
        while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){
            if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){
                cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<<
                    "the maximum allowed limit for a source word fertility\n"<<
                    " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 <<
                    " ratio " << double(s.fSent.size()-1)/  (s.eSent.size()-1) << " ferility limit : " <<
                    MAX_FERTILITY-1 << '\n';
                cerr << "Shortening sentence \n";
                cerr << s;
                // Cut both sides down to the shorter of the two sizes.
                s.eSent.resize(min(s.eSent.size(),s.fSent.size()));
                s.fSent.resize(min(s.eSent.size(),s.fSent.size()));
            }
            Buffer.push_back(s) ;

            if (elist && flist){
                if ((*elist).size() > 0)
                    for (WordIndex i= 0 ; i < s.eSent.size() ; i++){
                        if (s.eSent[i] >= (*elist).uniqTokens()){
                            // The message is rate-limited, the exit is not.
                            if( PrintedTooLong++<100)
                                cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n";
                            exit(-1);
                        }
                        (*elist).incFreq(s.eSent[i], s.realCount);
                    }
                if ((*flist).size() > 0)
                    // Target side is counted from index 1 onwards.
                    for (WordIndex j= 1 ; j < s.fSent.size() ; j++){
                        if (s.fSent[j] >= (*flist).uniqTokens()){
                            cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n";
                            exit(-1);
                        }
                        (*flist).incFreq(s.fSent[j], s.realCount);
                    }
            }
            noSentInBuffer++;
        }

        // The input is exhausted and the buffer starts with sentence number 1:
        // the whole corpus is now resident, so later passes need no re-read.
        if (inputFile->eof()){
            allInMemory = (Buffer.size() >= 1 &&
                           Buffer[currentSentence].sentenceNo == 1) ;
            if (allInMemory)
                cout << "Corpus fits in memory, corpus has: " << Buffer.size() <<
                " sentence pairs.\n";
        }
    }

    // Refill produced nothing: flag end of corpus for subsequent calls.
    if(noSentInBuffer <= 0 ){
        //cerr << "# sent in buffer " << noSentInBuffer << '\n';
        readflag = true ;
        return 0;
    }

    sent = Buffer[currentSentence++] ;
    position ++;

    // Negative noOcc is a marker; resolve the effective weight for this pair.
    if( sent.noOcc<0 && realCount ){
        if( Manlexfactor1 && sent.noOcc==-1.0 )
            sent.realCount=Manlexfactor1;
        else if( Manlexfactor2 && sent.noOcc==-2.0 )
            sent.realCount=Manlexfactor2;
        else
            sent.realCount=(*realCount)[sent.getSentenceNo()-1];
    }

    // The return value is copied while readLock is still alive.
    return position ;
}