int sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist) { pthread_mutex_lock(&readsent_mutex); do{ sentPair s ; if (readflag){ cerr << "Attempting to read from the end of corpus, rewinding\n"; //rewind(); break; } if (currentSentence >= noSentInBuffer){ if (allInMemory) break; /* no more sentences in buffer */ noSentInBuffer = 0 ; currentSentence = 0 ; Buffer.clear(); cout << "Reading more sentence pairs into memory ... \n"; while((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)){ if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)){ cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<< "the maximum allowed limit for a source word fertility\n"<< " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 << " ratio " << double(s.fSent.size()-1)/ (s.eSent.size()-1) << " ferility limit : " << MAX_FERTILITY-1 << '\n'; cerr << "Shortening sentence \n"; cerr << s; s.eSent.resize(min(s.eSent.size(),s.fSent.size())); s.fSent.resize(min(s.eSent.size(),s.fSent.size())); } Buffer.push_back(s) ; if (elist && flist){ if ((*elist).size() > 0) for (WordIndex i= 0 ; i < s.eSent.size() ; i++){ if (s.eSent[i] >= (*elist).uniqTokens()){ if( PrintedTooLong++<100) cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n"; exit(-1); } (*elist).incFreq(s.eSent[i], s.realCount); } if ((*flist).size() > 0) for (WordIndex j= 1 ; j < s.fSent.size() ; j++){ if (s.fSent[j] >= (*flist).uniqTokens()){ cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n"; exit(-1); } (*flist).incFreq(s.fSent[j], s.realCount); } } noSentInBuffer++; } if (inputFile->eof()){ allInMemory = (Buffer.size() >= 1 && Buffer[currentSentence].sentenceNo == 1) ; if (allInMemory) cout << "Corpus fits in memory, corpus has: " << Buffer.size() << " sentence pairs.\n"; } } if(noSentInBuffer <= 0 ){ //cerr << "# sent in buffer " << noSentInBuffer << '\n'; readflag = true ; break; } sent = Buffer[currentSentence++] ; position ++; if( sent.noOcc<0 && realCount ){ if( Manlexfactor1 && sent.noOcc==-1.0 ) sent.realCount=Manlexfactor1; else if( Manlexfactor2 && sent.noOcc==-2.0 ) sent.realCount=Manlexfactor2; else sent.realCount=(*realCount)[sent.getSentenceNo()-1]; } pthread_mutex_unlock(&readsent_mutex); return position ; }while(false); pthread_mutex_unlock(&readsent_mutex); return 0; }