Exemple #1
0
/**
 * Copies the next buffered sentence pair into `sent`, refilling the in-memory
 * buffer through readNextSentence() once every buffered pair has been handed
 * out.
 *
 * The whole call runs with `readsent_mutex` held. The mutex is released by a
 * scoped guard, so it is dropped on every exit path (including an exception
 * thrown while refilling the buffer) and only after the return value has been
 * read from `position`.
 *
 * @param sent   Receives a copy of the next sentence pair. Left untouched
 *               when the function returns 0.
 * @param elist  Optional vocabulary list for `eSent` word ids. Only used when
 *               `flist` is non-null as well.
 * @param flist  Optional vocabulary list for `fSent` word ids. Only used when
 *               `elist` is non-null as well.
 *               When both lists are given, every pair newly read into the
 *               buffer adds its `realCount` to the frequency of each of its
 *               words; a word id that is >= uniqTokens() terminates the
 *               process with exit(-1).
 * @return The incremented `position` counter on success, or 0 when no
 *         sentence could be delivered (end of corpus already reported, the
 *         fully cached corpus has been consumed, or nothing could be read).
 */
int sentenceHandler::getNextSentence(sentPair& sent, vcbList* elist, vcbList* flist)
{
    // Function-local scoped lock: acquires in the constructor and releases in
    // the destructor, replacing the hand-written unlock on each exit path.
    struct ScopedMutexLock {
        pthread_mutex_t* mutex;
        explicit ScopedMutexLock(pthread_mutex_t* m) : mutex(m) { pthread_mutex_lock(mutex); }
        ~ScopedMutexLock() { pthread_mutex_unlock(mutex); }
    private:
        // Not copyable: a copy would unlock the same mutex twice.
        ScopedMutexLock(const ScopedMutexLock&);
        ScopedMutexLock& operator=(const ScopedMutexLock&);
    };
    ScopedMutexLock guard(&readsent_mutex);

    if (readflag) {
        // The end of the corpus was already reported by an earlier call.
        // Note that no rewind actually happens here (the call is disabled).
        cerr << "Attempting to read from the end of corpus, rewinding\n";
        //rewind();
        return 0;
    }

    if (currentSentence >= noSentInBuffer) {
        // Every buffered pair has been handed out. If the buffer holds the
        // whole corpus there is nothing further to read.
        if (allInMemory)
            return 0;

        /* no more sentences in buffer */
        noSentInBuffer = 0;
        currentSentence = 0;
        Buffer.clear();
        cout << "Reading more sentence pairs into memory ... \n";

        sentPair s;
        while ((noSentInBuffer < TRAIN_BUFFER_SIZE) && readNextSentence(s)) {
            // Length check against the fertility limit: when fSent is too long
            // relative to eSent, warn and cut both down to the shorter length.
            if ((s.fSent.size()-1) > (MAX_FERTILITY-1) * (s.eSent.size()-1)) {
                cerr << "WARNING: The following sentence pair has source/target sentence length ration more than\n"<<
                    "the maximum allowed limit for a source word fertility\n"<<
                    " source length = " << s.eSent.size()-1 << " target length = " << s.fSent.size()-1 <<
                    " ratio " << double(s.fSent.size()-1)/  (s.eSent.size()-1) << " ferility limit : " <<
                    MAX_FERTILITY-1 << '\n';
                cerr << "Shortening sentence \n";
                cerr << s;
                s.eSent.resize(min(s.eSent.size(),s.fSent.size()));
                s.fSent.resize(min(s.eSent.size(),s.fSent.size()));
            }
            Buffer.push_back(s);

            if (elist && flist) {
                if ((*elist).size() > 0) {
                    // eSent is scanned from index 0.
                    for (WordIndex i = 0; i < s.eSent.size(); i++) {
                        if (s.eSent[i] >= (*elist).uniqTokens()) {
                            // Only the message is rate-limited by
                            // PrintedTooLong; the exit is unconditional.
                            if (PrintedTooLong++ < 100) {
                                cerr << "ERROR: source word " << s.eSent[i] << " is not in the vocabulary list \n";
                            }
                            exit(-1);
                        }
                        (*elist).incFreq(s.eSent[i], s.realCount);
                    }
                }
                if ((*flist).size() > 0) {
                    // fSent is scanned from index 1; presumably index 0 is a
                    // placeholder entry -- TODO confirm against sentPair.
                    for (WordIndex j = 1; j < s.fSent.size(); j++) {
                        if (s.fSent[j] >= (*flist).uniqTokens()) {
                            cerr << "ERROR: target word " << s.fSent[j] << " is not in the vocabulary list \n";
                            exit(-1);
                        }
                        (*flist).incFreq(s.fSent[j], s.realCount);
                    }
                }
            }
            noSentInBuffer++;
        }

        if (inputFile->eof()) {
            // The corpus is fully cached only if this single fill reached end
            // of file starting from sentence number 1.
            allInMemory = (Buffer.size() >= 1 &&
                           Buffer[currentSentence].sentenceNo == 1);
            if (allInMemory)
                cout << "Corpus fits in memory, corpus has: " << Buffer.size() <<
                " sentence pairs.\n";
        }
    }

    if (noSentInBuffer <= 0) {
        // Nothing is buffered: remember that the end was reached so the next
        // call reports it.
        readflag = true;
        return 0;
    }

    sent = Buffer[currentSentence++];
    position++;

    // A negative noOcc selects an alternative count for this pair, provided a
    // realCount table is attached: -1 / -2 pick the matching non-zero
    // Manlexfactor, anything else is looked up by sentence number.
    if (sent.noOcc < 0 && realCount) {
        if (Manlexfactor1 && sent.noOcc == -1.0)
            sent.realCount = Manlexfactor1;
        else if (Manlexfactor2 && sent.noOcc == -2.0)
            sent.realCount = Manlexfactor2;
        else
            sent.realCount = (*realCount)[sent.getSentenceNo()-1];
    }

    // `position` is read here while the lock is still held; the guard only
    // unlocks after the return value has been evaluated.
    return position;
}