// sentenceHandler::readNextSentence -- start of the routine that reads the
// next sentence pair into `sent`.
//
// NOTE(review): only the beginning of this definition is visible here. The
// function body is not closed on this line, so the code that reads the
// source/target sentences and produces the return value is not shown.
//
// Visible behaviour:
//   * `sent` is cleared, then one line is read from `*inputFile`.
//   * The leading value of that line is extracted into `sent.noOcc`.
//   * If `sent.noOcc` is negative and `realCount` is null,
//     `sent.realCount` becomes 1.0.
//   * If `sent.noOcc` is negative and `realCount` is non-null,
//     `sent.realCount` becomes `Manlexfactor1` when that is non-zero and
//     noOcc == -1.0, else `Manlexfactor2` when that is non-zero and
//     noOcc == -2.0, else `(*realCount)[pair_no]`.
//   * Otherwise `sent.realCount` is set to `sent.noOcc`.
//   * `fail` is initialised to false and not touched in the visible part.
//   * Nothing in the visible part handles a failed getline().
bool sentenceHandler::readNextSentence(sentPair& sent) /* Reads in a new pair of sentences. Each pair is read from the corpus file as a line triple: the first line is the number of times this line pair occurred in the corpus, the second line is the source sentence and the third is the target sentence. The sentences are represented by space-separated positive integer token ids. */ { string line; bool fail(false) ; sent.clear(); if (getline(*inputFile, line)){ istringstream buffer(line); buffer >> sent.noOcc; if( sent.noOcc<0 ) { if( realCount ) { if( Manlexfactor1 && sent.noOcc==-1.0 ) sent.realCount=Manlexfactor1; else if( Manlexfactor2 && sent.noOcc==-2.0 ) sent.realCount=Manlexfactor2; else { sent.realCount=(*realCount)[pair_no]; } } else sent.realCount=1.0; } else sent.realCount=sent.noOcc; }
// sentenceHandler::readNextSentence -- start of the routine that reads the
// next sentence pair into `sent`; this variant tolerates extra fields on
// the count line.
//
// NOTE(review): only the beginning of this definition is visible here. The
// function body is not closed on this line, so the code that reads the
// source/target sentences and produces the final return value is not shown.
//
// NOTE(review): the whole fragment sits on one physical line, so the
// `// continue, no problem` comment lexically runs to the end of the line.
// The statements after it only take effect once the original line breaks
// are restored. The description below assumes those line breaks.
//
// Visible behaviour:
//   * `sent` is cleared, then one line is read from `*inputFile`.
//   * The line is split on any of the characters '|', '#', '*':
//       - 0 or 1 pieces: the line is used unchanged;
//       - exactly 3 pieces: only the first piece is kept as `line`;
//       - any other count: `fail` is set to true and the function
//         returns false.
//   * The leading value of `line` is extracted into `sent.noOcc`.
//   * If `sent.noOcc` is negative and `realCount` is null,
//     `sent.realCount` becomes 1.0.
//   * If `sent.noOcc` is negative and `realCount` is non-null,
//     `sent.realCount` becomes `Manlexfactor1` when that is non-zero and
//     noOcc == -1.0, else `Manlexfactor2` when that is non-zero and
//     noOcc == -2.0, else `(*realCount)[pair_no]`.
//   * Otherwise `sent.realCount` is set to `sent.noOcc`.
//
// NOTE(review): `fail` is a local that is only written immediately before
// `return false` in the visible part, so that store has no visible effect.
// NOTE(review): `istrstream` comes from the deprecated <strstream> header;
// `istringstream buffer(line)` would be the non-deprecated equivalent --
// confirm before changing.
bool sentenceHandler::readNextSentence(sentPair& sent) /* Reads in a new pair of sentences. Each pair is read from the corpus file as a line triple: the first line is the number of times this line pair occurred in the corpus, the second line is the source sentence and the third is the target sentence. The sentences are represented by space-separated positive integer token ids. */ { string line; bool fail(false) ; sent.clear(); vector<string> splits; if (getline(*inputFile, line)){ boost::algorithm::split(splits,line,boost::algorithm::is_any_of("|#*")); if(splits.size() == 1 || splits.size() == 0){ // continue, no problem }else if(splits.size()==3){ line = splits[0]; }else{ fail = true; return false; } istrstream buffer(line.c_str()); buffer >> sent.noOcc; if( sent.noOcc<0 ) { if( realCount ) { if( Manlexfactor1 && sent.noOcc==-1.0 ) sent.realCount=Manlexfactor1; else if( Manlexfactor2 && sent.noOcc==-2.0 ) sent.realCount=Manlexfactor2; else { sent.realCount=(*realCount)[pair_no]; } } else sent.realCount=1.0; } else sent.realCount=sent.noOcc; }
// bilCorpus::getNextSentPair -- start of the routine that loads the next
// sentence pair into `sp`; only the source-side reading is visible.
//
// NOTE(review): the definition is cut off inside the `for` loop over
// `fsFile`, so everything after the source-side reading (including the
// final return value) is not shown.
//
// NOTE(review): the whole fragment sits on one physical line, so each `//`
// comment lexically runs to the end of the line. The description below
// assumes the original line breaks.
//
// Visible behaviour:
//   * `sp` is cleared and `sentPairNum` is incremented. When `untilEnd` is
//     false and `sentPairNum` exceeds `lastSentPairNum`, returns false.
//   * `sp.nlevels` is set to `fsFile.size()` and `sp.initPhToWdMaps()` is
//     called.
//   * For each stream index `nf` in `fsFile`, returns false if the stream
//     is not good(); otherwise:
//       - sInputType == 0: reads one line; for nf == 0 it is passed to
//         `sp.loadSrcWords(line, svoc)`, for other nf to
//         `sp.loadSrcPhrases(line, nf)`; returns false if the read fails.
//       - sInputType == 1, nf == 0: reads lines while getline() succeeds
//         and `stop <= 0` (`stop` starts at -1). The first
//         whitespace-delimited token of each line decides its handling:
//         "align" lines are collected into `lines`, "name" lines increment
//         `stop`, and every line whose token is not "name" records the
//         current stream offset in `filepos`. After the loop the stream is
//         repositioned to `filepos` and `sp.loadSrcWords(lines, svoc)` is
//         called.
//       - sInputType == 1, nf != 0: calls `sp.loadSrcPhrases(line, nf)`
//         without reading a new line in that branch.
//
// NOTE(review): `filepos` is declared without an initialiser and assigned
// only for lines whose first token is not "name"; if no such line is read,
// `seekg(filepos)` uses an indeterminate value. It is also an `int`
// receiving the result of `tellg()`.
// NOTE(review): getline() is evaluated before `stop <= 0`, so one further
// line is consumed before the loop exits; the seekg() afterwards is what
// moves the stream back.
// NOTE(review): in the sInputType == 1, nf != 0 branch `line` holds
// whatever an earlier read left in it -- confirm this is intended.
bool bilCorpus::getNextSentPair (sentPair & sp, vocTable & svoc, vocTable & tvoc, int sInputType,int tInputType, vocTable & sTagVoc, vocTable & tTagVoc){ sp.clear(); ++sentPairNum; if (!untilEnd && sentPairNum>lastSentPairNum){ return false; } string line; sp.nlevels=fsFile.size(); sp.initPhToWdMaps(); for (vector<ifstream*>::size_type nf=0; nf<fsFile.size();++nf){ if (! fsFile[nf]->good()){ return false; }else{ // read source sentence and split it if (sInputType==0){ if (nf==0 && getline(*fsFile[nf],line)) sp.loadSrcWords(line,svoc); else if (getline(*fsFile[nf],line)) sp.loadSrcPhrases(line,nf); else return false; }else if (sInputType==1){ if (nf==0){ vector<string> lines; string buf; int stop=-1; int filepos; while (getline(*fsFile[nf],line) && stop <=0){ buf=""; //if buf is not reset and "line" is empty, buf keeps its previous value istringstream iss(line); iss >> buf; if (buf == "align"){lines.push_back(line);} if (buf == "name"){++stop;} else {filepos=fsFile[nf]->tellg();} } fsFile[nf]->seekg(filepos); // for (vector<string>::const_iterator it=lines.begin();it!=lines.end();++it){cout<<"l:"<<*it<<endl;} sp.loadSrcWords(lines,svoc); // cout<<sp.src().print()<<endl; //cout<<sp.srcAlUnits().print()<<endl; }else {sp.loadSrcPhrases(line,nf);} } }