Ejemplo n.º 1
0
/**
 * Load an n-gram language model from a text file.
 *
 * The file is read line by line using the section markers handled below:
 * a `\data\` header containing `ngram N=COUNT` lines, followed by
 * `\N-grams:` sections (N = 1..9) whose entries are tab-separated:
 *
 *     <log10 probability> TAB <n-gram text> [TAB <log10 backoff weight>]
 *
 * All log10 values are converted to natural logarithms before being stored
 * in `ngrams` / `backoff`. The entry whose text is exactly "<unk>" is stored
 * under UNKPATTERN; any other entry that encodes to UNKPATTERN is dropped.
 * After loading, `order` holds the number of the last section seen.
 *
 * The process is terminated with exit(3) when the file cannot be opened or
 * when the model contains no "<unk>" entry.
 *
 * @param filename      Path of the language model file.
 * @param encoder       Encoder used to turn n-gram text into a Pattern.
 * @param classdecoder  Stored in the object; not used during loading.
 * @param debug         When true, every stored entry is echoed to stderr.
 */
LanguageModel::LanguageModel(const std::string & filename, ClassEncoder & encoder, ClassDecoder * classdecoder, bool debug) {
    this->DEBUG = debug;
    this->classdecoder = classdecoder;
    order = 0;
    bool hasunk = false;

    ifstream f;
    f.open(filename.c_str(), ios::in);
    if ((!f) || (!f.good())) {
       cerr << "File does not exist: " << filename << endl;
       exit(3);
    }

    // Multiplying a log10 value by ln(10) yields the natural logarithm.
    const double log10_to_ln = log(10);

    string line;
    // Looping on getline() itself (rather than on eof()) also terminates when
    // the stream fails for a reason other than reaching the end of the file.
    while (getline(f, line)) {
        // Tolerate CRLF line endings; otherwise no section marker would match.
        if ((!line.empty()) && (line[line.size() - 1] == '\r')) line.erase(line.size() - 1);
        if (line.empty()) continue;

        if (line == "\\data\\") {
            order = 0;
            continue;
        }
        if (line == "\\end\\") {
            // End-of-model marker: nothing to store, and `order` must keep the
            // number of the last n-gram section.
            continue;
        }
        // Section marker "\N-grams:" for a single digit N in 1..9.
        if ((line.size() == 9) && (line[0] == '\\') && (line[1] >= '1') && (line[1] <= '9') && (line.compare(2, 7, "-grams:") == 0)) {
            order = line[1] - '0';
            continue;
        }

        if (order == 0) {
            // Header section: only "ngram N=COUNT" lines carry information.
            if (line.compare(0, 5, "ngram") == 0) {
                // Expect a single-digit order at index 6 and '=' at index 7;
                // anything else would previously throw or store a bogus count.
                const bool wellformed = (line.size() >= 8) && (line[6] >= '0') && (line[6] <= '9') && (line[7] == '=');
                if (wellformed) {
                    const int n = line[6] - '0';
                    const int v = atoi(line.substr(8).c_str());
                    total[n] = v;
                } else {
                    cerr << "WARNING: Ignoring line: " << line << endl;
                }
            }
            continue;
        }

        // N-gram entry: split into at most three tab-separated fields; any
        // further fields are ignored.
        string logprob_s;
        string ngramcontent;
        string backofflogprob_s;
        const string::size_type tab1 = line.find('\t');
        if (tab1 == string::npos) {
            logprob_s = line;
        } else {
            logprob_s = line.substr(0, tab1);
            const string::size_type tab2 = line.find('\t', tab1 + 1);
            if (tab2 == string::npos) {
                ngramcontent = line.substr(tab1 + 1);
            } else {
                ngramcontent = line.substr(tab1 + 1, tab2 - tab1 - 1);
                const string::size_type tab3 = line.find('\t', tab2 + 1);
                if (tab3 == string::npos) {
                    backofflogprob_s = line.substr(tab2 + 1);
                } else {
                    backofflogprob_s = line.substr(tab2 + 1, tab3 - tab2 - 1);
                }
            }
        }

        if (logprob_s.empty() || ngramcontent.empty()) {
            cerr << "WARNING: Ignoring line: " << line << endl;
            continue;
        }

        if (ngramcontent == "<unk>") {
            ngrams[UNKPATTERN] = atof(logprob_s.c_str()) * log10_to_ln;
            hasunk = true;
            if (DEBUG) {
                cerr << " Adding UNKNOWN to LM: " << (int) UNKPATTERN.n() << "\t" <<  ngramcontent << "\t" << ngrams[UNKPATTERN] << endl;
            }
            continue;
        }

        Pattern ngram = encoder.buildpattern(ngramcontent);
        if (ngram != UNKPATTERN) {
            ngrams[ngram] = atof(logprob_s.c_str()) * log10_to_ln;
            if (!backofflogprob_s.empty()) {
                backoff[ngram] = atof(backofflogprob_s.c_str()) * log10_to_ln;
                if (DEBUG) cerr << " Adding to LM: " << (int) ngram.n() << "\t" <<  ngramcontent << "\t" << ngrams[ngram] << "\t" << backoff[ngram] << endl;
            } else {
                if (DEBUG) cerr << " Adding to LM: " << (int) ngram.n() << "\t" << ngramcontent << "\t" << ngrams[ngram] << endl;
            }
        }
    }
    f.close();

    if (!hasunk) {
        cerr << "ERROR: Language Model has no value <unk>, make sure to generate SRILM model with -unk parameter" << endl;
        exit(3);
    }
}
/**
 * Load a Moses-style phrase table into an alignment model.
 *
 * Each non-empty line is expected to look like
 *
 *     source ||| target ||| score score ... [||| further fields]
 *
 * Fields are separated by the literal " ||| "; fields after the scores are
 * ignored. Entries are buffered per source phrase and handed to model.add()
 * whenever the source phrase changes (and once more at end of input), so
 * that pruning relative to the best score can look at all entries of one
 * source phrase together. This relies on equal source phrases being on
 * consecutive lines.
 *
 * @param model                Model that receives the accepted entries.
 * @param filename             Phrase table path; a name ending in ".gz" is
 *                             read through igzstream.
 * @param sourceencoder        Encoder for the source phrase text.
 * @param targetencoder        Encoder for the target phrase text.
 * @param constrainsourcemodel If not NULL, only source patterns contained in
 *                             this model are loaded.
 * @param constraintargetmodel If not NULL, only target patterns contained in
 *                             this model are loaded.
 * @param max_sourcen          If > 0, source patterns with more than this
 *                             many tokens are skipped.
 * @param pts                  If > 0, minimum value of scores[ptsfield].
 * @param pst                  If > 0, minimum value of scores[pstfield].
 * @param joinedthreshold      If > 0, minimum value of
 *                             scores[ptsfield] * scores[pstfield].
 * @param divergencefrombestthreshold
 *                             If > 0, an entry is kept only when its
 *                             scores[ptsfield] is at least this fraction of
 *                             the best scores[ptsfield] for the same source.
 * @param delimiter            NOTE(review): not referenced by the body; the
 *                             separator is fixed to " ||| ".
 * @param score_column         NOTE(review): not referenced by the body; the
 *                             scores are always taken from the third field.
 * @param pstfield             Index into the score list used for `pst`.
 * @param ptsfield             Index into the score list used for `pts`.
 * @param maxscores            A warning is printed when a line has more
 *                             scores than this; the line is still loaded.
 *
 * Terminates the process with exit(2) when the file cannot be opened.
 * Progress and statistics are written to stderr.
 */
void loadmosesphrasetable(PatternAlignmentModel<double> & model,  const std::string & filename, ClassEncoder & sourceencoder, ClassEncoder & targetencoder, PatternSetModel * constrainsourcemodel = NULL, PatternSetModel * constraintargetmodel = NULL, int max_sourcen =0, const double pts=0, const double pst=0, const double joinedthreshold=0, const double divergencefrombestthreshold=0.0, const std::string delimiter = "|||", const int score_column=3, const int pstfield = 0, const int ptsfield=2, const int maxscores = 10)
  {
    unsigned int added = 0;
    unsigned int skipped = 0;
    unsigned int constrained = 0;
    unsigned int count = 0;

    // Collect the first token of every pattern in the source constraint model.
    // A source phrase whose first token is not in this set can be discarded
    // (together with every following line starting with the same word)
    // without encoding it.
    PatternSetModel firstwords;
    if (constrainsourcemodel != NULL) {
        cerr << "(Inferring extra contraints from source model, for faster discarding of patterns)" << endl;
        for (PatternSetModel::iterator iter = constrainsourcemodel->begin(); iter != constrainsourcemodel->end(); iter++) {
            const Pattern pattern = *iter;
            const Pattern firstword = pattern[0];
            firstwords.insert(firstword);
        }
        cerr << "(added " << firstwords.size() << " unigrams)" << endl;
    }

    // Open the phrase table. The size check avoids an out_of_range exception
    // for filenames shorter than the ".gz" suffix.
    istream * f = NULL;
    const bool gzipped = (filename.size() >= 3) && (filename.compare(filename.size() - 3, 3, ".gz") == 0);
    if (gzipped) {
        cerr << "(Reading from gzip)" << endl;
        f = new igzstream(filename.c_str(), ios::in | ios::binary);
    } else {
        f = new ifstream(filename.c_str(), ios::in | ios::binary);
    }
    if (!f->good()) {
       cerr << "File does not exist: " << filename << endl;
       delete f;
       exit(2);
    }

    const string fieldsep = " ||| ";

    vector<BufferItem> buffer;   // accepted entries of the current source phrase
    bool updated = false;        // whether anything was added since the last progress mark

    // Hand all buffered entries to the model (applying the relative-to-best
    // pruning when enabled) and empty the buffer.
    auto flushbuffer = [&]() {
        if (buffer.empty()) return;
        if (divergencefrombestthreshold > 0) {
            double bestscore = 0;
            for (auto & item : buffer) {
                if (item.scores[ptsfield] > bestscore) bestscore = item.scores[ptsfield];
            }
            const double minscore = bestscore * divergencefrombestthreshold;
            for (auto & item : buffer) {
                if (item.scores[ptsfield] >= minscore) {
                    model.add( item.sourcepattern, item.targetpattern, item.scores, false );
                    added++;
                    updated = true;
                } else {
                    skipped++;
                }
            }
        } else {
            for (auto & item : buffer) {
                model.add( item.sourcepattern, item.targetpattern, item.scores, false );
                added++;
                updated = true;
            }
        }
        buffer.clear();
    };

    // Which score columns are actually going to be read for this call.
    const bool needpst = (pst > 0) || (joinedthreshold > 0);
    const bool needpts = (pts > 0) || (joinedthreshold > 0) || (divergencefrombestthreshold > 0);

    bool skipsamesource = false; // skip following lines that repeat prevsource
    string prevsource;           // source phrase of the most recently processed line
    string skipfirstword;        // skip following lines whose source starts with this word

    string line;
    string source;
    string target;
    string scores_s;
    string firstword;
    vector<double> scores;

    // Looping on getline() itself keeps the line count exact and terminates
    // when the stream fails without reaching EOF.
    while (getline(*f, line)) {
        count++;
        if (count % 100000 == 0) {
            cerr << endl;
            cerr <<  "Loading and encoding phrase-table: @" << count << " total added: " << added  << ", skipped because of threshold: " << skipped << ", skipped because of constraints: " << constrained;            
        }
        if (count % 1000 == 0) {
            if (updated) {
                cerr << ":";
            } else {
                cerr << ".";
            }
            updated = false;
        }
        if (line.empty()) continue;

        // --- split the line into source / target / scores ---
        source.clear();
        target.clear();
        scores_s.clear();
        firstword.clear();
        int mode = 0;                    // number of separators seen so far
        bool skipline = false;           // source starts with the word being skipped
        string::size_type begin = 0;     // start of the current field
        string::size_type searchfrom = 0;
        string::size_type sep;
        while ((sep = line.find(fieldsep, searchfrom)) != string::npos) {
            // Two separators may share a space (" ||| ||| "); the field between
            // them is then empty rather than of negative length.
            const string::size_type fieldlen = (sep > begin) ? (sep - begin) : 0;
            if (mode == 0) {
                source = line.substr(begin, fieldlen);
                firstword = source.substr(0, source.find(' '));
                if (firstword == skipfirstword) {
                    skipline = true;
                    break;
                }
            } else if (mode == 1) {
                target = line.substr(begin, fieldlen);
            } else if (mode == 2) {
                scores_s = line.substr(begin, fieldlen);
            }
            begin = sep + fieldsep.size();
            searchfrom = sep + 1;
            mode++;
        }
        if (mode == 2) {
            // The scores are the last field on the line.
            scores_s = line.substr(begin);
            mode++;
        }

        if (skipline) {
            constrained++;
            continue;
        }
        if (mode < 3) {
            cerr << endl << "WARNING: Error in input format, line " << count << " (length=" << line.size() << "): " << endl;
            cerr << line << endl;
            cerr << "SKIPPING..." << endl;
            continue; // without this the line would be processed with no scores
        }
        if ((skipsamesource) && (source == prevsource)) {
            constrained++;
            continue;
        }
        skipsamesource = false;
        skipfirstword = "";

        // --- parse the space-separated scores; empty tokens are ignored ---
        scores.clear();
        scores_s += " ";
        string::size_type scorebegin = 0;
        for (string::size_type i = 0; i < scores_s.size(); i++) {
            if (scores_s[i] == ' ') {
                if (i > scorebegin) {
                    scores.push_back(atof(scores_s.substr(scorebegin, i - scorebegin).c_str()));
                }
                scorebegin = i + 1;
            }
        }

        // Refuse lines that lack a score column that is about to be read.
        const bool pstmissing = (pstfield < 0) || (static_cast<vector<double>::size_type>(pstfield) >= scores.size());
        const bool ptsmissing = (ptsfield < 0) || (static_cast<vector<double>::size_type>(ptsfield) >= scores.size());
        if ((needpst && pstmissing) || (needpts && ptsmissing)) {
            cerr << endl << "WARNING: Too few scores on line " << count << ", skipping: " << line << endl;
            continue;
        }

        // --- a new source phrase starts: flush what was buffered for the previous one ---
        if (source != prevsource) {
            if (buffer.size() >= 100) {
                cerr << "!";
            }
            flushbuffer();
            prevsource = source;
        }

        //check score threshold
        if (  ((pst > 0) && (scores[pstfield] < pst))
            || ((pts > 0) && (scores[ptsfield] < pts))
            || ((joinedthreshold > 0) && (scores[ptsfield] * scores[pstfield] < joinedthreshold))
        ) {
            skipped++;
            continue;
        }

        //add to phrasetable
        try {
            Pattern sourcepattern = sourceencoder.buildpattern(source);

            if ((constrainsourcemodel != NULL) && (!constrainsourcemodel->has(sourcepattern))) {
                // If the rejected source is a single word, or its first word does
                // not start any pattern of the constraint model, every following
                // line beginning with that word can be skipped as well.
                const int sourcelength = sourcepattern.n();
                if ((sourcelength == 1) || (!firstwords.has(sourcepattern[0]))) {
                    skipfirstword = firstword;
                }
                constrained++;
                skipsamesource = true;
                continue;
            }

            if ((max_sourcen > 0) && (sourcepattern.n() > max_sourcen)) {
                skipped++;
                skipsamesource = true;
                continue;
            }

            Pattern targetpattern = targetencoder.buildpattern(target);

            if ((constraintargetmodel != NULL) && (!constraintargetmodel->has(targetpattern))) {
                constrained++;
                continue;
            }

            if (scores.size() > static_cast<vector<double>::size_type>(maxscores)) {
                cerr << endl << "*** WARNING: Unexpectedly large number of scores in line " << count << ", something wrong? ***" << endl;
            }

            buffer.push_back( BufferItem(sourcepattern, targetpattern, scores) );
        } catch (const UnknownTokenError &e) {
            cerr << endl << "*** WARNING: UnknownTokenError in encoding of source or target fragment on line " << count << " skipping: " << source << " ||| " << target <<endl;
        }
    }

    //don't forget last one in buffer:
    flushbuffer();

    delete f; // the stream was allocated with new above

    cerr  << "Read " << count << " lines" << endl;
    cerr << "Added: " << added << " -- skipped due to threshold: " << skipped << " -- skipped by constraint: " << constrained << endl;
    cerr << "Source patterns: " << model.size() <<  endl;
}