Model* TableCategorical::ReadModel(ByteReader* byte_reader,
                                   const Schema& schema, size_t index) {
  size_t predictor_size = byte_reader->ReadByte();
  size_t cell_size = byte_reader->ReadByte();
  std::vector<size_t> predictor_list;
  for (size_t i = 0; i < predictor_size; ++i) {
    size_t pred = byte_reader->Read16Bit();
    predictor_list.push_back(pred);
  }
  // Set err to 0 because err is only used in training.
  TableCategorical* model =
      new TableCategorical(schema, predictor_list, index, 0);
  size_t target_range = byte_reader->Read16Bit();
  model->target_range_ = target_range;

  // Read model parameters.
  size_t table_size = model->dynamic_list_.size();
  for (size_t i = 0; i < table_size; ++i) {
    std::vector<Prob>& prob_segs = model->dynamic_list_[i].prob;
    prob_segs.resize(target_range - 1);
    for (size_t j = 0; j < prob_segs.size(); ++j) {
      if (cell_size == 16) {
        prob_segs[j] = GetProb(byte_reader->Read16Bit(), 16);
      } else {
        prob_segs[j] = GetProb(byte_reader->ReadByte(), 8);
      }
    }
  }
  return model;
}
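The GetProb(raw, bits) helper is not shown here; judging by the call sites, it converts a raw integer read from the byte stream back into a Prob. A minimal sketch of such a fixed-point decoder, assuming Prob is a floating-point alias and the full bits-wide integer range maps onto [0, 1) (the real definition lives elsewhere in the codebase and may differ):

// Hypothetical sketch only: the actual GetProb may use a dedicated
// fixed-point Prob type rather than the double alias assumed here.
typedef double Prob;

inline Prob GetProb(unsigned long raw, int bits) {
  // Interpret 'raw' as a fixed-point fraction with 'bits' fractional bits,
  // so 0 maps to 0.0 and 2^bits would map to 1.0.
  return static_cast<Prob>(raw) / static_cast<Prob>(1ul << bits);
}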
/* GetProb: return nSize-gram probability for ngram in wlab */
static double GetProb(LabId *wlab, int nSize)
{
   /* This routine will return the interpolated nSize-gram probability
      for the words in wlab. Note that the context may be shortened in
      the case of multiple LMs and words which do not occur in some of
      them. */
   int i,j;
   LMInfo *li;
   Boolean inThisLM,inAnyLM;
   double x,prob,psum;
   NameId nGram[LM_NSIZE];

   if (nLModel==1) {
      inThisLM = TRUE;
      for (j=0; j<nSize; j++) {
         if ((nGram[j] = l2nId[0][(int) (wlab[j]->aux)])==NULL)
            inThisLM = FALSE;
      }
      if (inThisLM) {
         prob = GetNGramProb(lmInfo[0].lm, nGram, nSize);
      } else if (nSize > 1) {
         prob = GetProb(wlab+1, nSize-1);   /* back off to shorter context */
      } else {
         prob = LZERO;
         HError(-16690,"GetProb: assigning zero probability");
      }
   } else {
      psum = 0.0;
      inAnyLM = FALSE;
      for (li=lmInfo, i=0; i<nLModel; i++, li++) {
         for (inThisLM=TRUE, j=0; j<nSize; j++) {
            if ((nGram[j] = l2nId[i][(int) (wlab[j]->aux)])==NULL)
               inThisLM = FALSE;
         }
         if (!inThisLM)
            continue;
         x = GetNGramProb(li->lm, nGram, nSize);
#ifdef INTERPOLATE_MAX
         if ((x = exp(x)) > psum)
            psum = x;
#else
         psum += li->weight*exp(x);
#endif
         inAnyLM = TRUE;
      }
      if (inAnyLM) {
         prob = log(psum);
      } else if (nSize > 1) {
         prob = GetProb(wlab+1, nSize-1);
      } else {
         prob = LZERO;
         HError(-16690,"GetProb: assigning zero probability");
      }
   }
   return prob;
}
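In the multi-LM branch above, the returned value is log(sum_i w_i * exp(logp_i)), i.e. a weighted linear interpolation of the component models in the probability domain (or log(max_i exp(logp_i)) when INTERPOLATE_MAX is defined). A standalone sketch of that computation, detached from the HTK lookup machinery (the function name and the w/logp inputs are illustrative, not HTK API):

#include <math.h>

/* Illustrative only: linear interpolation of nLM component language models,
   given per-model weights w[] and natural-log probabilities logp[].
   Mirrors the accumulation loop above. */
static double InterpolateLogProbs(const double *w, const double *logp, int nLM)
{
   double psum = 0.0;
   int i;
   for (i = 0; i < nLM; i++)
      psum += w[i] * exp(logp[i]);
   return log(psum);
}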
ostream& Print(ostream& o) const {
  return o << GetSeqId() << "\t" << GetPos() << "\t"
           << GetProb(0) << "\t" << GetProb(1) << "\t"
           << GetProb(2) << "\t" << GetProb(3);
}
Model* StringModel::ReadModel(ByteReader* byte_reader, size_t index) {
  StringModel* model = new StringModel(index);
  model->char_count_.clear();
  model->length_count_.clear();
  model->char_prob_.resize(255);
  model->length_prob_.resize(63);
  // Character probabilities are stored as 16-bit cells, length
  // probabilities as single bytes.
  for (int i = 0; i < 255; ++i)
    model->char_prob_[i] = GetProb(byte_reader->Read16Bit(), 16);
  for (int i = 0; i < 63; ++i)
    model->length_prob_[i] = GetProb(byte_reader->ReadByte(), 8);
  return model;
}
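For reference, the parameter block this reader consumes has a fixed size: 255 char-probability cells at 2 bytes each plus 63 length-probability cells at 1 byte each. A quick sketch of that arithmetic (the constant name is illustrative, not from the codebase):

// Hypothetical size check for the layout read above.
constexpr size_t kStringModelParamBytes = 255 * 2 + 63 * 1;
static_assert(kStringModelParamBytes == 573, "unexpected parameter size");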
/* CalcPerplexity: compute perplexity and other statistics */
static void CalcPerplexity(PStats *sent, LabId *pLab, int numPLabs, int nSize)
{
   int i,j;
   LabId *p;
   float prob;
   Boolean hasOOV;

   for (p=pLab, i=nSize-1; i<numPLabs; i++, p++) {
      if (pLab[i]==unkId)
         continue;               /* cannot predict OOVs */
      if (skipOOV) {
         hasOOV = FALSE;
         for (j=1; j<nSize; j++) {
            if (pLab[i-j]==unkId) {
               hasOOV = TRUE;
               break;
            }
         }
         if (hasOOV)
            continue;            /* skip to next label since context contains OOV */
      }
      prob = GetProb(p, nSize);
      sent->nWrd++;
      sent->logpp += prob;
      sent->logpp2 += prob*prob;
      if (outStreamFN != NULL)
         fprintf(outStream, "%e\n", exp(prob));
      if (trace&T_PROB) {
         printf("logP(%s |", pLab[i]->name);
         for (j=1; j<nSize; j++)
            printf(" %s%s", (j==1)?"":",", pLab[i-j]->name);
         printf(") = %.4f\n", prob);
         /* if (trace&T_INST_INFO) PrintInstStats(nSize); */
         fflush(stdout);
      }
   }
   if (trace&T_SENT)
      PrintInfo(sent, FALSE);
}
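The accumulators filled in above (nWrd, logpp, logpp2) are enough to derive the headline statistics: perplexity is exp(-logpp/nWrd), and logpp2 allows the variance of the per-word log probabilities. A sketch of that final step, assuming logpp holds natural-log probabilities (function names are illustrative, not HTK's reporting code):

#include <math.h>

/* Illustrative only: derive perplexity and the standard deviation of the
   per-word log probabilities from the CalcPerplexity accumulators. */
static double Perplexity(double logpp, int nWrd)
{
   return exp(-logpp / (double) nWrd);
}

static double LogProbStdDev(double logpp, double logpp2, int nWrd)
{
   double mean = logpp / (double) nWrd;
   return sqrt(logpp2 / (double) nWrd - mean * mean);
}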