/** Prints one decomposition to stdout as space-separated "<name><count>"
    tokens, emitting only symbols whose count is positive. */
void printDecomposition(const Alphabet& alphabet, const decomposition_t& decomposition) {
  assert(alphabet.size() == decomposition.size());
  const char* sep = "";  // empty before the first token, a single space afterwards
  for (Alphabet::size_type i = 0; i < alphabet.size(); ++i) {
    if (decomposition[i] <= 0) continue;
    cout << sep << alphabet.getName(i) << decomposition[i];
    sep = " ";
  }
}
/**
 * Builds a segmentation engine from a configuration file.
 *
 * Allocates the config parser, alphabets, parameter table, model, feature
 * extractor and decoder, wires them into an OTWS_Engine, and returns the
 * engine as an opaque handle.
 *
 * @param cfg_file path of the configuration file to load
 * @return opaque engine handle, or NULL if the config file cannot be loaded
 */
otws_t OTWS_Load(const char *cfg_file) {
  // Allocate the config parser and install defaults that load_cfg may override.
  ltp_configure *cfg = new ltp_configure();
  cfg->set_cfg("target", "test");
  cfg->set_cfg("agenda", "1");
  cfg->set_cfg("model", "./");
  cfg->set_cfg("dict", "./");

  // Load config; on failure release the parser and signal the caller.
  if (-1 == cfg->load_cfg(cfg_file)) {
    WARNING_LOG("Failed to load config file");
    delete cfg;
    return NULL;
  }

  OTWS_Engine *engine = new OTWS_Engine();

  // Allocate alphabets and parameters; once registered they are reachable
  // through the model (ownership presumably transfers there — TODO confirm).
  Alphabet *features = new HashDict();
  Alphabet *labels = new HashDict();
  Alphabet *words = new HashDict();
  Parameter *param = new CParameter(0, 0);

  // Load the model.
  Model *model = new Model();
  model->registAlphabet("FEATURES", features);
  model->registAlphabet("LABELS", labels);
  model->registParameter("PARAMETER", param);
  model->loadModel(cfg->config("model").c_str());
  TRACE_LOG("Loading model is done.");
  TRACE_LOG("Num Features: %d", features->size());
  TRACE_LOG("Num Labels: %d", labels->size());

  Extractor *extractor = new SegmentExtractor(
      cfg->config("dict").c_str(), features, labels, words);
  // NOTE(review): the "agenda" config value is loaded but never used; the
  // decoder agenda is hard-coded to 1 — confirm whether this is intentional.
  Decoder *decoder = new SegmentDecoder(model, 1);

  engine->model = model;
  engine->extractor = extractor;
  engine->decoder = decoder;

  // BUG FIX: cfg was leaked on the success path. It is only needed while
  // loading, so release it before returning the engine.
  delete cfg;

  return reinterpret_cast<otws_t>(engine);
}
// ---------------------------------------------- // PLAIN DECODER // ---------------------------------------------- SegmentDecoder::SegmentDecoder( Model *model, int agenda) { this->m_Model = model; this->m_Agenda = agenda; Alphabet *labelAlpha = model->getAlphabet("LABELS"); this->m_NumLabels = labelAlpha->size(); m_Legal = new int *[m_NumLabels + 1]; for (int i = 0; i <= m_NumLabels; ++ i) { char prev = 'X'; if (i < m_NumLabels) prev = labelAlpha->rlookup(i)[0]; m_Legal[i] = new int[m_NumLabels]; for (int j = 0; j < m_NumLabels; ++ j) { char curr = labelAlpha->rlookup(j)[0]; m_Legal[i][j] = 0; if ((prev == 'X' || prev == 'S' || prev == 'E') && (curr == 'S' || curr == 'B')) m_Legal[i][j] = 1; if ((prev == 'M' || prev == 'B') && (curr == 'M' || curr == 'E')) m_Legal[i][j] = 1; } } }
/// Constructs a Felsenstein profile evaluator bound to a fixed alignment,
/// alphabet and alphabet map; caches the alphabet size in numAlpha.
/// All referenced objects must outlive this instance.
ProfileFelsenstein::ProfileFelsenstein(const MultSeqAlignment &A,
                                       const Alphabet &alphabet,
                                       const AlphabetMap &alphabetMap)
    : alignment(A),
      alphabet(alphabet),
      numAlpha(alphabet.size()),
      alphabetMap(alphabetMap) {
  // Nothing further to initialize.
}
unsigned convertNmerCode(unsigned rawCode,int seqLength,const Alphabet &alphabet) { Sequence S; S.resize(seqLength); unsigned base=alphabet.size(); for(int i=seqLength-1 ; i>=0 ; --i) { unsigned digit=rawCode%base; Symbol s=(int)digit; S[i]=s; rawCode/=base; } return S.asInt(alphabet,0,S.getLength()); }
void Segmentor::readEmbeddings(Alphabet &alpha, const string& inFile, NRMat<dtype>& emb) { static ifstream inf; if (inf.is_open()) { inf.close(); inf.clear(); } inf.open(inFile.c_str()); static string strLine, curWord; static int wordId; //find the first line, decide the wordDim; while (1) { if (!my_getline(inf, strLine)) { break; } if (!strLine.empty()) break; } int unknownId = alpha.from_string(m_classifier.fe.unknownkey); static vector<string> vecInfo; split_bychar(strLine, vecInfo, ' '); int wordDim = vecInfo.size() - 1; std::cout << "embedding dim is " << wordDim << std::endl; emb.resize(alpha.size(), wordDim); emb = 0.0; curWord = normalize_to_lowerwithdigit(vecInfo[0]); wordId = alpha.from_string(curWord); hash_set<int> indexers; dtype sum[wordDim]; int count = 0; bool bHasUnknown = false; if (wordId >= 0) { count++; if (unknownId == wordId) bHasUnknown = true; indexers.insert(wordId); for (int idx = 0; idx < wordDim; idx++) { dtype curValue = atof(vecInfo[idx + 1].c_str()); sum[idx] = curValue; emb[wordId][idx] = curValue; } } else { for (int idx = 0; idx < wordDim; idx++) { sum[idx] = 0.0; } } while (1) { if (!my_getline(inf, strLine)) { break; } if (strLine.empty()) continue; split_bychar(strLine, vecInfo, ' '); if (vecInfo.size() != wordDim + 1) { std::cout << "error embedding file" << std::endl; } curWord = normalize_to_lowerwithdigit(vecInfo[0]); wordId = alpha.from_string(curWord); if (wordId >= 0) { count++; if (unknownId == wordId) bHasUnknown = true; indexers.insert(wordId); for (int idx = 0; idx < wordDim; idx++) { dtype curValue = atof(vecInfo[idx + 1].c_str()); sum[idx] += curValue; emb[wordId][idx] += curValue; } } } if (!bHasUnknown) { for (int idx = 0; idx < wordDim; idx++) { emb[unknownId][idx] = sum[idx] / count; } count++; std::cout << unknownkey << " not found, using averaged value to initialize." 
<< std::endl; } int oovWords = 0; int totalWords = 0; for (int id = 0; id < alpha.size(); id++) { if (indexers.find(id) == indexers.end()) { oovWords++; for (int idx = 0; idx < wordDim; idx++) { emb[id][idx] = emb[unknownId][idx]; } } totalWords++; } std::cout << "OOV num is " << oovWords << ", total num is " << alpha.size() << ", embedding oov ratio is " << oovWords * 1.0 / alpha.size() << std::endl; }