Esempio n. 1
0
/** Prints one decomposition to stdout */
void printDecomposition(const Alphabet& alphabet, const decomposition_t& decomposition)
{
	assert(alphabet.size() == decomposition.size());
	bool first = true;
	for (Alphabet::size_type i = 0; i < alphabet.size(); ++i) {
		if (decomposition[i] > 0) {
			if (!first) {
				cout << " ";
			} else {
				first = false;
			}
			cout << alphabet.getName(i) << decomposition[i];
		}
	}
}
Esempio n. 2
0
/**
 * Creates a segmentation engine from a configuration file.
 *
 * Defaults are installed for the "target", "agenda", "model" and "dict"
 * entries before the file is loaded, so the file only has to override what
 * differs.
 *
 * @param cfg_file  path passed to ltp_configure::load_cfg.
 * @return an opaque handle wrapping a heap-allocated OTWS_Engine, or NULL
 *         when the configuration file cannot be loaded.
 */
otws_t OTWS_Load(const char *cfg_file) {

    // allocate config parser and set default config.
    ltp_configure *cfg = new ltp_configure();

    cfg->set_cfg("target", "test");
    cfg->set_cfg("agenda", "1");
    cfg->set_cfg("model", "./");
    cfg->set_cfg("dict", "./");

    // load config.
    if (-1 == cfg->load_cfg(cfg_file)) {
        WARNING_LOG("Failed to load config file");
        delete cfg;
        return NULL;
    }

    OTWS_Engine *engine = new OTWS_Engine();

    // allocate alphabet
    Alphabet *features = new HashDict();
    Alphabet *labels   = new HashDict();
    Alphabet *words    = new HashDict();
    Parameter *param   = new CParameter(0, 0);

    // load model.
    Model *model       = new Model();
    model->registAlphabet("FEATURES", features);
    model->registAlphabet("LABELS", labels);
    model->registParameter("PARAMETER", param);
    model->loadModel(cfg->config("model").c_str());

    TRACE_LOG("Loading model is done.");
    TRACE_LOG("Num Features: %d", features->size());
    TRACE_LOG("Num Labels: %d", labels->size());

    Extractor *extractor = new SegmentExtractor(
            cfg->config("dict").c_str(),
            features, labels, words);

    // NOTE(review): the decoder is built with a fixed agenda of 1; the
    // "agenda" config entry is not consulted here.
    Decoder *decoder = new SegmentDecoder(model, 1);

    engine->model = model;
    engine->extractor = extractor;
    engine->decoder = decoder;

    // The engine keeps only the model, extractor and decoder, so the config
    // parser is no longer needed. Previously it was freed only on the
    // failure path and leaked on every successful load.
    delete cfg;

    return reinterpret_cast<otws_t>(engine);
}
Esempio n. 3
0
// ----------------------------------------------
// PLAIN DECODER
// ----------------------------------------------
/**
 * Builds a decoder for the given model and precomputes the m_Legal
 * transition table from the model's "LABELS" alphabet.
 *
 * m_Legal has m_NumLabels + 1 rows of m_NumLabels entries. Row i describes
 * transitions from label i; the extra last row uses the placeholder previous
 * tag 'X'. Only the first character of each label name is examined:
 *   previous X, S or E  followed by  S or B  -> 1
 *   previous B or M     followed by  M or E  -> 1
 * Every other pair is 0.
 */
SegmentDecoder::SegmentDecoder(
        Model *model,
        int agenda) {
    m_Model  = model;
    m_Agenda = agenda;

    Alphabet *tagSet = model->getAlphabet("LABELS");
    m_NumLabels = tagSet->size();

    m_Legal = new int *[m_NumLabels + 1];
    for (int from = 0; from <= m_NumLabels; ++from) {
        // The row past the last real label has no label name and keeps 'X'.
        const char prevTag = (from < m_NumLabels) ? tagSet->rlookup(from)[0] : 'X';

        int *row = new int[m_NumLabels];
        m_Legal[from] = row;

        for (int to = 0; to < m_NumLabels; ++to) {
            const char currTag = tagSet->rlookup(to)[0];

            const bool prevIsXSE = (prevTag == 'X' || prevTag == 'S' || prevTag == 'E');
            const bool prevIsBM  = (prevTag == 'M' || prevTag == 'B');
            const bool currIsSB  = (currTag == 'S' || currTag == 'B');
            const bool currIsME  = (currTag == 'M' || currTag == 'E');

            row[to] = ((prevIsXSE && currIsSB) || (prevIsBM && currIsME)) ? 1 : 0;
        }
    }
}
Esempio n. 4
0
/**
 * Initializes the alignment, alphabet and alphabetMap members from the
 * arguments and sets numAlpha to the size of the supplied alphabet.
 * The constructor body does no further work.
 */
ProfileFelsenstein::ProfileFelsenstein(const MultSeqAlignment &A,
				       const Alphabet &alphabet,
				       const AlphabetMap &alphabetMap)
  : alignment(A),
    alphabet(alphabet),
    numAlpha(alphabet.size()),  // `alphabet` here names the parameter
    alphabetMap(alphabetMap)
{
}
Esempio n. 5
0
unsigned convertNmerCode(unsigned rawCode,int seqLength,const Alphabet &alphabet)
{
  Sequence S;
  S.resize(seqLength);
  unsigned base=alphabet.size();
  for(int i=seqLength-1 ; i>=0 ; --i) {
    unsigned digit=rawCode%base;
    Symbol s=(int)digit;
    S[i]=s;
    rawCode/=base;
  }	
  return S.asInt(alphabet,0,S.getLength());
}
void Segmentor::readEmbeddings(Alphabet &alpha, const string& inFile, NRMat<dtype>& emb) {
  static ifstream inf;
  if (inf.is_open()) {
    inf.close();
    inf.clear();
  }
  inf.open(inFile.c_str());

  static string strLine, curWord;
  static int wordId;

  //find the first line, decide the wordDim;
  while (1) {
    if (!my_getline(inf, strLine)) {
      break;
    }
    if (!strLine.empty())
      break;
  }

  int unknownId = alpha.from_string(m_classifier.fe.unknownkey);

  static vector<string> vecInfo;
  split_bychar(strLine, vecInfo, ' ');
  int wordDim = vecInfo.size() - 1;

  std::cout << "embedding dim is " << wordDim << std::endl;

  emb.resize(alpha.size(), wordDim);
  emb = 0.0;
  curWord = normalize_to_lowerwithdigit(vecInfo[0]);
  wordId = alpha.from_string(curWord);
  hash_set<int> indexers;
  dtype sum[wordDim];
  int count = 0;
  bool bHasUnknown = false;
  if (wordId >= 0) {
    count++;
    if (unknownId == wordId)
      bHasUnknown = true;
    indexers.insert(wordId);
    for (int idx = 0; idx < wordDim; idx++) {
      dtype curValue = atof(vecInfo[idx + 1].c_str());
      sum[idx] = curValue;
      emb[wordId][idx] = curValue;
    }

  } else {
    for (int idx = 0; idx < wordDim; idx++) {
      sum[idx] = 0.0;
    }
  }

  while (1) {
    if (!my_getline(inf, strLine)) {
      break;
    }
    if (strLine.empty())
      continue;
    split_bychar(strLine, vecInfo, ' ');
    if (vecInfo.size() != wordDim + 1) {
      std::cout << "error embedding file" << std::endl;
    }
    curWord = normalize_to_lowerwithdigit(vecInfo[0]);
    wordId = alpha.from_string(curWord);
    if (wordId >= 0) {
      count++;
      if (unknownId == wordId)
        bHasUnknown = true;
      indexers.insert(wordId);

      for (int idx = 0; idx < wordDim; idx++) {
        dtype curValue = atof(vecInfo[idx + 1].c_str());
        sum[idx] += curValue;
        emb[wordId][idx] += curValue;
      }
    }

  }

  if (!bHasUnknown) {
    for (int idx = 0; idx < wordDim; idx++) {
      emb[unknownId][idx] = sum[idx] / count;
    }
    count++;
    std::cout << unknownkey << " not found, using averaged value to initialize." << std::endl;
  }

  int oovWords = 0;
  int totalWords = 0;
  for (int id = 0; id < alpha.size(); id++) {
    if (indexers.find(id) == indexers.end()) {
      oovWords++;
      for (int idx = 0; idx < wordDim; idx++) {
        emb[id][idx] = emb[unknownId][idx];
      }
    }
    totalWords++;
  }

  std::cout << "OOV num is " << oovWords << ", total num is " << alpha.size() << ", embedding oov ratio is " << oovWords * 1.0 / alpha.size()
      << std::endl;

}