Exemple #1
0
Dictionary<vector<FeatVal> > * KyteaModel::makeDictionaryFromPrefixes(const vector<KyteaString> & prefs, StringUtil* util, bool adjustPos) {
    typedef Dictionary<vector<FeatVal> >::WordMap WordMap;
    WordMap wm;
    int pos;
    for(int i = 0; i < (int)names_.size(); i++) {
        const KyteaString & str = names_[i];
        for(pos = 0; pos < (int)prefs.size() && !str.beginsWith(prefs[pos]); pos++);
        if(pos != (int)prefs.size()) {
            featuresAdded_++;
            KyteaString name = str.substr(prefs[pos].length());
            WordMap::iterator it = wm.find(name);
            if(it == wm.end()) {
                pair<WordMap::iterator, bool> p = wm.insert(WordMap::value_type(name,new vector<FeatVal>(prefs.size()*numW_)));
                it = p.first;
            }
            // If this is an n-gram dictionary, adjust the position according to
            // n-gram length, otherwise just use the location of th eprefix
            int id = (adjustPos ?
                (prefs.size()-pos-name.length())*numW_ :
                pos*numW_
            );
            for(int j = 0; j < numW_; j++) {
                // cerr << "adding for "<<util->showString(str)<<" @ "<<util->showString(name) << " ["<<id<<"]"<<"/"<<(*it->second).size()<<" == "<<getWeight(i,j)<<"/"<<weights_.size()<< " == " <<getWeight(i-1,j) * labels_[0]<<endl;
                (*it->second)[id+j] = getWeight(i-1,j) * labels_[0];
            }
        }
    }
    if(wm.size() > 0) {
        Dictionary<vector<FeatVal> > * ret = new Dictionary<vector<FeatVal> >(util);
        ret->buildIndex(wm);
        return ret;
    }
    return NULL;
}
Exemple #2
0
void Dictionary<Entry>::buildIndex(const WordMap & input) {
    if(input.size() == 0)
        THROW_ERROR("Cannot build dictionary for no input");
    clearData();
    states_.push_back(new DictionaryState());
    buildGoto(input.begin(), input.end(), 0, 0);
    buildFailures();
}
Exemple #3
0
int
main(int argc, char *argv[])
{
  ECArgs args( argc, argv );
  assert(args.nargs() == 1);
  ECString path(args.arg(0));
  cerr << "At start of pHsgt" << endl;

  for(int n = 0 ; n < MAXNUMNTS ; n++)
    numTerm[n] = 0;

  Term::init( path );
  readHeadInfo(path);

  int sentenceCount = 0;

  ECString s1lex("^^");
  ECString s1nm("S1");
  int s1Int = Term::get(s1nm)->toInt();
	
  UnitRules ur;
  ur.init();
  while(cin)
    {
      //if(sentenceCount > 4000) break;
      if(sentenceCount%10000 == 0) cerr << sentenceCount << endl;
      InputTree  parse;
      cin >> parse;
      //cerr << parse << endl;
      if(!cin) break;
      if(parse.length() == 0) break;
       EcSPairs wtList;
       parse.make(wtList); 
       InputTree* par;
       par = &parse;

      addWwData(par);
      incrWordData(s1Int, s1lex);
      ur.gatherData(par);
      sentenceCount++;
    }
  ECString resultsString(path);
  resultsString += "pSgT.txt";
  ofstream     resultsStream(resultsString.c_str());
  assert(resultsStream);

  int numWords = 0;
  resultsStream << "       \n";  //leave space for number of words;
  resultsStream.precision(3);
  ECString lastWord;
  int wordFreq = 0;
  WordMap::iterator wmi = wordMap.begin();
  resultsStream << wordMap.size() << "\n\n";
  for( ; wmi != wordMap.end() ; wmi++)
    {
      ECString w = (*wmi).first;
      resultsStream << w << "\t";
      PosD& posd = (*wmi).second;
      PosD::iterator pdi = posd.begin();
      int count = 0;
      for( ; pdi != posd.end(); pdi++)
	{
	  int posInt = (*pdi).first;
	  int c = (*pdi).second;
	  count += c;
	  float p = (float)c/(float)numTerm[posInt];
	  resultsStream << posInt << " " << p << " ";
	}
      resultsStream << "| " << count << "\n";
    }
  ur.setData(path);
  return 1;
}