示例#1
0
//////////////////////////////////////////////////////////////////////////
// String-To-Words
void TStrParser::DocStrToWIdV(const TStr& _DocStr, TIntV& WordIdV, const bool& Stemm) {
    TStr DocStr = _DocStr.GetUc();  // to upper case
    TStrV WordV; DocStr.SplitOnWs(WordV); int WordN = WordV.Len();
    WordIdV.Reserve(WordN, 0);

    PStemmer Stemmer = TStemmer::New(stmtPorter);
    TIntH WordsInDoc;
    for (int WordC = 0; WordC < WordN; WordC++) {
        TStr WordStr;
        if (Stemm) {
            WordStr = Stemmer->GetStem(WordV[WordC]);
        } else {
            WordStr = WordV[WordC];
        }
        int WId = GetWId(WordStr);
        if (WId == -1) {
            WId = WordToIdH.AddKey(WordStr);
            WordToIdH[WId] = 0;
        }
        WordIdV.Add(WId);
        
        // is it first time we see this word in this doc?
        if (!WordsInDoc.IsKey(WId)) WordsInDoc.AddKey(WId);
    }

    //do some statistics for DF
    DocsParsed++;
    for (int i = 0, l = WordsInDoc.Len(); i < l; i++)
        WordToIdH[WordsInDoc.GetKey(i)]++;

    Assert(WordV.Len() == WordIdV.Len());
}
示例#2
0
// Returns a new TUStr holding the code points of this string from position
// BChN to position EChN, as extracted by TIntV::GetSubValV(BChN, EChN, ...).
//
// BChN: index of the first code point to copy.
// EChN: index of the last code point to copy.
// NOTE(review): handling of out-of-range or reversed indices is delegated to
// TIntV::GetSubValV - confirm its contract before relying on it.
TUStr TUStr::GetSubValV(const int& BChN, const int& EChN){
	TIntV SubUniChV;
	// Number of code points in [BChN, EChN]. The original computed
	// BChN - EChN + 1, which is zero or negative whenever EChN > BChN,
	// so a non-positive capacity was requested from Reserve.
	const int SubLen = EChN - BChN + 1;
	// Pre-size only for a sane request: positive and no larger than the source.
	if(SubLen > 0 && SubLen <= this->UniChV.Len()){
		SubUniChV.Reserve(SubLen, 0);
	}
	this->UniChV.GetSubValV(BChN, EChN, SubUniChV);
	return TUStr(SubUniChV);
}
示例#3
0
// Appends to WordBoundPosV every index at which the word-boundary finder
// flags a boundary for this string's code points (UniChV).
//
// WordBoundPosV: output vector; capacity for the number of flags is requested
// up front and boundary indices are added in increasing order. It is not
// explicitly cleared here.
void TUStr::GetWordBoundPV(TIntV& WordBoundPosV){
  TBoolV WordBoundPV;
  TUnicodeDef::GetDef()->FindWordBoundaries(UniChV, WordBoundPV);
  const int Flags = WordBoundPV.Len();
  WordBoundPosV.Reserve(Flags);
  // Valid indices are 0..Flags-1; the original looped with `<=` and read
  // one element past the end of WordBoundPV.
  for(int PosN = 0; PosN < Flags; PosN++){
    if(WordBoundPV[PosN]){
      WordBoundPosV.Add(PosN);
    }
  }
}
示例#4
0
void TStrParser::DocStrToChIdV(const TStr& _DocStr, TIntV& ChIdV) {
    TStr DocStr = _DocStr.GetUc();  // to upper case
    int ChN = DocStr.Len();
    ChIdV.Reserve(ChN, 0);
    for (int ChC = 0; ChC < ChN; ChC++) {
        TStr ChStr = DocStr.GetSubStr(ChC,ChC);
        int ChId = GetWId(ChStr);
        if (ChId != -1) {
            WordToIdH[ChId]++;
        } else {
            ChId = WordToIdH.AddKey(ChStr);
            WordToIdH[ChId] = 1;
        }
        ChIdV.Add(ChId);
    }
}