Ejemplo n.º 1
0
//////////////////////////////////////////////////////////////////////////
// String-To-Words
void TStrParser::DocStrToWIdV(const TStr& _DocStr, TIntV& WordIdV, const bool& Stemm) {
    TStr DocStr = _DocStr.GetUc();  // to upper case
    TStrV WordV; DocStr.SplitOnWs(WordV); int WordN = WordV.Len();
    WordIdV.Reserve(WordN, 0);

    PStemmer Stemmer = TStemmer::New(stmtPorter);
    TIntH WordsInDoc;
    for (int WordC = 0; WordC < WordN; WordC++) {
        TStr WordStr;
        if (Stemm) {
            WordStr = Stemmer->GetStem(WordV[WordC]);
        } else {
            WordStr = WordV[WordC];
        }
        int WId = GetWId(WordStr);
        if (WId == -1) {
            WId = WordToIdH.AddKey(WordStr);
            WordToIdH[WId] = 0;
        }
        WordIdV.Add(WId);
        
        // is it first time we see this word in this doc?
        if (!WordsInDoc.IsKey(WId)) WordsInDoc.AddKey(WId);
    }

    //do some statistics for DF
    DocsParsed++;
    for (int i = 0, l = WordsInDoc.Len(); i < l; i++)
        WordToIdH[WordsInDoc.GetKey(i)]++;

    Assert(WordV.Len() == WordIdV.Len());
}
Ejemplo n.º 2
0
void LSH::GetHashedShinglesOfCluster(TQuoteBase *QuoteBase, TCluster& C,
    TInt ShingleLen, THashSet<TMd5Sig>& HashedShingles) {
  TIntV QuoteIds;
  C.GetQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);
    TStr QContentStr;
    Q.GetContentString(QContentStr);
    TStr QContentStrNoPunc;
    TStringUtil::RemovePunctuation(QContentStr, QContentStrNoPunc);
    TStrV QContentV;
    QContentStrNoPunc.SplitOnWs(QContentV);
    for (int i = 0; i < QContentV.Len() - ShingleLen + 1; i++) {
      TStr Shingle;
      for (int j = 0; j < ShingleLen; j++) {
        if (j > 0) {
          Shingle.InsStr(Shingle.Len(), " ");
        }
        Shingle.InsStr(Shingle.Len(), QContentV[i + j]);
      }
      TMd5Sig ShingleMd5(Shingle);
      HashedShingles.AddKey(ShingleMd5);
    }
  }
}
Ejemplo n.º 3
0
Archivo: util.cpp Proyecto: pikma/Snap
// <last_name>_<first name innitial>
TStr TStrUtil::GetStdName(TStr AuthorName) {
    TStr StdName;
    AuthorName.ToLc();
    AuthorName.ChangeChAll('\n', ' ');
    AuthorName.ChangeChAll('.', ' ');
    // if there is a number in the name, remove it and everything after it
    int i, pos = 0;
    while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
        pos++;
    }
    if (pos < AuthorName.Len()) {
        AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc();
    }
    if (AuthorName.Empty()) {
        return TStr::GetNullStr();
    }

    // replace everything after '('
    int b = AuthorName.SearchCh('(');
    if (b != -1) {
        AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc();
    }
    // skip if contains ')'
    if (AuthorName .SearchCh(')')!=-1) {
        return TStr::GetNullStr();
    }
    // skip if it is not a name
    if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
            || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
        return TStr::GetNullStr();
    }
    // remove all non-letters (latex tags, ...)
    TChA NewName;
    for (i = 0; i < AuthorName.Len(); i++) {
        const char Ch = AuthorName[i];
        if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') {
            NewName += Ch;
        }
    }
    StdName = NewName;
    StdName.ToTrunc();
    TStrV AuthNmV;
    StdName.SplitOnWs(AuthNmV);
    // too short -- not a name
    if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
    if (AuthNmV.Len() < 2) return TStr::GetNullStr();

    const TStr LastNm = AuthNmV.Last();
    if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();

    IAssert(isalpha(AuthNmV[0][0]));
    return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
}