////////////////////////////////////////////////////////////////////////// // String-To-Words void TStrParser::DocStrToWIdV(const TStr& _DocStr, TIntV& WordIdV, const bool& Stemm) { TStr DocStr = _DocStr.GetUc(); // to upper case TStrV WordV; DocStr.SplitOnWs(WordV); int WordN = WordV.Len(); WordIdV.Reserve(WordN, 0); PStemmer Stemmer = TStemmer::New(stmtPorter); TIntH WordsInDoc; for (int WordC = 0; WordC < WordN; WordC++) { TStr WordStr; if (Stemm) { WordStr = Stemmer->GetStem(WordV[WordC]); } else { WordStr = WordV[WordC]; } int WId = GetWId(WordStr); if (WId == -1) { WId = WordToIdH.AddKey(WordStr); WordToIdH[WId] = 0; } WordIdV.Add(WId); // is it first time we see this word in this doc? if (!WordsInDoc.IsKey(WId)) WordsInDoc.AddKey(WId); } //do some statistics for DF DocsParsed++; for (int i = 0, l = WordsInDoc.Len(); i < l; i++) WordToIdH[WordsInDoc.GetKey(i)]++; Assert(WordV.Len() == WordIdV.Len()); }
/// Collects the MD5 signatures of all word shingles of the quotes in a cluster.
///
/// For every quote id reported by C, the quote's content string is stripped of
/// punctuation and split on whitespace. Each run of ShingleLen consecutive
/// words, joined by single spaces, is hashed and added to HashedShingles.
/// A quote with fewer than ShingleLen words contributes no shingles.
///
/// @param QuoteBase       Source of the quotes; looked up by quote id.
/// @param C               Cluster whose quote ids are processed.
/// @param ShingleLen      Number of words per shingle.
/// @param HashedShingles  Output set; signatures are added, existing keys kept.
void LSH::GetHashedShinglesOfCluster(TQuoteBase *QuoteBase, TCluster& C, TInt ShingleLen,
                                     THashSet<TMd5Sig>& HashedShingles) {
  TIntV QuoteIds;
  C.GetQuoteIds(QuoteIds);

  const int NumQuotes = QuoteIds.Len();
  for (int QuoteN = 0; QuoteN < NumQuotes; ++QuoteN) {
    TQuote Quote;
    QuoteBase->GetQuote(QuoteIds[QuoteN], Quote);

    // Raw content -> punctuation-free content -> list of words.
    TStr ContentStr;
    Quote.GetContentString(ContentStr);
    TStr CleanStr;
    TStringUtil::RemovePunctuation(ContentStr, CleanStr);
    TStrV WordV;
    CleanStr.SplitOnWs(WordV);

    // Number of shingle start positions; not positive when the quote is too short.
    const int NumShingles = WordV.Len() - ShingleLen + 1;
    for (int StartN = 0; StartN < NumShingles; ++StartN) {
      TStr Shingle;
      for (int Offset = 0; Offset < ShingleLen; ++Offset) {
        if (Offset != 0) {
          Shingle.InsStr(Shingle.Len(), " ");  // single-space word separator
        }
        Shingle.InsStr(Shingle.Len(), WordV[StartN + Offset]);
      }
      HashedShingles.AddKey(TMd5Sig(Shingle));
    }
  }
}
// <last_name>_<first name innitial> TStr TStrUtil::GetStdName(TStr AuthorName) { TStr StdName; AuthorName.ToLc(); AuthorName.ChangeChAll('\n', ' '); AuthorName.ChangeChAll('.', ' '); // if there is a number in the name, remove it and everything after it int i, pos = 0; while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) { pos++; } if (pos < AuthorName.Len()) { AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } if (AuthorName.Empty()) { return TStr::GetNullStr(); } // replace everything after '(' int b = AuthorName.SearchCh('('); if (b != -1) { AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } // skip if contains ')' if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); } // skip if it is not a name if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { return TStr::GetNullStr(); } // remove all non-letters (latex tags, ...) TChA NewName; for (i = 0; i < AuthorName.Len(); i++) { const char Ch = AuthorName[i]; if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; } } StdName = NewName; StdName.ToTrunc(); TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); // too short -- not a name if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); if (AuthNmV.Len() < 2) return TStr::GetNullStr(); const TStr LastNm = AuthNmV.Last(); if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); IAssert(isalpha(AuthNmV[0][0])); return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); }