bool TBagOfWords::Update(const TStrV& TokenStrV) {
    // Generate Ngrams if necessary
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);
    // process tokens to update DF counts
    bool UpdateP = false;
    if (IsHashing()) {
        // consolidate tokens and get their hashed IDs
        TIntSet TokenIdH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TInt TokenId = TokenStr.GetHashTrick() % HashDim;
            TokenIdH.AddKey(TokenId);
            if (IsStoreHashWords()) { HashWordV[TokenId].AddKey(TokenStr); }
        }
        // update document counts
        int KeyId = TokenIdH.FFirstKeyId();
        while (TokenIdH.FNextKeyId(KeyId)) {
            const int TokenId = TokenIdH.GetKey(KeyId);
            // update DF
            DocFqV[TokenId]++;
        }
    } else {
        // consolidate tokens
        TStrH TokenStrH;
        for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
            const TStr& TokenStr = NgramStrV[TokenStrN];
            TokenStrH.AddKey(TokenStr);
        }
        // update document counts and update vocabulary with new tokens
        int KeyId = TokenStrH.FFirstKeyId();
        while (TokenStrH.FNextKeyId(KeyId)) {
            // get token
            const TStr& TokenStr = TokenStrH.GetKey(KeyId);
            // look up the token ID in the vocabulary
            int TokenId = TokenSet.GetKeyId(TokenStr);
            if (TokenId == -1) {
                // new token, remember the dimensionality change
                UpdateP = true;
                // remember the new token
                TokenId = TokenSet.AddKey(TokenStr);
                // increase document count table
                const int TokenDfId = DocFqV.Add(0);
                // increase also the old count table
                OldDocFqV.Add(0.0);
                // make sure the DF vector and TokenSet are still in sync
                IAssert(TokenId == TokenDfId);
                IAssert(DocFqV.Len() == OldDocFqV.Len());
            }
            // document count update
            DocFqV[TokenId]++;
        }
    }
    // update document count
    Docs++;
    // tell if dimension changed
    return UpdateP;
}
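// Usage sketch for the update above (BowFtrGen and the tokens are illustrative, assuming a
// TBagOfWords instance constructed elsewhere):
//   TStrV TokenStrV; TokenStrV.Add("new"); TokenStrV.Add("token"); TokenStrV.Add("new");
//   const bool DimChangedP = BowFtrGen.Update(TokenStrV);
//   // DimChangedP is true only in non-hashing mode, and only when an unseen token extends
//   // TokenSet; with hashing, token IDs are fixed to [0, HashDim) via GetHashTrick() % HashDim,
//   // so the dimensionality never changes.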
TStr TStrUtil::GetStr(const TStrH& StrH, const TStr& FieldDelimiterStr, const TStr& DelimiterStr) {
    if (StrH.Empty()) { return TStr(); }
    TChA ResChA;
    int KeyId = StrH.FFirstKeyId();
    while (StrH.FNextKeyId(KeyId)) {
        if (!ResChA.Empty()) { ResChA += DelimiterStr; }
        ResChA += StrH.GetKey(KeyId);
        ResChA += FieldDelimiterStr;
        ResChA += StrH[KeyId].GetStr();
    }
    return ResChA;
}
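// Usage sketch (hash contents and delimiters are illustrative; TStrH maps strings to integer counts):
//   TStrH CountH;
//   CountH.AddDat("apple") = 3;
//   CountH.AddDat("pear") = 1;
//   TStr Str = TStrUtil::GetStr(CountH, "=", ", "); // e.g. "apple=3, pear=1"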
void TFtrGenToken::Update(const TStr& Val) {
    // tokenize the value
    TStrV TokenStrV;
    GetTokenV(Val, TokenStrV);
    // consolidate tokens so each token counts at most once per document
    TStrH TokenStrH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        TokenStrH.AddKey(TokenStr);
    }
    // update document-frequency counts
    int KeyId = TokenStrH.FFirstKeyId();
    while (TokenStrH.FNextKeyId(KeyId)) {
        const TStr& TokenStr = TokenStrH.GetKey(KeyId);
        TokenH.AddDat(TokenStr)++;
    }
    // update document count
    Docs++;
}
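// Usage sketch (FtrGen is an already-constructed TFtrGenToken; the input string is illustrative):
//   FtrGen.Update("the cat and the cat");
//   // duplicate tokens are collapsed first, so "the" and "cat" each gain one document count,
//   // i.e. TokenH stores document frequencies rather than raw term frequencies.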
void TCycBs::GetRelNmV(TStrV& RelNmV){
  TStrH RelNmH;
  // traverse vertices
  for (int VId=0; VId<GetVIds(); VId++){
    TStr VNm=GetVNm(VId);
    TCycVrtx& Vrtx=GetVrtx(VId);
    // traverse edges per vertex
    for (int EdgeN=0; EdgeN<Vrtx.GetEdges(); EdgeN++){
      // extract relation name
      TCycEdge& Edge=Vrtx.GetEdge(EdgeN);
      TStr RelNm=GetVNm(Edge.GetRelId());
      // add relation name to the pool
      RelNmH.AddKey(RelNm);
    }
  }
  // extract relation-name-vector
  RelNmH.GetKeyV(RelNmV);
}
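// Sketch of the hash-as-set idiom used above (relation names are illustrative):
//   TStrH NmH;
//   NmH.AddKey("isa"); NmH.AddKey("genls"); NmH.AddKey("isa"); // duplicate key is ignored
//   TStrV NmV; NmH.GetKeyV(NmV); // NmV holds the two distinct names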
void TCpDoc::SaveAsfaToCpd(const TStr& InFPath, const TStr& OutCpdFNm){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse files
  TStrH AccessionIdH;
  TFFile FFile(TStr::GetNrFPath(InFPath)+"*.Asfa");
  TStr AsfaFNm;
  while (FFile.Next(AsfaFNm)){
    printf("Processing file '%s'\n", AsfaFNm.CStr());
    PSIn SIn=TFIn::New(AsfaFNm);
    TILx Lx(SIn, TFSet(iloRetEoln, iloExcept));
    Lx.GetSym(syLn, syEof);
    while (Lx.Sym!=syEof){
      // Query Line
      TStr QueryLnStr=Lx.Str;
      TStrV QueryStrV; QueryLnStr.SplitOnAllCh('\t', QueryStrV, false);
      IAssert(QueryStrV[0]=="Query");
      // RecordNo Line
      Lx.GetSym(syLn);
      TStr RecNoLnStr=Lx.Str;
      TStrV RecNoStrV; RecNoLnStr.SplitOnAllCh('\t', RecNoStrV, false);
      IAssert(RecNoStrV[0]=="RecordNo");
      //int RecN=RecNoStrV[1].GetInt();
      // fields (format: Short-Name Tab Long-Name Tab Value-String)
      TStr TitleStr, AbstractStr, PublicationYearStr, AccessionId;
      TStrV AuthorNmV;
      TStrV TermNmV1, TermNmV2;
      while (true){
        Lx.GetSym(syLn);
        TStr FldLnStr=Lx.Str;
        TStrV FldStrV; FldLnStr.SplitOnAllCh('\t', FldStrV, false);
        if (FldStrV[0]=="----"){
          if (!AccessionIdH.IsKey(AccessionId)){
            AccessionIdH.AddKey(AccessionId);
            // create & save cpd document
            PCpDoc CpDoc=TCpDoc::New();
            CpDoc->DocNm=AccessionId;
            CpDoc->DateStr=PublicationYearStr;
            CpDoc->TitleStr=TitleStr;
            CpDoc->ParStrV.Add(AbstractStr);
            CpDoc->TopCdNmV=TermNmV1;
            CpDoc->GeoCdNmV=TermNmV2;
            CpDoc->IndCdNmV=AuthorNmV;
            CpDoc->Save(*SOut);
          } else {/*printf("[%s]", AccessionId.CStr());*/}
          break;
        } else if (FldStrV[0]=="TI"){
          TitleStr=FldStrV[2];
        } else if (FldStrV[0]=="AU"){
          FldStrV[2].SplitOnAllCh(';', AuthorNmV);
          for (int StrN=0; StrN<AuthorNmV.Len(); StrN++){AuthorNmV[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AB"){
          AbstractStr=FldStrV[2];
        } else if (FldStrV[0]=="PY"){
          PublicationYearStr=FldStrV[2];
        } else if (FldStrV[0]=="DE"){
          FldStrV[2].SplitOnAllCh(';', TermNmV1);
          for (int StrN=0; StrN<TermNmV1.Len(); StrN++){TermNmV1[StrN].ToTrunc();}
        } else if (FldStrV[0]=="CL"){
          FldStrV[2].SplitOnAllCh(';', TermNmV2);
          for (int StrN=0; StrN<TermNmV2.Len(); StrN++){TermNmV2[StrN].ToTrunc();}
        } else if (FldStrV[0]=="AN"){
          AccessionId=FldStrV[2];
        }
      }
      printf("%d\r", AccessionIdH.Len());
      Lx.GetSym(syLn, syEof);
    }
  }
}
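// Illustrative shape of an input record this parser expects (tab-separated; the long-name column
// and field values are placeholders, only the short names and the "----" terminator are checked):
//   Query <TAB> <query text>
//   RecordNo <TAB> <record number>
//   TI <TAB> <long-name> <TAB> <title>
//   AU <TAB> <long-name> <TAB> <author>; <author>
//   AN <TAB> <long-name> <TAB> <accession id>
//   ----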