void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs, const uint64& MnTm, const int& MnDocs, const int& MxDocs, const int& Clusts, TVec<TStrFltPrV>& EntNmWgtPrVV) const { EntNmWgtPrVV.Clr(); // create bow PBowDocBs BowDocBs=TBowDocBs::New(); // collect documents TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV); DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs); if (DocIdV.Len()<MnDocs){return;} for (int DocN=0; DocN<DocIdV.Len(); DocN++){ int DocId=DocIdV[DocN]; PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId); // create vector of entity-weights TIntFltPrV WIdWgtPrV; for (int EntN=0; EntN<Doc->GetEnts(); EntN++){ int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq); TStr EntNm=SkyGridBs->GetEntNm(EntId); int EntWId=BowDocBs->AddWordStr(EntNm); WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq)); } // create bow-document int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV); TStr DocDescStr=Doc->GetTitleStr(); BowDocBs->PutDocDescStr(DId, DocDescStr); } // k-means clustering PBowSim BowSim=TBowSim::New(bstCos); // similarity object TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting PBowDocPart BowDocPart=TBowClust::GetKMeansPart( TNotify::StdNotify, // log output BowDocBs, // document data BowSim, // similarity function TRnd(1), // random generator Clusts, // number of clusters 1, // trials per k-means 1, // convergence epsilon for k-means 1, // min. documents per cluster WordWgtType, // word weighting 0, // cut-word-weights percentage 0); // minimal word frequency EntNmWgtPrVV.Clr(); for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){ PBowDocPartClust Clust=BowDocPart->GetClust(ClustN); TStrFltPrV WordStrWgtPrV; Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV); EntNmWgtPrVV.Add(WordStrWgtPrV); } //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false); }
// Compare the hybrid and the original BFS implementations on `iters` random
// start nodes, timing both with gettimeofday. Returns false as soon as
// either the maximum distance or (for full traversals) the per-node
// distance tables disagree; true when every iteration matches.
// FIX: removed the dead `rnd.PutSeed(0);` — it was a leftover from the
// commented-out target-node selection and `rnd` is not used afterwards.
bool test(PGraph &graph, bool followOut, bool followIn) {
  printf("\n================================\nFollowOut: %d, FollowIn: %d\n", followOut, followIn);
  int iters = 10;
  for (int k = 0; k < iters; k++) {
    TRnd rnd = TRnd((int)time(0)); // wall-clock seed: a new start node each run
    int start = graph->GetRndNId(rnd);
    // int target = graph->GetRndNId(rnd);
    // printf("Start node: %d, target node: %d\n", start, target);
    int target = -1; // -1 => full traversal, no early-exit target node
    printf("Start node: %d\n", start);
    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    /* Hybrid */
    TBreathFS<PGraph> bfs_hybrid(graph, true);
    int maxDist_hybrid = bfs_hybrid.DoBfsHybrid(start, followOut, followIn, target);
    gettimeofday(&tv2, NULL);
    double time_hybrid = timeInSeconds(tv1, tv2);

    /* Original */
    gettimeofday(&tv1, NULL);
    TBreathFS<PGraph> bfs(graph, true);
    int maxDist = bfs.DoBfs(start, followOut, followIn, target);
    gettimeofday(&tv2, NULL);
    double time = timeInSeconds(tv1, tv2);

    /* Check results */
    if (maxDist_hybrid != maxDist) {
      printf("MaxDist incorrect.\n");
      return false;
    }
    if (target == -1) { // only full traversals fill the whole distance table
      if (!checkResults<PGraph>(bfs_hybrid, bfs)) {
        printf("NIdDistH values incorrect!\n");
        return false;
      }
    }
    printf("Execution times: Original: %.2f, Hybrid: %.2f\n", time, time_hybrid);
  }
  return true;
}
// Replay a browsing log: parse the proxy-log file InFNm line by line,
// collect the GET-request URLs of the target user (TgUserId) that pass
// IsUrlOk, report per-user accepted-document counts, and finally visit the
// collected URLs in random order via Go().
// NOTE(review): the lexer calls below are strictly order-dependent — each
// Get* consumes the next field of the comma-separated log line.
void TWebTravelHomeNet::StartTravel(){
  PutConstrs();
  TStrV UrlStrV(300000, 0);         // collected URLs of the target user
  TIntIntH UserIdToDocsH(1000);     // user-id -> number of accepted docs
  PSIn SIn=PSIn(new TFIn(InFNm));
  TILx Lx(SIn, TFSet()|iloRetEoln); // lexer that also returns end-of-line
  TChA UrlStr;
  // expected line format: UserId,int,int,int,int,Method,Proto,Domain,Path
  Lx.GetSym(syInt, syEof);
  while ((Lx.Sym!=syEof)&&(Lx.SymLnN<200000)){ // capped at 200k lines
  // while (Lx.Sym!=syEof){
    int UserId=Lx.Int; Lx.GetSym(syComma);
    // skip four unused integer fields
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    TStr Method=Lx.GetIdStr(); Lx.GetSym(syComma); // GET, POST
    // rebuild the URL from protocol, domain and path fields
    UrlStr.Clr();
    UrlStr+=Lx.GetIdStr(); Lx.GetSym(syComma); // http, ftp
    UrlStr+="://";
    UrlStr+=Lx.GetStrToCh(','); Lx.GetSym(syComma); // domain name
    UrlStr+=Lx.GetStrToEoln(); Lx.GetEoln(); // path
    // keep only acceptable GET requests of the target user
    if ((UserId==TgUserId)&&IsUrlOk(UrlStr)&&(Method=="GET")){
      UserIdToDocsH.AddDat(UserId)++;
      UrlStrV.Add(UrlStr);
    }
    Lx.GetSym(syInt, syEof);
    // progress notification every 100k lines
    if (Lx.SymLnN%100000==0){OnNotify(TInt::GetStr(Lx.SymLnN)+ " docs");}
  }
  // report the per-user accepted-document counts
  int UserIdToDocsP=UserIdToDocsH.FFirstKeyId();
  while (UserIdToDocsH.FNextKeyId(UserIdToDocsP)){
    int UserId=UserIdToDocsH.GetKey(UserIdToDocsP);
    int Docs=UserIdToDocsH[UserIdToDocsP];
    TStr MsgStr=TStr("User ")+TInt::GetStr(UserId)+": "+
     TInt::GetStr(Docs)+" Docs.";
    OnNotify(MsgStr);
  }
  // visit the collected URLs in random order
  UrlStrV.Shuffle(TRnd());
  for (int UrlStrN=0; UrlStrN<UrlStrV.Len(); UrlStrN++){
    Go(UrlStrV[UrlStrN]);
  }
}
// Allocates the per-language Alpha output matrices (Docs x Dims, zeroed)
// and then exercises the basic linear-algebra operations used by the
// multilingual kernel-CCA computation: element access, column dot-products,
// scaled column additions, column norms, row/column reads and matrix-vector
// products on the bag-of-words matrices. (Original comments were in
// Slovenian; translated to English.)
// NOTE(review): Kapa, CGMxIter and HorstMxIter are unused in this body —
// presumably reserved for the full iterative KCCA solver; confirm.
void TMKCCASemSpace::CalcKCCA(const TVec<TBowMatrix>& BowMatrixV, const int& Dims,
 const double& Kapa, const int& CGMxIter, const int& HorstMxIter,
 TVec<TFltVV>& AlphaVV) {
  // some basic constants
  const int Langs = BowMatrixV.Len(); IAssert(Langs > 1);
  const int Docs = BowMatrixV[0].GetCols(); // every language has Docs columns
  // reserve space for the output
  AlphaVV.Gen(Langs);
  for (int LangN = 0; LangN < Langs; LangN++) {
    IAssert(BowMatrixV[LangN].GetCols() == Docs);
    AlphaVV[LangN].Gen(Docs, Dims);
    AlphaVV[LangN].PutAll(0.0);
  }
  // a pair of vectors
  TFltV x(Docs);                // vector of length Docs
  TLAMisc::FillRnd(x, TRnd(1)); // fill with random elements
  TLinAlg::Normalize(x);        // normalize the vector
  TFltV y(BowMatrixV[1].GetRows()); // some vector
  // changing an element of the matrix
  AlphaVV[1](5,4) = 4.43;
  // dot product of column 5 of matrix AlphaVV[1] with vector x
  TLinAlg::DotProduct(AlphaVV[1], 5, x);
  // add 0.4 * first column to the second column
  TLinAlg::AddVec(0.4, AlphaVV[1], 0, AlphaVV[1], 1);
  // add 0.4 * first column to vector x
  TLinAlg::AddVec(0.4, AlphaVV[1], 0, x);
  // compute the norm of the fifth column
  double Norm = TLinAlg::Norm(AlphaVV[1], 5);
  // read the fourth row of the matrix
  TFltV z; AlphaVV[1].GetRow(4, z);
  // read the fourth column of the matrix
  TFltV zz; AlphaVV[1].GetCol(4, zz);
  // multiply vector x with matrix BowMatrixV[1]
  BowMatrixV[1].Multiply(x, y);
  BowMatrixV[1].MultiplyT(y, x);
  BowMatrixV[1].Multiply(AlphaVV[1], 0, y);
}
int main(int argc, char* argv[]){ Try; // create environment Env=TEnv(argc, argv, TNotify::StdNotify); // command line parameters Env.PrepArgs("Bag-Of-Words K-Means"); TStr InBowFNm=Env.GetIfArgPrefixStr("-i:", "", "Input-Bow-File"); TStr OutPartFNm=Env.GetIfArgPrefixStr("-op:", "KMeans.BowPart", "Output-BowPartition-File"); TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "KMeans.Txt", "Output-Txt-File"); TStr OutXmlFNm=Env.GetIfArgPrefixStr("-ox:", "KMeans.Xml", "Output-Xml-File"); int Docs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents"); int Clusts=Env.GetIfArgPrefixInt("-clusts:", 10, "Clusters"); int RndSeed=Env.GetIfArgPrefixInt("-rseed:", 1, "RNG-Seed"); int ClustTrials=Env.GetIfArgPrefixInt("-ctrials:", 1, "Clustering-Trials"); double ConvergEps=Env.GetIfArgPrefixFlt("-ceps:", 10, "Convergence-Epsilon"); double CutWordWgtSumPrc=Env.GetIfArgPrefixFlt("-cutww:", 0.5, "Cut-Word-Weight-Sum-Percentage"); int MnWordFq=Env.GetIfArgPrefixInt("-mnwfq:", 5, "Minimal-Word-Frequency"); bool SaveDocNmP=Env.GetIfArgPrefixBool("-sdnm:", false, "Save-Document-Names"); if (Env.IsEndOfRun()){return 0;} // load data if (InBowFNm.Empty()){ TExcept::Throw("No Input-Bow-File specified!");} PBowDocBs BowDocBs=TBowDocBs::LoadBin(InBowFNm); // get doc-ids TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); if (Docs!=-1){AllDIdV.Trunc(Docs);} // get document partition PBowSim BowSim=TBowSim::New(bstCos); // similarity object TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting TSecTm StartTm=TSecTm::GetCurTm(); // get start-time PBowDocPart BowDocPart=TBowClust::GetKMeansPart( TNotify::StdNotify, // log output BowDocBs, // document data BowSim, // similarity function TRnd(RndSeed), // random generator Clusts, // number of clusters ClustTrials, // trials per k-means ConvergEps, // convergence epsilon for k-means 1, // min. 
documents per cluster WordWgtType, // word weighting CutWordWgtSumPrc, // cut-word-weights percentage MnWordFq, // minimal word frequency AllDIdV); // training documents TSecTm EndTm=TSecTm::GetCurTm(); // get end-time printf("Duration: %d secs\n", TSecTm::GetDSecs(StartTm, EndTm)); // output partition if (!OutPartFNm.Empty()){ TFOut SOut(OutPartFNm); BowDocPart->Save(SOut);} if (!OutTxtFNm.Empty()){ BowDocPart->SaveTxt(OutTxtFNm, BowDocBs, true, 15, 0.5, SaveDocNmP);} if (!OutXmlFNm.Empty()){ BowDocPart->SaveXml(OutXmlFNm, BowDocBs);} return 0; Catch; return 1; }
// Button handler: clusters the loaded bag-of-words documents with k-means,
// builds a graph with one vertex per cluster (labelled by the cluster's
// most-significant words) and one edge per strong inter-cluster similarity,
// lays the graph out with simulated annealing and redraws it.
// No-op when no document base is loaded.
// FIX: ClustN1/ClustN2 were declared `double` although TFltIntIntTr stores
// the cluster indices as ints; indexing VrtxV with a double relied on an
// implicit narrowing conversion — they are now plain ints.
void __fastcall TContexterF::VizualizeBtClick(TObject *Sender){
  if (!State->BowDocBs.Empty()){
    // parameters from the form controls (defaults on parse failure)
    int Clusts=TStr(VizClustsEd->Text.c_str()).GetInt(10);
    double ClustSimSumPrc=TStr(VizClustSimSumPrcEd->Text.c_str()).GetFlt(0.3)/100;
    // get doc-ids
    TIntV AllDIdV; State->BowDocBs->GetAllDIdV(AllDIdV);
    // get document partition
    PBowSim BowSim=TBowSim::New(bstCos);      // similarity object
    TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
    State->TmBowDocPart=TBowClust::GetKMeansPart(
     TNotify::StdNotify, // log output
     State->BowDocBs,    // document data
     BowSim,             // similarity function
     TRnd(1),            // random generator
     Clusts,             // number of clusters
     1,                  // trials per k-means
     10,                 // convergence epsilon for k-means
     1,                  // min. documents per cluster
     WordWgtType,        // word weighting
     0.5,                // cut-word-weights percentage
     5,                  // minimal word frequency
     AllDIdV);           // training documents
    // create graph
    PGraph Graph=TGGraph::New();
    // create one vertex per cluster
    TVrtxV VrtxV;
    for (int ClustN=0; ClustN<State->TmBowDocPart->GetClusts(); ClustN++){
      // get cluster
      PBowDocPartClust Clust=State->TmBowDocPart->GetClust(ClustN);
      // build the vertex label from the cluster's best words
      TStrFltPrV WordStrWgtPrV;
      Clust->GetTopWordStrWgtPrV(State->BowDocBs, -1, 1.0, WordStrWgtPrV);
      TChA BestWordVChA;
      BestWordVChA+=TInt::GetStr(Clust->GetDocs())+" Docs\n";
      TStrV UcWordStrSfV;
      for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
        // get word
        TStr UcWordStr=WordStrWgtPrV[WordN].Val1;
        // skip words that are substrings/superstrings of already-taken words
        bool Ok=true;
        for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){
          if (UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;}
          if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;}
        }
        if (!Ok){continue;}
        // add word to the label
        UcWordStrSfV.Add(UcWordStr);
        BestWordVChA+=WordStrWgtPrV[WordN].Val1;
        BestWordVChA+="\n";
        // finish if the label-word limit is reached
        if (UcWordStrSfV.Len()>=15){break;}
      }
      // create vertex
      TStr ClustNm=BestWordVChA;
      PVrtx Vrtx=new TGVrtx(ClustNm);
      Graph->AddVrtx(Vrtx);
      VrtxV.Add(Vrtx);
    }
    // create edges for the strongest cluster-pair similarities
    TFltIntIntTrV ClustSimN1N2TrV;
    State->TmBowDocPart->GetTopClustSimV(ClustSimSumPrc, ClustSimN1N2TrV);
    for (int ClustSimN=0; ClustSimN<ClustSimN1N2TrV.Len(); ClustSimN++){
      double Sim=ClustSimN1N2TrV[ClustSimN].Val1;
      int ClustN1=ClustSimN1N2TrV[ClustSimN].Val2; // cluster index (was double)
      int ClustN2=ClustSimN1N2TrV[ClustSimN].Val3; // cluster index (was double)
      TStr EdgeNm=TFlt::GetStr(Sim, "%.2f");
      PEdge Edge=new TGEdge(VrtxV[ClustN1], VrtxV[ClustN2], EdgeNm, false);
      Graph->AddEdge(Edge);
      Edge->PutWgt(TMath::Sqr(Sim));
    }
    // place graph
    State->TmClustGraph=Graph;
    TRnd Rnd(1);
    State->TmClustGraph->PlaceSimAnnXY(Rnd, State->TmGks);
    // get area-partition
    UpdateClustRectPrV();
    // draw graph
    State->TmGks->Clr();
    TmPbPaint(Sender);
  }
}