void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs,
 const uint64& MnTm, const int& MnDocs, const int& MxDocs,
 const int& Clusts, TVec<TStrFltPrV>& EntNmWgtPrVV) const {

    EntNmWgtPrVV.Clr();
    // create bow
    PBowDocBs BowDocBs=TBowDocBs::New();
    // collect documents
    TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
    DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs);
    if (DocIdV.Len()<MnDocs){return;}
    for (int DocN=0; DocN<DocIdV.Len(); DocN++){
        int DocId=DocIdV[DocN];
        PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
        // create vector of entity-weights
        TIntFltPrV WIdWgtPrV;
        for (int EntN=0; EntN<Doc->GetEnts(); EntN++){
            int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq);
            TStr EntNm=SkyGridBs->GetEntNm(EntId);
            int EntWId=BowDocBs->AddWordStr(EntNm);
            WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq));
        }
        // create bow-document
        int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV);
        TStr DocDescStr=Doc->GetTitleStr();
        BowDocBs->PutDocDescStr(DId, DocDescStr);
    }
    // k-means clustering
    PBowSim BowSim=TBowSim::New(bstCos); // similarity object
    TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
    PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
        TNotify::StdNotify, // log output
        BowDocBs,           // document data
        BowSim,             // similarity function
        TRnd(1),            // random generator
        Clusts,             // number of clusters
        1,                  // trials per k-means
        1,                  // convergence epsilon for k-means
        1,                  // min. documents per cluster
        WordWgtType,        // word weighting
        0,                  // cut-word-weights percentage
        0);                 // minimal word frequency
    EntNmWgtPrVV.Clr();
    for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){
        PBowDocPartClust Clust=BowDocPart->GetClust(ClustN);
        TStrFltPrV WordStrWgtPrV;
        Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV);
        EntNmWgtPrVV.Add(WordStrWgtPrV);
    }
    //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false);
}
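// Usage sketch (hypothetical, not part of the original code): a minimal
// driver showing how GetEntClustV might be called, assuming a loaded
// TSkyGridBs named SkyGridBs, an entity Ent taken from it, and a time
// threshold MnTm. The parameter values (at least 10 documents, at most
// 1000, 5 clusters) are illustrative, not prescribed by the source.
void PrintEntClustV(const TSkyGridEnt& Ent, const TSkyGridBs* SkyGridBs,
 const uint64& MnTm) {
    TVec<TStrFltPrV> EntNmWgtPrVV;
    Ent.GetEntClustV(SkyGridBs, MnTm, 10, 1000, 5, EntNmWgtPrVV);
    // print the top entity-weight pairs of each cluster
    for (int ClustN = 0; ClustN < EntNmWgtPrVV.Len(); ClustN++) {
        printf("Cluster %d:\n", ClustN);
        const TStrFltPrV& EntNmWgtPrV = EntNmWgtPrVV[ClustN];
        for (int EntN = 0; EntN < EntNmWgtPrV.Len(); EntN++) {
            printf("  %s (%.3f)\n",
                EntNmWgtPrV[EntN].Val1.CStr(), EntNmWgtPrV[EntN].Val2.Val);
        }
    }
}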
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId,
 const TIntV& IgnoreIdV, const int& TrainLen) {

    // feature generators
    PFtrGenBs FtrGenBs = TFtrGenBs::New();
    // CSV parsing stuff
    PSIn SIn = TFIn::New(FNm);
    char SsCh = ' '; TStrV FldValV;
    // read the headers and initialise the feature generators
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
    for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) {
            if (FldVal == "NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
            } else if (FldVal == "MULTI-NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong class type '" + FldVal +
                    "', should be NOM or MULTI-NOM!");
            }
        } else if (!IgnoreIdV.IsIn(FldValN)) {
            if (FldVal == TFtrGenNumeric::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
            } else if (FldVal == TFtrGenNominal::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenNominal::New());
            } else if (FldVal == TFtrGenToken::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenToken::New(
                    TSwSet::New(swstNone), TStemmer::New(stmtNone)));
            } else if (FldVal == TFtrGenSparseNumeric::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
            } else if (FldVal == TFtrGenMultiNom::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong type '" + FldVal + "'!");
            }
        }
    }
    const int Flds = FldValV.Len();
    // read the lines and feed them to the feature generators
    int Recs = 0;
    while (!SIn->Eof()) {
        if (Recs == TrainLen) { break; }
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure the line has the same number of fields as the header
        EAssertR(FldValV.Len() == Flds,
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
                Recs + 1, FldValV.Len(), Flds));
        // go over the fields
        try {
            TStrV FtrValV;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) {
                    FtrGenBs->UpdateCls(FldVal);
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
            FtrGenBs->Update(FtrValV);
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // read the file again and feed it to the training set
    PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
    // read and ignore the headers, since we parsed them already
    SIn = TFIn::New(FNm); SsCh = ' ';
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
    // read the lines and feed them to the training set
    Recs = 0;
    while (!SIn->Eof()){
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure the line has the same number of fields as the header
        EAssertR(FldValV.Len() == Flds,
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
                Recs + 1, FldValV.Len(), Flds));
        // go over the fields and construct the sparse vector
        TStrV FtrValV; TStr ClsFtrVal;
        try {
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) {
                    ClsFtrVal = FldVal;
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!",
                Recs+1, Ex->GetMsgStr().CStr()));
        }
        // add the feature vector to the training set
        FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
    }
    // prepare training and testing doc-ids
    TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV);
    IAssert(AllDIdV.IsSorted());
    TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
    BowDocBs->PutTrainDIdV(TrainDIdV);
    TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV);
    BowDocBs->PutTestDIdV(TestDIdV);
    return BowDocBs;
}
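// Usage sketch (hypothetical, not part of the original code): how LoadCsv
// might be called. It assumes a file "data.csv" whose header row names the
// type of each column (the class column marked NOM or MULTI-NOM, the rest
// matching a feature-generator type), the class label in column 0, no
// ignored columns, and the first 1000 records reserved for training; all of
// these values are illustrative. The SaveBin call assumes the usual
// TBowDocBs binary serialization, matching the LoadBin used by main below.
void LoadCsvExample() {
    TStr CsvFNm = "data.csv"; // hypothetical input file
    TIntV IgnoreIdV;          // empty: use every column
    PBowDocBs BowDocBs = TFtrGenBs::LoadCsv(CsvFNm, 0, IgnoreIdV, 1000);
    printf("Loaded %d documents\n", BowDocBs->GetDocs());
    // persist the bag-of-words so the k-means tool below can load it
    BowDocBs->SaveBin("data.Bow");
}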
int main(int argc, char* argv[]){
    Try;
    // create environment
    Env=TEnv(argc, argv, TNotify::StdNotify);
    // command line parameters
    Env.PrepArgs("Bag-Of-Words K-Means");
    TStr InBowFNm=Env.GetIfArgPrefixStr("-i:", "", "Input-Bow-File");
    TStr OutPartFNm=Env.GetIfArgPrefixStr("-op:", "KMeans.BowPart", "Output-BowPartition-File");
    TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "KMeans.Txt", "Output-Txt-File");
    TStr OutXmlFNm=Env.GetIfArgPrefixStr("-ox:", "KMeans.Xml", "Output-Xml-File");
    int Docs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents");
    int Clusts=Env.GetIfArgPrefixInt("-clusts:", 10, "Clusters");
    int RndSeed=Env.GetIfArgPrefixInt("-rseed:", 1, "RNG-Seed");
    int ClustTrials=Env.GetIfArgPrefixInt("-ctrials:", 1, "Clustering-Trials");
    double ConvergEps=Env.GetIfArgPrefixFlt("-ceps:", 10, "Convergence-Epsilon");
    double CutWordWgtSumPrc=Env.GetIfArgPrefixFlt("-cutww:", 0.5, "Cut-Word-Weight-Sum-Percentage");
    int MnWordFq=Env.GetIfArgPrefixInt("-mnwfq:", 5, "Minimal-Word-Frequency");
    bool SaveDocNmP=Env.GetIfArgPrefixBool("-sdnm:", false, "Save-Document-Names");
    if (Env.IsEndOfRun()){return 0;}
    // load data
    if (InBowFNm.Empty()){ TExcept::Throw("No Input-Bow-File specified!"); }
    PBowDocBs BowDocBs=TBowDocBs::LoadBin(InBowFNm);
    // get doc-ids
    TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV);
    if (Docs!=-1){AllDIdV.Trunc(Docs);}
    // get document partition
    PBowSim BowSim=TBowSim::New(bstCos); // similarity object
    TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
    TSecTm StartTm=TSecTm::GetCurTm(); // get start-time
    PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
        TNotify::StdNotify,  // log output
        BowDocBs,            // document data
        BowSim,              // similarity function
        TRnd(RndSeed),       // random generator
        Clusts,              // number of clusters
        ClustTrials,         // trials per k-means
        ConvergEps,          // convergence epsilon for k-means
        1,                   // min. documents per cluster
        WordWgtType,         // word weighting
        CutWordWgtSumPrc,    // cut-word-weights percentage
        MnWordFq,            // minimal word frequency
        AllDIdV);            // training documents
    TSecTm EndTm=TSecTm::GetCurTm(); // get end-time
    printf("Duration: %d secs\n", TSecTm::GetDSecs(StartTm, EndTm));
    // output partition
    if (!OutPartFNm.Empty()){
        TFOut SOut(OutPartFNm); BowDocPart->Save(SOut);}
    if (!OutTxtFNm.Empty()){
        BowDocPart->SaveTxt(OutTxtFNm, BowDocBs, true, 15, 0.5, SaveDocNmP);}
    if (!OutXmlFNm.Empty()){
        BowDocPart->SaveXml(OutXmlFNm, BowDocBs);}
    return 0;
    Catch;
    return 1;
}
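// Example invocation (file names and the executable name `bowkmeans` are
// assumptions; only the flags are defined by the code above). This clusters
// the documents in data.Bow into 20 clusters with 3 k-means restarts and a
// fixed seed, writing the partition in binary, text, and XML form:
//
//   bowkmeans -i:data.Bow -op:data.BowPart -ot:data.Txt -ox:data.Xml
//             -clusts:20 -ctrials:3 -rseed:1 -ceps:10 -mnwfq:5
//
// Flags left out keep the defaults shown in the GetIfArgPrefix* calls, e.g.
// -docs:-1 (use all documents) and -cutww:0.5.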