Ejemplo n.º 1
0
void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs,
 const uint64& MnTm, const int& MnDocs, const int& MxDocs, const int& Clusts,
 TVec<TStrFltPrV>& EntNmWgtPrVV) const {
  EntNmWgtPrVV.Clr();
  // create bow
  PBowDocBs BowDocBs=TBowDocBs::New();
  // collect documents
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs);
  if (DocIdV.Len()<MnDocs){return;}
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
    // create vector of entity-weights
    TIntFltPrV WIdWgtPrV;
    for (int EntN=0; EntN<Doc->GetEnts(); EntN++){
      int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq);
      TStr EntNm=SkyGridBs->GetEntNm(EntId);
      int EntWId=BowDocBs->AddWordStr(EntNm);
      WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq));
    }
    // create bow-document
    int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV);
    TStr DocDescStr=Doc->GetTitleStr();
    BowDocBs->PutDocDescStr(DId, DocDescStr);
  }
  // k-means clustering
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(1), // random generator
   Clusts, // number of clusters
   1, // trials per k-means
   1, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   0, // cut-word-weights percentage
   0); // minimal word frequency
  EntNmWgtPrVV.Clr();
  for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){
    PBowDocPartClust Clust=BowDocPart->GetClust(ClustN);
    TStrFltPrV WordStrWgtPrV;
    Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV);
    EntNmWgtPrVV.Add(WordStrWgtPrV);
  }
  //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false);
}
Ejemplo n.º 2
0
// Loads a comma-separated file into a bag-of-words document base.
//
// The first line of the file is a header giving, per column, the type of the
// feature generator to create. The file is then read twice: the first pass
// feeds records to the feature generators (stopping once TrainLen records
// have been read), the second pass turns every record into a bow-document
// named "Line-<record number>".
//
// FNm       - name of the CSV file
// ClassId   - zero-based index of the column holding the class value; its
//             header field must be "NOM" or "MULTI-NOM"
// IgnoreIdV - indices of columns that are skipped
// TrainLen  - number of leading records used to update the feature
//             generators and to form the training doc-id set; the remaining
//             doc-ids form the testing set
// Returns the document base with training and testing doc-ids assigned.
// Throws (TExcept::Throw / EAssertR) on an unknown header type, on a record
// whose field count differs from the header, and on errors raised while a
// record is processed (re-thrown with the line number prepended).
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, 
        const TIntV& IgnoreIdV, const int& TrainLen) {

    // feature generators
    PFtrGenBs FtrGenBs = TFtrGenBs::New();
    // CSV parsing stuff
    PSIn SIn = TFIn::New(FNm); 
    char SsCh = ' '; TStrV FldValV;
    // read the headers and initialise the feature generators
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) { 
            // class column: only nominal and multi-nominal are accepted
            if (FldVal == "NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
            } else if (FldVal == "MULTI-NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
            }
        } else if (!IgnoreIdV.IsIn(FldValN)) {
            // regular column: pick the generator whose type name matches
            if (FldVal == TFtrGenNumeric::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
            } else if (FldVal == TFtrGenNominal::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenNominal::New());
            } else if (FldVal == TFtrGenToken::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenToken::New(
                    TSwSet::New(swstNone), TStemmer::New(stmtNone)));
            } else if (FldVal == TFtrGenSparseNumeric::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
            } else if (FldVal == TFtrGenMultiNom::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong type '" + FldVal + "'!");
            }
        }
    }
    const int Flds = FldValV.Len();
    // read the lines and feed them to the feature generators
    int Recs = 0;
    while (!SIn->Eof()) {
        // only the first TrainLen records update the feature generators
        if (Recs == TrainLen) { break; }
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        // (record Recs is on file line Recs+1 because of the header line)
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines
        try {
            TStrV FtrValV;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    FtrGenBs->UpdateCls(FldVal);
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
            FtrGenBs->Update(FtrValV);
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // read the file again and feed it to the training set
    PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
    // we read and ignore the headers since we parsed them already 
    SIn = TFIn::New(FNm); SsCh = ' ';
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    // read the lines and feed them to the training set
    Recs = 0;
    while (!SIn->Eof()){
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        // (the line number is an int, so it must be formatted with %d)
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines, construct the sparse vector and add it to the
        // document base; the add is inside the try so that a failure while
        // converting the values is reported together with the line number
        try {
            TStrV FtrValV; TStr ClsFtrVal;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    ClsFtrVal = FldVal;
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
            // add the feature vector to trainsets
            FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // prepare training and testing doc ids
    TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
    // training set = leading doc-ids, testing set = everything else
    TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
    BowDocBs->PutTrainDIdV(TrainDIdV);
    TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV);
    BowDocBs->PutTestDIdV(TestDIdV);

    return BowDocBs;
}
Ejemplo n.º 3
0
int main(int argc, char* argv[]){
  Try;
  // create environment
  Env=TEnv(argc, argv, TNotify::StdNotify);

  // command line parameters
  Env.PrepArgs("Bag-Of-Words K-Means");
  TStr InBowFNm=Env.GetIfArgPrefixStr("-i:", "", "Input-Bow-File");
  TStr OutPartFNm=Env.GetIfArgPrefixStr("-op:", "KMeans.BowPart", "Output-BowPartition-File");
  TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "KMeans.Txt", "Output-Txt-File");
  TStr OutXmlFNm=Env.GetIfArgPrefixStr("-ox:", "KMeans.Xml", "Output-Xml-File");
  int Docs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents");
  int Clusts=Env.GetIfArgPrefixInt("-clusts:", 10, "Clusters");
  int RndSeed=Env.GetIfArgPrefixInt("-rseed:", 1, "RNG-Seed");
  int ClustTrials=Env.GetIfArgPrefixInt("-ctrials:", 1, "Clustering-Trials");
  double ConvergEps=Env.GetIfArgPrefixFlt("-ceps:", 10, "Convergence-Epsilon");
  double CutWordWgtSumPrc=Env.GetIfArgPrefixFlt("-cutww:", 0.5, "Cut-Word-Weight-Sum-Percentage");
  int MnWordFq=Env.GetIfArgPrefixInt("-mnwfq:", 5, "Minimal-Word-Frequency");
  bool SaveDocNmP=Env.GetIfArgPrefixBool("-sdnm:", false, "Save-Document-Names");
  if (Env.IsEndOfRun()){return 0;}

  // load data
  if (InBowFNm.Empty()){
    TExcept::Throw("No Input-Bow-File specified!");}
  PBowDocBs BowDocBs=TBowDocBs::LoadBin(InBowFNm);

  // get doc-ids
  TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV);
  if (Docs!=-1){AllDIdV.Trunc(Docs);}

  // get document partition
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  TSecTm StartTm=TSecTm::GetCurTm(); // get start-time
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(RndSeed), // random generator
   Clusts, // number of clusters
   ClustTrials, // trials per k-means
   ConvergEps, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   CutWordWgtSumPrc, // cut-word-weights percentage
   MnWordFq, // minimal word frequency
   AllDIdV); // training documents
  TSecTm EndTm=TSecTm::GetCurTm(); // get end-time
  printf("Duration: %d secs\n", TSecTm::GetDSecs(StartTm, EndTm));

  // output partition
  if (!OutPartFNm.Empty()){
    TFOut SOut(OutPartFNm); BowDocPart->Save(SOut);}
  if (!OutTxtFNm.Empty()){
    BowDocPart->SaveTxt(OutTxtFNm, BowDocBs, true, 15, 0.5, SaveDocNmP);}
  if (!OutXmlFNm.Empty()){
    BowDocPart->SaveXml(OutXmlFNm, BowDocBs);}

  return 0;
  Catch;
  return 1;
}