Пример #1
0
void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){
  TFOut SOut(FNm);
  int Docs=BowDocBs->GetDocs();
  for (int DId=0; DId<Docs; DId++){
    printf("%d/%d\r", DId+1, Docs);
    // output document-name
    TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId));
    SOut.PutStr(DocNm);
    // output categories
    for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){
      int CId=BowDocBs->GetDocCId(DId, CIdN);
      TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId));
      SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm);
    }
    // output words
    if (UseDocStrP){
      TStr DocStr=BowDocBs->GetDocStr(DId);
//      DocStr.DelChAll('\n'); DocStr.DelChAll('\r');
      SOut.PutCh(' '); SOut.PutStr(DocStr);
    } else {
        int DocWIds=BowDocBs->GetDocWIds(DId);
        int WId; double WordFq;
        for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
          BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq);
          TStr WordStr=BowDocBs->GetWordStr(WId);
          for (int WordFqN=0; WordFqN<WordFq; WordFqN++){
            SOut.PutCh(' '); SOut.PutStr(WordStr);
          }
        }
    }
    SOut.PutLn();
  }
  printf("\n");
}
Пример #2
0
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
    const TStr& CatFNm, const TIntV& _DIdV) {

  TIntV DIdV;
  if (_DIdV.Empty()) {
      BowDocBs->GetAllDIdV(DIdV);
  } else {
      DIdV = _DIdV;
  }
  // generate map of row-ids to words
  TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
  for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) {
    TStr WdStr = BowDocBs->GetWordStr(WId);
    WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1,  WdStr.CStr()));
  }
  WdMapSOut.Flush();
  // generate map of col-ids to document names
  TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
  for (int DocN = 0; DocN < DIdV.Len(); DocN++) {
    const int DId = DIdV[DocN];
    TStr DocNm = BowDocBs->GetDocNm(DId);
    DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId,  DocNm.CStr()));
  }
  DocMapSOut.Flush();
  // save documents' sparse vectors
  TFOut SOut(FNm);
  for (int DocN = 0; DocN < DIdV.Len(); DocN++){
    const int DId = DIdV[DocN];
    PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId);
    const int DocWIds = DocSpV->GetWIds();
    for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
      const int WId = DocSpV->GetWId(DocWIdN);
      const double WordWgt = DocSpV->GetWgt(DocWIdN);
      SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
    }
  }
  SOut.Flush();
  // save documents' category sparse vectors
  if (!CatFNm.Empty()) {
    TFOut CatSOut(CatFNm);
    for (int DocN = 0; DocN < DIdV.Len(); DocN++){
      const int DId = DIdV[DocN];
      const int DocCIds = BowDocBs->GetDocCIds(DId);
      for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
        const int CId = BowDocBs->GetDocCId(DId, DocCIdN);
        const double CatWgt = 1.0;
        CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
      }
    }
    CatSOut.Flush();
  }
}
Пример #3
0
// Builds a matrix whose columns are the sparse vectors of the documents in
// DIdV (in that order) and fills ClsV with the matching class labels:
// 0.99 when the document belongs to category CatNm, -0.99 otherwise.
// The number of rows is the number of words in BowDocBs.
// CatNm must be a known category (asserted).
TBowMatrix::TBowMatrix(PBowDocBs BowDocBs, PBowDocWgtBs BowDocWgtBs,
        const TStr& CatNm,  const TIntV& DIdV, TFltV& ClsV): TMatrix() {

    const int Cols = DIdV.Len();
    RowN = BowDocBs->GetWords();
    // reserve space for one label and one column per document
    ClsV.Gen(Cols, 0);
    ColSpVV.Gen(Cols, 0);
    IAssert(BowDocBs->IsCatNm(CatNm));
    const int TargetCId = BowDocBs->GetCId(CatNm);
    for (int ColN = 0; ColN < Cols; ColN++) {
        const int DocId = DIdV[ColN];
        ColSpVV.Add(BowDocWgtBs->GetSpV(DocId));
        if (BowDocBs->IsCatInDoc(DocId, TargetCId)) {
            ClsV.Add(0.99);
        } else {
            ClsV.Add(-0.99);
        }
    }
}
Пример #4
0
void TNmObjBs::GetNmObjDIdV(
 const PBowDocBs& BowDocBs, TIntV& BowDIdV, 
 const TStr& NmObjStr1, const TStr& NmObjStr2) const {
  // get first named-object-id
  int NmObjId1=GetNmObjId(NmObjStr1);
  TIntV NmObjDocIdV1; GetNmObjDocIdV(NmObjId1, NmObjDocIdV1);
  NmObjDocIdV1.Sort();
  // get second named-object-id
  TIntV NmObjDocIdV2;
  if (!NmObjStr2.Empty()){
    int NmObjId2=GetNmObjId(NmObjStr2);
    GetNmObjDocIdV(NmObjId2, NmObjDocIdV2);
    NmObjDocIdV2.Sort();
  }
  // create joint doc-id-vector
  TIntV NmObjDocIdV;
  if (NmObjDocIdV2.Empty()){
    NmObjDocIdV=NmObjDocIdV1;
  } else {
    NmObjDocIdV1.Intrs(NmObjDocIdV2, NmObjDocIdV);
  }
  // traverse named-object-documents to collect bow-document-ids
  BowDIdV.Gen(NmObjDocIdV.Len(), 0);
  for (int NmObjDocIdN=0; NmObjDocIdN<NmObjDocIdV.Len(); NmObjDocIdN++){
    TStr DocNm=GetDocNm(NmObjDocIdV[NmObjDocIdN]);
    int DId=BowDocBs->GetDId(DocNm);
    if (DId!=-1){
      BowDIdV.Add(DId);
    } 
  }
}
Пример #5
0
// Loads documents from a line-document file (one document per line) into
// BowDocBs. Line layout: [<doc-name>] { !<category>} <text>
//   NewDIdV   - output; cleared, then filled with the ids returned by
//               AddHtmlDoc for every document added
//   NamedP    - when true the first token of the line is the document name;
//               otherwise the running document counter is used as the name
//   MxDocs    - maximal number of documents to load; -1 means no limit
//   SaveDocP  - forwarded to AddHtmlDoc
// Progress is reported on stdout.
// NOTE(review): the read loops stop as soon as FIn.Eof() becomes true without
// appending the character read last, so the final character of a file that
// does not end with a line break appears to be dropped - confirm against
// TFIn::Eof semantics.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr(); TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    // Docs is the 1-based number of the document about to be read, so the
    // limit is exceeded only when Docs>MxDocs (the former '>=' test loaded
    // one document too few; LoadHtmlTxt uses the same '>' test)
    Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name does not count as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
        DocNm = TInt::GetStr(Docs);
    }
    // categories: zero or more tokens of the form !<name>
    TStrV CatNmV;
    forever {
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text: the rest of the line
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // check document-base consistency
  BowDocBs->AssertOk();
  printf("\n");
}
Пример #6
0
void TFtrGen::AddWds(const TStr& Prefix, 
        const PBowDocBs& BowDocBs, int& Offset) const {

    const int Vals = GetVals();
    for (int ValN = 0; ValN < Vals; ValN++) {
        const int WId = BowDocBs->AddWordStr(
            TStr::Fmt("%s-%s", Prefix.CStr(), GetVal(ValN).CStr()));
        IAssert(Offset == WId); Offset++;
    }
}
Пример #7
0
// Creates a centroid model for category CatNm.
//   BowDocBs    - document base providing categories and the word count
//   BowDocWgtBs - weighted document vectors used to compute the centroid
//   TrainDIdV   - training documents; only those in category CatNm contribute
// The centroid is the concept vector (TBowClust::GetConceptSpV with cosine
// similarity) of the selected documents, stored as a dense vector with one
// entry per word.
PBowMd TBowCentroidMd::New(const PBowDocBs& BowDocBs, 
		const PBowDocWgtBs& BowDocWgtBs, const TStr& CatNm, 
		const TIntV& TrainDIdV) {

	// create model; BowMd is the owning handle that is returned to the caller
	TBowCentroidMd* CentroidMd = new TBowCentroidMd(BowDocBs); 
	PBowMd BowMd(CentroidMd); CentroidMd->CatNm = CatNm;
	// select training documents that belong to the category
	TIntV CatDIdV; const int CId = BowDocBs->GetCId(CatNm);
	for (int TrainDIdN = 0; TrainDIdN < TrainDIdV.Len(); TrainDIdN++) {
		const int DId = TrainDIdV[TrainDIdN];
		if (BowDocBs->IsCatInDoc(DId, CId)) { CatDIdV.Add(DId); }
	}
	// compute centroid and store it as a dense vector
	PBowSim BowSim = TBowSim::New(bstCos);
	PBowSpV CentroidSpV = TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, CatDIdV);	
	CentroidMd->CentroidV.Gen(BowDocBs->GetWords());
	CentroidMd->CentroidV.PutAll(0.0);
	TBowLinAlg::AddVec(1.0, CentroidSpV, CentroidMd->CentroidV);
	// return the existing handle instead of implicitly wrapping the raw
	// pointer a second time (same convention as the other model factories)
	return BowMd;
}
Пример #8
0
// Builds an ontology grounding in which the concept vector of every term of
// language LangNm is the similarity-weighted sum of the document vectors
// returned by a similarity search for the term name.
// Terms of other languages get no concept vector. Progress goes to stdout.
PLwOntoGround TLwOntoGround::GetOntoGroundNN(const PLwOnto& LwOnto, 
        const PBowDocBs& BowDocBs, const TStr& LangNm) {

    printf("Generating Ontology-Classifier...\n");
    // shortcuts
    PLwTermBs TermBs=LwOnto->GetTermBs();
    const int TermCount = TermBs->GetTerms();
    const int TargetLangId = LwOnto->GetLangBs()->GetLangId(LangNm);
    const int WordCount = BowDocBs->GetWords();
    // normalized TFIDF weights and cosine similarity
    printf("  Creating BowDocWgtBs ...");
    PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
    PBowSim BowSim=TBowSim::New(bstCos);
    printf(" Done.\n");
    // one concept vector per term of the requested language
    printf("  Collecting documents per ontology-term ... ");
    THash<TInt, PBowSpV> TermIdToConceptSpVH;
    int TermN = 0;
    while (TermN < TermCount) {
        const int TermId = TermBs->GetTermId(TermN);
        TermN++;
        PLwTerm Term = TermBs->GetTerm(TermId);
        if (Term->GetLangId() == TargetLangId) {
            // use the term name as a query against the document base
            PBowSpV QuerySpV = BowDocBs->GetSpVFromHtmlStr(
                Term->GetTermNm(), BowDocWgtBs);
            TFltIntKdV SimDIdKdV;
            BowDocWgtBs->GetSimDIdV(QuerySpV, BowSim, SimDIdKdV, false);
            // accumulate the hits, each weighted by its similarity
            TFltV ConceptV(WordCount); ConceptV.PutAll(0.0);
            const int Hits = SimDIdKdV.Len();
            for (int HitN = 0; HitN < Hits; HitN++) {
                PBowSpV HitSpV = BowDocWgtBs->GetSpV(SimDIdKdV[HitN].Dat);
                const double HitSim = SimDIdKdV[HitN].Key;
                TBowLinAlg::AddVec(HitSim, HitSpV, ConceptV);
            }
            TermIdToConceptSpVH.AddDat(TermId, TBowSpV::New(-1, ConceptV, TFlt::Eps));
        }
    }
    printf("  Done.\n");
    // create & return classifier
    PLwOntoGround Result = TLwOntoGround::New(LwOnto, 
        BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
    printf("Done.\n");
    return Result;
}
Пример #9
0
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs,
		const TStr& DocNm, const TStrV& FtrValV) const {

    TIntFltKdV FtrSpV; GenFtrV(FtrValV, FtrSpV);
    // make KdV to PrV
    const int WIds = FtrSpV.Len(); TIntFltPrV WIdWgtPrV(WIds, 0);
    for (int WIdN = 0; WIdN < WIds; WIdN++) {
        WIdWgtPrV.Add(TIntFltPr(FtrSpV[WIdN].Key, FtrSpV[WIdN].Dat));
    }
    // add the feature vector to trainsets
    BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV);
}
Пример #10
0
// Trains one Winnow model per category for the categories returned by
// GetTopCatV(TopCats, ...) and bundles them into a multi-model.
//   Beta - forwarded to TBowWinnowMd::New for every category
// A header line is printed to stdout for each category.
PBowMd TBowWinnowMd::NewMulti(
 const PBowDocBs& BowDocBs, const int& TopCats, const double& Beta){
  // container model; ResultMd is the handle handed back to the caller
  TBowMultiMd* MultiMd=new TBowMultiMd(BowDocBs);
  PBowMd ResultMd(MultiMd);
  // select the categories to train on
  TIntStrPrV FqCatNmPrV;
  BowDocBs->GetTopCatV(TopCats, FqCatNmPrV);
  const int SelCats=FqCatNmPrV.Len();
  for (int SelCatN=0; SelCatN<SelCats; SelCatN++){
    const TStr CatNm=FqCatNmPrV[SelCatN].Val2;
    const int CatId=BowDocBs->GetCId(CatNm);
    // output header
    printf("*** Generating model for category: '%s' %d Docs (%d/%d Cats)\n",
     CatNm.CStr(), BowDocBs->GetCatFq(CatId), 1+CatId, BowDocBs->GetCats());
    // train the per-category model and register it
    PBowMd CatMd=New(BowDocBs, CatNm, Beta);
    MultiMd->AddBowMd(CatMd);
  }
  return ResultMd;
}
Пример #11
0
/////////////////////////////////////////////////
// BagOfWords-Files

// Adds every existing file found under FPath to BowDocBs as an HTML document.
//   NewDIdV     - output; cleared, then filled with the ids returned by AddHtmlDoc
//   RecurseDirP - forwarded to the directory traversal (TFFile)
//   MxDocs      - maximal number of traversed files; -1 means no limit
//   SaveDocP    - forwarded to AddHtmlDoc
//   Notify      - receives status messages
// The document name is the lower-cased file name with the normalized FPath
// prefix removed; the part before the first '/' (or, failing that, '\\')
// becomes the document's single category.
void TBowFl::LoadHtmlTxt(
 PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV,
 const bool& RecurseDirP, const int& MxDocs,
 const bool& SaveDocP, const PNotify& Notify) {
  // prepare file-directory traversal
  const TStr PathPrefix=TStr::GetNrFPath(FPath).GetLc();
  Notify->OnStatus("Creating Bow from file-path " + FPath + " ...");
  TFFile FFile(FPath, "", RecurseDirP);
  // traverse files
  NewDIdV.Clr();
  int Docs=0;
  TStr FNm;
  while (FFile.Next(FNm)){
    Docs++;
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    Notify->OnStatus(TStr::Fmt("%d\r", Docs));
    // entries that do not exist as files are skipped
    if (!TFile::Exists(FNm)) {continue;}
    // document name: lower-case file name relative to the input path
    TStr DocNm=FNm.GetLc();
    if (DocNm.IsPrefix(PathPrefix)){
      DocNm=DocNm.GetSubStr(PathPrefix.Len(), DocNm.Len()-1);}
    // category: leading path component of the document name, if any
    TStr CatNm;
    if (DocNm.IsChIn('/')){
      TStr RestStr; DocNm.SplitOnCh(CatNm, '/', RestStr);
    } else if (DocNm.IsChIn('\\')){
      TStr RestStr; DocNm.SplitOnCh(CatNm, '\\', RestStr);
    }
    TStrV CatNmV;
    if (!CatNm.Empty()){CatNmV.Add(CatNm);}
    // load the content and add the document to the bow
    const TStr DocStr=TStr::LoadTxt(FNm);
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP));
  }
  Notify->OnStatus(TStr::Fmt("%d", Docs));
  // finish
  Notify->OnStatus("Done.");
  BowDocBs->AssertOk();
}
Пример #12
0
// Computes the centroid (concept vector) of this entity's documents and
// returns its words with weights.
//   MnTm                         - forwarded to GetDocIdV to select documents
//   TopWords, TopWordsWgtSumPrc  - forwarded to GetWordStrWgtPrV
//   Docs                         - output; number of documents selected by GetDocIdV
//   WordStrWgtPrV                - output; (word, weight) pairs of the centroid
// Bow documents are looked up by name, the name being the decimal doc-id.
void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs,
 const PBowDocBs& BowDocBs, const PBowDocWgtBs& BowDocWgtBs,
 const uint64& MnTm, const int& TopWords, const double& TopWordsWgtSumPrc,
 int& Docs, TStrFltPrV& WordStrWgtPrV) const {
  // get doc-ids
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  // translate to bow-document-ids
  TIntV BowDIdV(DocIdV.Len(), 0);
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    TStr BowDocNm=TInt::GetStr(DocId);
    int BowDId=BowDocBs->GetDId(BowDocNm);
    // GetDId presumably yields -1 for a name missing from the bow (the same
    // check is made in TNmObjBs::GetNmObjDIdV); such documents must not be
    // passed on as document ids to the centroid computation
    if (BowDId!=-1){
      BowDIdV.Add(BowDId);
    }
  }
  // create concept vector
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, BowDIdV);
  // get docs & word-vector
  Docs=DocIdV.Len();
  ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV);
}
Пример #13
0
// Trains a Winnow model for category CatNm on all documents of BowDocBs.
//   Beta - multiplicative penalty applied to the weights of "mini-experts"
//          (one positive and one negative expert per word)
// Throws (TExcept) if CatNm is not a category of BowDocBs.
// Training runs for at most 50 passes over the documents and stops early
// after 3 consecutive passes that are judged worse than their predecessor.
// Per-pass precision/recall/F1/accuracy are printed to stdout.
PBowMd TBowWinnowMd::New(
 const PBowDocBs& BowDocBs, const TStr& CatNm, const double& Beta){
  // create model
  TBowWinnowMd* WinnowMd=new TBowWinnowMd(BowDocBs); PBowMd BowMd(WinnowMd);
  WinnowMd->CatNm=CatNm;
  WinnowMd->Beta=Beta;
  WinnowMd->VoteTsh=0.5;
  // prepare Winnow parameters
  const double MnExpertWgtSum=1e-15;
  // get cat-id
  int CId=BowDocBs->GetCId(CatNm);
  if (CId==-1){
    TExcept::Throw(TStr::GetStr(CatNm, "Invalid Category Name ('%s')!"));}
  // get training documents
  TIntV TrainDIdV; BowDocBs->GetAllDIdV(TrainDIdV);
  int TrainDocs=TrainDIdV.Len();
  // prepare mini-experts: all weights start at 1
  int Words=BowDocBs->GetWords();
  WinnowMd->PosExpertWgtV.Gen(Words); WinnowMd->PosExpertWgtV.PutAll(1);
  WinnowMd->NegExpertWgtV.Gen(Words); WinnowMd->NegExpertWgtV.PutAll(1);
  // winnow loop
  double PrevAcc=0; double PrevPrec=0; double PrevRec=0; double PrevF1=0;
  const double MxDiff=-0.005; const int MxWorseIters=3; int WorseIters=0;
  const int MxIters=50; int IterN=0;
  while ((IterN<MxIters)&&(WorseIters<MxWorseIters)){
    IterN++;
    int FalsePos=0; int FalseNeg=0; int TruePos=0; int TrueNeg=0;
    for (int DIdN=0; DIdN<TrainDocs; DIdN++){
      int DId=TrainDIdV[DIdN];
      bool ClassVal=BowDocBs->IsCatInDoc(DId, CId);
      double PosWgt=0; double NegWgt=0;
      double OldSum=0; double NewSum=0;
      int WIds=BowDocBs->GetDocWIds(DId);
      // change only experts of words that occur in the document
      for (int WIdN=0; WIdN<WIds; WIdN++){
        int WId=BowDocBs->GetDocWId(DId, WIdN);
        OldSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // penalize expert giving wrong class prediction
        if (ClassVal){
          WinnowMd->NegExpertWgtV[WId]*=Beta;
        } else {
          WinnowMd->PosExpertWgtV[WId]*=Beta;
        }
        NewSum+=WinnowMd->PosExpertWgtV[WId]+WinnowMd->NegExpertWgtV[WId];
        // votes are taken from the already penalized (not yet rescaled) weights
        PosWgt+=WinnowMd->PosExpertWgtV[WId];
        NegWgt+=WinnowMd->NegExpertWgtV[WId];
      }
      // rescale the document's experts so that their weight sum is unchanged
      if (NewSum>MnExpertWgtSum){
        for (int WIdN=0; WIdN<WIds; WIdN++){
          int WId=BowDocBs->GetDocWId(DId, WIdN);
          WinnowMd->PosExpertWgtV[WId]*=OldSum/NewSum;
          WinnowMd->NegExpertWgtV[WId]*=OldSum/NewSum;
        }
      }
      // predict: positive if the positive share of the vote exceeds VoteTsh;
      // random if the document casts no vote at all
      bool PredClassVal;
      if (PosWgt+NegWgt==0){PredClassVal=TBool::GetRnd();}
      else {PredClassVal=(PosWgt/(PosWgt+NegWgt))>WinnowMd->VoteTsh;}
      if (PredClassVal==ClassVal){
        if (PredClassVal){TruePos++;} else {TrueNeg++;}
      } else {
        if (PredClassVal){FalsePos++;} else {FalseNeg++;}
      }
    }
    // calculate temporary results
    if (TrainDocs==0){break;}
    double Acc=100*(TruePos+TrueNeg)/double(TrainDocs);
    double Prec=0; double Rec=0; double F1=0;
    if (TruePos+FalsePos>0){
      Prec=(TruePos/double(TruePos+FalsePos));}
    // recall needs its own guard: without positive training documents
    // TruePos+FalseNeg is 0 and the division would yield NaN
    if (TruePos+FalseNeg>0){
      Rec=(TruePos/double(TruePos+FalseNeg));}
    if (Prec+Rec>0){
      F1=(2*Prec*Rec/(Prec+Rec));}
    // check if the current iteration gave worse results than the previous
    if (((Acc-PrevAcc)<MxDiff)||((F1-PrevF1)<MxDiff)||(((Prec-PrevPrec)<MxDiff)&&
     ((Rec-PrevRec)<MxDiff))){WorseIters++;}
    else {WorseIters=0;}
    PrevAcc=Acc; PrevPrec=Prec; PrevRec=Rec; PrevF1=F1;
    printf("%d. Precision:%0.3f   Recall:%0.3f   F1:%0.3f   Accuracy:%0.3f%%\n",
     IterN, Prec, Rec, F1, Acc);
  }
  // return model
  return BowMd;
}
Пример #14
0
// Loads a comma-separated file into a bag-of-words document base.
// The first line of the file holds the type of every column; the remaining
// lines are records.
//   FNm       - input file
//   ClassId   - index of the class column; its type must be "NOM" or "MULTI-NOM"
//   IgnoreIdV - indices of columns to ignore
//   TrainLen  - number of leading records used to initialise the feature
//               generators and stored as training documents; the remaining
//               documents are stored as test documents
// The file is read twice: the first pass feeds at most TrainLen records to the
// feature generators, the second pass converts every record to a document
// named "Line-<n>".
// Throws on an unknown column type, on a record whose field count differs from
// the header, and on errors raised while updating the feature generators.
PBowDocBs TFtrGenBs::LoadCsv(TStr& FNm, const int& ClassId, 
        const TIntV& IgnoreIdV, const int& TrainLen) {

    // feature generators
    PFtrGenBs FtrGenBs = TFtrGenBs::New();
    // CSV parsing stuff
    PSIn SIn = TFIn::New(FNm); 
    char SsCh = ' '; TStrV FldValV;
    // read the headers and initialise the feature generators
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
        const TStr& FldVal = FldValV[FldValN];
        if (FldValN == ClassId) { 
            if (FldVal == "NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenNominal::New());
            } else if (FldVal == "MULTI-NOM") {
                FtrGenBs->PutClsFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong class type '" + FldVal + "', should be NOM or MULTI-NOM!");
            }
        } else if (!IgnoreIdV.IsIn(FldValN)) {
            if (FldVal == TFtrGenNumeric::GetType()) {
                FtrGenBs->AddFtrGen(TFtrGenNumeric::New());
            } else if (FldVal == TFtrGenNominal::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenNominal::New());
            } else if (FldVal == TFtrGenToken::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenToken::New(
                    TSwSet::New(swstNone), TStemmer::New(stmtNone)));
            } else if (FldVal == TFtrGenSparseNumeric::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenSparseNumeric::New());
            } else if (FldVal == TFtrGenMultiNom::GetType()) { 
                FtrGenBs->AddFtrGen(TFtrGenMultiNom::New());
            } else {
                TExcept::Throw("Wrong type '" + FldVal + "'!");
            }
        }
    }
    const int Flds = FldValV.Len();
    // first pass: feed the leading TrainLen records to the feature generators
    int Recs = 0;
    while (!SIn->Eof()) {
        if (Recs == TrainLen) { break; }
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header
        // (reported line number is Recs+1 because of the header line)
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // split the record into class value and feature values
        try {
            TStrV FtrValV;
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    FtrGenBs->UpdateCls(FldVal);
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
            FtrGenBs->Update(FtrValV);
        } catch (PExcept Ex) {
            // re-throw with the line number attached
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
    }
    // read the file again and feed it to the training set
    PBowDocBs BowDocBs = FtrGenBs->MakeBowDocBs();
    // we read and ignore the headers since we parsed them already 
    SIn = TFIn::New(FNm); SsCh = ' ';
    TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);  
    // second pass: convert every record to a document
    Recs = 0;
    while (!SIn->Eof()){
        Recs++; printf("%7d\r", Recs);
        TSs::LoadTxtFldV(ssfCommaSep, SIn, SsCh, FldValV, false);
        // make sure line still has the same number of fields as the header;
        // the line number is an int, so it must be formatted with %d (the
        // former %s made the formatter read the int as a string pointer)
        EAssertR(FldValV.Len() == Flds, 
            TStr::Fmt("Wrong number of fields in line %d! Found %d and expected %d!",
            Recs + 1, FldValV.Len(), Flds));
        // go over lines and construct the sparse vector
        TStrV FtrValV; TStr ClsFtrVal;
        try {
            for (int FldValN = 0; FldValN < FldValV.Len(); FldValN++) {
                const TStr& FldVal = FldValV[FldValN];
                if (FldValN == ClassId) { 
                    ClsFtrVal = FldVal;
                } else if (!IgnoreIdV.IsIn(FldValN)) {
                    FtrValV.Add(FldVal);
                }
            }
        } catch (PExcept Ex) {
            TExcept::Throw(TStr::Fmt("Error in line %d: '%s'!", 
                Recs+1, Ex->GetMsgStr().CStr()));
        }
        // add the feature vector to trainsets
        FtrGenBs->AddBowDoc(BowDocBs, TStr::Fmt("Line-%d", Recs), FtrValV, ClsFtrVal);
    }
    // prepare training and testing doc ids: the first TrainLen ids are the
    // training set, all remaining ids the test set
    TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); IAssert(AllDIdV.IsSorted());
    TIntV TrainDIdV = AllDIdV; TrainDIdV.Trunc(TrainLen);
    BowDocBs->PutTrainDIdV(TrainDIdV);
    TIntV TestDIdV = AllDIdV; TestDIdV.Minus(TrainDIdV);
    BowDocBs->PutTestDIdV(TestDIdV);

    return BowDocBs;
}
Пример #15
0
// Command-line tool "Text To Bag-Of-Words": loads documents from exactly one
// of the supported input formats (the first non-empty input option in the
// order tested below wins), saves the resulting bag-of-words to the output
// file and optionally writes bow and n-gram statistics next to it.
// Returns 0 on success (or when only usage was requested) and 1 when the
// Try/Catch wrapper intercepts an exception.
int main(int argc, char* argv[]){
  Try;
  // create environment
  Env=TEnv(argc, argv, TNotify::StdNotify);

  // command line parameters
  Env.PrepArgs("Text To Bag-Of-Words");
  TStr InFPath=Env.GetIfArgPrefixStr("-ihtml:", "", "Input-Html-Path");
  TStr InMtxFNm=Env.GetIfArgPrefixStr("-imtx:", "", "Input-Matrix-File");
  TStr InTabFNm=Env.GetIfArgPrefixStr("-itab:", "", "Input-Tab-File");
  TStr InTsactFNm=Env.GetIfArgPrefixStr("-itsc:", "", "Input-Transaction-File");
  TStr InSparseFNm=Env.GetIfArgPrefixStr("-ispr:", "", "Input-Sparse-File");
  TStr InSvmLightFNm=Env.GetIfArgPrefixStr("-isvml:", "", "Input-SvmLight-File");
  TStr InCpdFNm=Env.GetIfArgPrefixStr("-icpd:", "", "Input-CompactDocuments-File");
  TStr InTBsFNm=Env.GetIfArgPrefixStr("-itbs:", "", "Input-TextBase-File");
  TStr InLnDocFNm=Env.GetIfArgPrefixStr("-ilndoc:", "", "Input-LineDocuments-File");
  TStr InNmLnDocFNm=Env.GetIfArgPrefixStr("-inlndoc:", "", "Input-Named-LineDocuments-File");
  TStr InReuters21578FPath=Env.GetIfArgPrefixStr("-ir21578:", "", "Input-Reuters21578-Path");
  TStr InCiaWFBFPath=Env.GetIfArgPrefixStr("-iciawfb:", "", "Input-CIA-World-Fact-Book-Path");
  TStr InDaxFNm=Env.GetIfArgPrefixStr("-idax:", "", "Input-DocumentAtlasXML-File");
  TStr OutBowFNm=Env.GetIfArgPrefixStr("-o:", "Out.Bow", "Bow-Output-File (.Bow)");
  bool OutStatP=Env.GetIfArgPrefixBool("-ostat:", true, "Output-Statistics (*.Txt)");
  int Recs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents-To-Process");
  bool RecurseDirP=Env.GetIfArgPrefixBool("-recurse:", false, "Recurse-Directories");
  TStr SwSetTypeNm=Env.GetIfArgPrefixStr("-stopword:", "en523", "Stop-Word-Set "+TSwSet::GetSwSetTypeNmVStr());
  TStr SwSetFNm=Env.GetIfArgPrefixStr("-istopword:", "", "External-Stop-Word-Set-File");
  TStr StemmerTypeNm=Env.GetIfArgPrefixStr("-stemmer:", "porter", "Stemmer "+TStemmer::GetStemmerTypeNmVStr());
  int MxNGramLen=Env.GetIfArgPrefixInt("-ngramlen:", 3, "Max-NGram-Length");
  int MnNGramFq=Env.GetIfArgPrefixInt("-ngramfq:", 5, "Min-NGram-Frequency");
  bool SaveDocP=Env.GetIfArgPrefixBool("-savedoc:", false, "Save-Document-Text");
  if (Env.IsEndOfRun()){return 0;}
  // example invocations:
  // -isvml:SvmLightTrain.Dat -o:SvmLight.Bow
  // -ir21578:f:\data\Reuters21578 -o:Reuters21578.Bow
  // -inlndoc:c:\data\yahoocompanies\CompProfilesSymbols.txt
  // -ihtml:c:\data\cordis\fp6

  // bag-of-words to create
  PBowDocBs BowDocBs;

  // load input data
  if (!InFPath.Empty()){ // directory-files
    // prepare stop-words
    PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
    if (!SwSetFNm.Empty()) { SwSet->LoadFromFile(SwSetFNm); }
    // prepare stemmer
    PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
    // load bow
    BowDocBs=TBowFl::LoadHtmlTxt(InFPath, RecurseDirP, Recs,
     SwSet, Stemmer, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InMtxFNm.Empty()){ // matrix-file
    PBowSimMtx BowSimMtx=TBowSimMtx::LoadTxt(InMtxFNm);
    BowDocBs=TBowFl::LoadFromSimMtx(BowSimMtx);
  } else
  if (!InTabFNm.Empty()){ // tab-file
    BowDocBs=TBowFl::LoadTabTxt(InTabFNm, Recs);
  } else
  if (!InTsactFNm.Empty()){ // transaction-file
    BowDocBs=TBowFl::LoadTsactTxt(InTsactFNm, Recs);
  } else
  if (!InSparseFNm.Empty()){ // sparse-file
    TStr DocDefFNm=TStr::PutFExt(InSparseFNm, ".tup");
    TStr WordDefFNm=TStr::PutFExt(InSparseFNm, ".var");
    TStr TrainDataFNm=InSparseFNm;
    BowDocBs=TBowFl::LoadSparseTxt(DocDefFNm, WordDefFNm, TrainDataFNm, Recs);
  } else
  if (!InSvmLightFNm.Empty()){ // SvmLight-file
    // definition files are derived from the SvmLight file name; InSparseFNm
    // is always empty in this branch and must not be used here
    TStr DocDefFNm=TStr::PutFExt(InSvmLightFNm, ".tup");
    TStr WordDefFNm=TStr::PutFExt(InSvmLightFNm, ".var");
    TStr TrainDataFNm=InSvmLightFNm;
    BowDocBs=TBowFl::LoadSvmLightTxt(DocDefFNm, WordDefFNm, TrainDataFNm, "", Recs);
  } else
  if (!InTBsFNm.Empty()){ // Text-Base-file
    // the loader is disabled; fail explicitly instead of continuing with an
    // empty BowDocBs pointer that would be dereferenced when saving
    //BowDocBs=TBowFl::LoadTBsTxt(InTBsFNm, Recs);
    TExcept::Throw("Input-TextBase-File is not supported!");
  } else
  if (!InCpdFNm.Empty()){ // Compact-Doc-file
    BowDocBs=TBowFl::LoadCpdTxt(InCpdFNm, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else
  if (!InLnDocFNm.Empty()){ // Line-Documents-file
    BowDocBs=TBowFl::LoadLnDocTxt(InLnDocFNm, false, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InNmLnDocFNm.Empty()){ // Named-Line-Documents-file
    BowDocBs=TBowFl::LoadLnDocTxt(InNmLnDocFNm, true, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq, SaveDocP);
  } else
  if (!InReuters21578FPath.Empty()){ // Reuters-21578-file
    BowDocBs=TBowFl::LoadReuters21578Txt(InReuters21578FPath, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else 
  if (!InCiaWFBFPath.Empty()){ // CIA-World-Fact-Book
    BowDocBs=TBowFl::LoadCiaWFBTxt(InCiaWFBFPath, Recs,
     SwSetTypeNm, StemmerTypeNm, MxNGramLen, MnNGramFq);
  } else 
  if (!InDaxFNm.Empty()) { // DocumentAtlasXml-File
    PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
    PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
    BowDocBs=TVizMapXmlDocBs::LoadBowDocBs(InDaxFNm, 
     SwSet, Stemmer, MxNGramLen, MnNGramFq);
  } else {
    TExcept::Throw("No Input-File specified!");
  }

  // save bow-file
  if (!OutBowFNm.Empty()){
    // PutFExtIfEmpty returns the adjusted name; keep it, otherwise the
    // default extension is never applied
    OutBowFNm=TStr::PutFExtIfEmpty(OutBowFNm, ".Bow");
    printf("Saving Bow to '%s' ...", OutBowFNm.CStr());
    BowDocBs->SaveBin(OutBowFNm);
    printf(" Done.\n");
  }

  // save statistics
  if ((OutStatP)&&(!OutBowFNm.Empty())){
    // save bow-statistics
    TStr OutBowStatFNm=TStr::PutFExt(OutBowFNm, ".Bow.Txt");
    if (!OutBowStatFNm.Empty()){
      printf("Saving Bow-Statistics to '%s' ...", OutBowStatFNm.CStr());
      BowDocBs->SaveTxtStat(OutBowStatFNm);
      printf(" Done.\n");
    }
    // save ngram-statistics (only if the bow carries an n-gram base)
    PNGramBs NGramBs=BowDocBs->GetNGramBs();
    TStr OutNGramStatFNm=TStr::PutFExt(OutBowFNm, ".NGram.Txt");
    if (!NGramBs.Empty()){
      printf("Saving NGram-Statistics to '%s' ...", OutNGramStatFNm.CStr());
      NGramBs->SaveTxt(OutNGramStatFNm, true);
      printf(" Done.\n");
    }
  }

  return 0;
  Catch;
  return 1;
}
Пример #16
0
// Builds an ontology grounding from categorized documents: every ontology
// term gets a concept vector computed from the documents assigned to the term
// itself and to its (transitive) "NT"-linked sub-terms.
//   LwOnto           - ontology providing terms, links, link types, languages
//   BowDocBs         - documents whose categories refer to ontology terms
//   LangNm           - language used to resolve category names to terms
//   DocCatIsTermIdP  - when true a category name is parsed as a numeric
//                      term-id; otherwise it is looked up as a term name in
//                      language LangNm
//   CutWordWgtSumPrc - forwarded to TBowClust::GetConceptSpV
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();
  // create tfidf
  printf("  Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // collect documents per ontology-term
  // (PosCats/NegCats count categories that could / could not be mapped to a term)
  printf("  Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf("    Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // get term-id
      if (DocCatIsTermIdP){
        // category name is the term-id itself
        int TermId=CatNm.GetInt();
        if (TermBs->IsTermId(TermId)){
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      } else {
        // category name is a term name in the requested language
        if (TermBs->IsTermId(CatNm, LangId)){
          int TermId=TermBs->GetTermId(CatNm, LangId);
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      }
    }
  }
  printf("    Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf("  Done.\n");
  // create sub-terms & up-terms vectors from links of type "NT"
  // (Const_TermIdToSubTermIdVH is filled here but not read again in this function)
  printf("  Creating sub-terms & up-terms vectors ...");
  TIntIntVH Const_TermIdToSubTermIdVH;
  TIntIntVH TermIdToSubTermIdVH;
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        Const_TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf("   Done.\n");
  // create centroids bottom-up: a term is processed once all of its sub-terms
  // have been processed; its documents are then handed to its up-terms
  printf("  Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  TIntIntVH TermIdToSubTermDIdVH; // documents inherited from processed sub-terms
  TIntH ProcTermIdH; // terms already processed
  int PrevActiveTerms=-1;
  forever{
    // count active nodes for processing (terms with unprocessed sub-terms)
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    printf("    Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // process unprocessed terms that have no pending sub-terms left
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((!TermIdToSubTermIdVH.IsKey(TermId))||
       (TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
        printf("    %d/%d\r", 1+TermN, Terms);
        ProcTermIdH.AddKey(TermId);
        // collect document-ids: own documents plus those of processed sub-terms
        TIntV TermDIdV;
        if (TermIdToDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
        if (TermIdToSubTermDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
        // create concept-vector if any documents
        if (TermDIdV.Len()>0){
          PBowSpV ConceptSpV=
           TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
          TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
        }
        // correct upper-term: remove this term from the pending sub-terms of
        // every up-term and pass the collected documents upwards
        if (TermIdToUpTermIdVH.IsKey(TermId)){
          TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
          for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
            int UpTermId=UpTermIdV[UpTermIdN];
            TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
            if (TermDIdV.Len()>0){
              TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
          }
        }
      }
    }
  }
  printf("  Done.\n");
  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
Пример #17
0
// Draws the names of the most frequent document categories of the current
// frame, each at the mean position of the points carrying that category.
//   Gks           - graphics context to draw on
//   KeyWdFontSize - base font size; category names use this size + 3
//   PointNmRectV  - in/out; the screen rectangles of the drawn names are appended
// At most 6 names are drawn (4 when there are 100 points or fewer). A category
// is skipped when it has no entry in CatFullNmH, lies outside the zoom
// rectangle, is too close to an already drawn name or overlaps one.
void TVizMapContext::PaintCatNms(PGks Gks, const int& KeyWdFontSize, 
        TVec<TFltRect>& PointNmRectV) {
        
    // calculate frequency of categories and the sum of the positions of
    // their points (divided by the frequency further below)
    TIntH CatH; TIntFltPrH CatPosH;
    PBowDocBs BowDocBs = VizMapFrame->GetKeyWdBow();
    const int Points = VizMapFrame->GetPoints();
    for (int PointN = 0; PointN < Points; PointN++) {
        PVizMapPoint Point = VizMapFrame->GetPoint(PointN);
        const int DId = Point->GetDocId();
        const int CIds = BowDocBs->GetDocCIds(DId);
        for (int CIdN = 0; CIdN < CIds; CIdN++) {
            const int CId = BowDocBs->GetDocCId(DId, CIdN);
            CatH.AddDat(CId)++;
            CatPosH.AddDat(CId).Val1 += Point->GetPointX();
            CatPosH.AddDat(CId).Val2 += Point->GetPointY();
        }
        
    }
    // presumably sorts by descending frequency (argument false) - TODO confirm
    CatH.SortByDat(false); 

    // draw the top cats
    const int TopCats = Points > 100 ? 6 : 4; 
    TFltRect ZoomRect = GetZoomRect();    
    Gks->SetFont(TGksFont::New("ARIAL", KeyWdFontSize + 3, ColorCatNmFont));
    TVec<TFltRect> CatNmRectV; TVec<TFltV> CatNmPosV;
    // minimal allowed distance between two names: 30% of the smaller screen dimension
    const int MnSize = TInt::GetMn(Gks->GetWidth(), Gks->GetHeight());
    const int MnDist = TFlt::Round(0.3 * double(MnSize));
    int Cats = 0, CatKeyId = CatH.FFirstKeyId();
    while (CatH.FNextKeyId(CatKeyId)) {
        // stop once enough names are drawn
        if (Cats == TopCats) { break; } 
        // stop at the first category covering less than 5% of the points
        if (double(CatH[CatKeyId]) / double(Points) < 0.05) { break; } 
        const int CId = CatH.GetKey(CatKeyId);
        // get name; categories without a full name are not drawn
        TStr CatNm = BowDocBs->GetCatNm(CId);
        if (CatFullNmH.IsKey(CatNm)) {
            CatNm = CatFullNmH.GetDat(CatNm);
        } else { continue; }
        // get position: mean of the positions of the category's points
        TFltPr CatPos = CatPosH.GetDat(CId);
        const int CatCount = CatH.GetDat(CId); IAssert(CatCount > 0);
        const double CatX = CatPos.Val1 / double(CatCount);
        const double CatY = CatPos.Val2 / double(CatCount);
        // is it within the zoom?
        if (!ZoomRect.IsXYIn(CatX, CatY)) { continue; }
        // calculate string size on the screen
        const int HalfTxtWidth = Gks->GetTxtWidth(CatNm) / 2;
        const int HalfTxtHeight = Gks->GetTxtHeight(CatNm) / 2;
        // get coordinates in pixels
        const int X = GetScreenCoord(CatX , ZoomRect.GetMnX(), 
            ZoomRect.GetXLen(), Gks->GetWidth());
        const int Y = GetScreenCoord(CatY, ZoomRect.GetMnY(), 
            ZoomRect.GetYLen(), Gks->GetHeight());
        // is it too close to any of the more prominent categories already drawn
        int CatNmDist = MnSize; TFltV CatNmPos = TFltV::GetV(double(X), double(Y));
        for (int CatNmPosN = 0; CatNmPosN < CatNmPosV.Len(); CatNmPosN++) {
            const double Dist = TLinAlg::EuclDist(CatNmPosV[CatNmPosN], CatNmPos);
            CatNmDist = TInt::GetMn(TFlt::Round(Dist), CatNmDist);
        }
        if (CatNmDist < MnDist) { continue; }
        // does it overlap with any of the more prominent categories already drawn
        TFltRect CatNmRect(X - HalfTxtWidth, Y - HalfTxtHeight,
            X + HalfTxtWidth, Y + HalfTxtHeight);
        bool DoDraw = true; const int Rects = CatNmRectV.Len();
        for (int RectN = 0; (RectN < Rects) && DoDraw; RectN++) {
            DoDraw = !TFltRect::Intersection(CatNmRect, CatNmRectV[RectN]); }
        if (!DoDraw) { continue; }
        // draw it!
        Gks->PutTxt(CatNm, X - HalfTxtWidth, Y - HalfTxtHeight); 
        // remember string area
        CatNmRectV.Add(CatNmRect); Cats++;
        // remember string position
        CatNmPosV.Add(CatNmPos);
    }
    // report the occupied rectangles to the caller
    PointNmRectV.AddV(CatNmRectV);
}
Пример #18
0
// Loads a model from the binary file FNm. When BowDocBs is given (non-empty),
// the signature stored with the model must equal the signature of BowDocBs;
// otherwise an exception is thrown.
PBowMd TBowMd::LoadBin(const TStr& FNm, const PBowDocBs& BowDocBs){
  PBowMd LoadedMd;
  {
    // the input stream lives only for the duration of the load
    TFIn ModelSIn(FNm);
    LoadedMd=Load(ModelSIn);
  }
  // validate the model against the data only if data was supplied
  if (!BowDocBs.Empty()){
    if (LoadedMd->GetBowDocBsSig()!=BowDocBs->GetSig()){
      TExcept::Throw("Bow-Model and Bow-Data signatures don't match!");
    }
  }
  return LoadedMd;
}