Ejemplo n.º 1
0
Archivo: bowfl.cpp Proyecto: Accio/snap
void TBowFl::SaveLnDocTxt(const PBowDocBs& BowDocBs, const TStr& FNm, const bool& UseDocStrP){
  TFOut SOut(FNm);
  int Docs=BowDocBs->GetDocs();
  for (int DId=0; DId<Docs; DId++){
    printf("%d/%d\r", DId+1, Docs);
    // output document-name
    TStr DocNm=TStr::GetFNmStr(BowDocBs->GetDocNm(DId));
    SOut.PutStr(DocNm);
    // output categories
    for (int CIdN=0; CIdN<BowDocBs->GetDocCIds(DId); CIdN++){
      int CId=BowDocBs->GetDocCId(DId, CIdN);
      TStr CatNm=TStr::GetFNmStr(BowDocBs->GetCatNm(CId));
      SOut.PutCh(' '); SOut.PutCh('!'); SOut.PutStr(CatNm);
    }
    // output words
    if (UseDocStrP){
      TStr DocStr=BowDocBs->GetDocStr(DId);
//      DocStr.DelChAll('\n'); DocStr.DelChAll('\r');
      SOut.PutCh(' '); SOut.PutStr(DocStr);
    } else {
        int DocWIds=BowDocBs->GetDocWIds(DId);
        int WId; double WordFq;
        for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
          BowDocBs->GetDocWIdFq(DId, DocWIdN, WId, WordFq);
          TStr WordStr=BowDocBs->GetWordStr(WId);
          for (int WordFqN=0; WordFqN<WordFq; WordFqN++){
            SOut.PutCh(' '); SOut.PutStr(WordStr);
          }
        }
    }
    SOut.PutLn();
  }
  printf("\n");
}
Ejemplo n.º 2
0
void TBowFl::SaveSparseMatlabTxt(const PBowDocBs& BowDocBs,
    const PBowDocWgtBs& BowDocWgtBs, const TStr& FNm,
    const TStr& CatFNm, const TIntV& _DIdV) {

  TIntV DIdV;
  if (_DIdV.Empty()) {
      BowDocBs->GetAllDIdV(DIdV);
  } else {
      DIdV = _DIdV;
  }
  // generate map of row-ids to words
  TFOut WdMapSOut(TStr::PutFExt(FNm, ".row-to-word-map.dat"));
  for (int WId = 0; WId < BowDocWgtBs->GetWords(); WId++) {
    TStr WdStr = BowDocBs->GetWordStr(WId);
    WdMapSOut.PutStrLn(TStr::Fmt("%d %s", WId+1,  WdStr.CStr()));
  }
  WdMapSOut.Flush();
  // generate map of col-ids to document names
  TFOut DocMapSOut(TStr::PutFExt(FNm, ".col-to-docName-map.dat"));
  for (int DocN = 0; DocN < DIdV.Len(); DocN++) {
    const int DId = DIdV[DocN];
    TStr DocNm = BowDocBs->GetDocNm(DId);
    DocMapSOut.PutStrLn(TStr::Fmt("%d %d %s", DocN, DId,  DocNm.CStr()));
  }
  DocMapSOut.Flush();
  // save documents' sparse vectors
  TFOut SOut(FNm);
  for (int DocN = 0; DocN < DIdV.Len(); DocN++){
    const int DId = DIdV[DocN];
    PBowSpV DocSpV = BowDocWgtBs->GetSpV(DId);
    const int DocWIds = DocSpV->GetWIds();
    for (int DocWIdN=0; DocWIdN<DocWIds; DocWIdN++){
      const int WId = DocSpV->GetWId(DocWIdN);
      const double WordWgt = DocSpV->GetWgt(DocWIdN);
      SOut.PutStrLn(TStr::Fmt("%d %d %.16f", WId+1, DocN+1, WordWgt));
    }
  }
  SOut.Flush();
  // save documents' category sparse vectors
  if (!CatFNm.Empty()) {
    TFOut CatSOut(CatFNm);
    for (int DocN = 0; DocN < DIdV.Len(); DocN++){
      const int DId = DIdV[DocN];
      const int DocCIds = BowDocBs->GetDocCIds(DId);
      for (int DocCIdN=0; DocCIdN<DocCIds; DocCIdN++){
        const int CId = BowDocBs->GetDocCId(DId, DocCIdN);
        const double CatWgt = 1.0;
        CatSOut.PutStrLn(TStr::Fmt("%d %d %.16f", CId+1, DocN+1, CatWgt));
      }
    }
    CatSOut.Flush();
  }
}
Ejemplo n.º 3
0
// Builds an ontology "grounding": for every ontology term that has documents
// (its own or those of its sub-terms) a concept vector is computed from the
// weighted bag-of-words of these documents.
//
//   LwOnto            ontology providing terms, links, link types and languages
//   BowDocBs          documents; their categories tie them to ontology terms
//   LangNm            language name used to resolve category names to terms
//   DocCatIsTermIdP   true:  a category name is read as a term-id via GetInt()
//                     false: a category name is looked up as a term name in the
//                            language LangNm
//   CutWordWgtSumPrc  passed unchanged to TBowClust::GetConceptSpV
//
// Returns the object created by TLwOntoGround::New from the ontology, the
// documents, the document weights and the term-id -> concept-vector table.
// Progress is reported on stdout.
PLwOntoGround TLwOntoGround::GetOntoGround(
 const PLwOnto& LwOnto, const PBowDocBs& BowDocBs,
 const TStr& LangNm, const bool& DocCatIsTermIdP,
 const double& CutWordWgtSumPrc){
  printf("Generating Ontology-Classifier...\n");
  // shortcuts
  PLwTermBs TermBs=LwOnto->GetTermBs();
  int Terms=TermBs->GetTerms();
  PLwLinkBs LinkBs=LwOnto->GetLinkBs();
  PLwLinkTypeBs LinkTypeBs=LwOnto->GetLinkTypeBs();
  int LangId=LwOnto->GetLangBs()->GetLangId(LangNm);
  int Docs=BowDocBs->GetDocs();
  // create document weights (bwwtNrmTFIDF) and the similarity object (bstCos)
  printf("  Creating BowDocWgtBs ...");
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  printf(" Done.\n");
  // collect documents per ontology-term;
  // PosCats/NegCats count categories that did / did not resolve to a term
  printf("  Collecting documents per ontology-term ...\n");
  TIntIntVH TermIdToDIdVH; int PosCats=0; int NegCats=0;
  for (int DId=0; DId<Docs; DId++){
    printf("    Docs:%d/%d Pos:%d Neg:%d\r", 1+DId, Docs, PosCats, NegCats);
    for (int DocCIdN=0; DocCIdN<BowDocBs->GetDocCIds(DId); DocCIdN++){
      // get document-category
      int CId=BowDocBs->GetDocCId(DId, DocCIdN);
      TStr CatNm=BowDocBs->GetCatNm(CId);
      // get term-id
      if (DocCatIsTermIdP){
        // NOTE(review): presumably GetInt() requires CatNm to be numeric --
        // confirm how non-numeric category names are handled
        int TermId=CatNm.GetInt();
        if (TermBs->IsTermId(TermId)){
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      } else {
        if (TermBs->IsTermId(CatNm, LangId)){
          int TermId=TermBs->GetTermId(CatNm, LangId);
          TermIdToDIdVH.AddDat(TermId).Add(DId); PosCats++;
        } else {NegCats++;}
      }
    }
  }
  printf("    Docs:%d/%d Pos:%d Neg:%d\n", Docs, Docs, PosCats, NegCats);
  printf("  Done.\n");
  // create sub-terms & up-terms vectors from the links of type "NT"
  // (presumably "narrower term"): TermId -> DstTermId makes DstTermId a
  // sub-term of TermId and TermId an up-term of DstTermId
  printf("  Creating sub-terms & up-terms vectors ...");
  TIntIntVH TermIdToSubTermIdVH; // sub-terms still waiting to be processed
  TIntIntVH TermIdToUpTermIdVH;
  for (int TermN=0; TermN<Terms; TermN++){
    int TermId=TermBs->GetTermId(TermN);
    for (int LinkN=0; LinkN<LinkBs->GetFromLinks(TermId); LinkN++){
      int LinkTypeId; int DstTermId;
      LinkBs->GetFromLink(TermId, LinkN, LinkTypeId, DstTermId);
      TStr LinkTypeNm=LinkTypeBs->GetLinkType(LinkTypeId)->GetLinkTypeNm();
      if (LinkTypeNm=="NT"){
        TermIdToSubTermIdVH.AddDat(TermId).Add(DstTermId);
        TermIdToUpTermIdVH.AddDat(DstTermId).Add(TermId);
      }
    }
  }
  printf("   Done.\n");
  // create centroids, bottom-up: a term is processed once it has no pending
  // sub-terms; processing removes it from the pending lists of its up-terms
  // and hands its documents over to them
  printf("  Creating centroids ...\n");
  THash<TInt, PBowSpV> TermIdToConceptSpVH;
  TIntIntVH TermIdToSubTermDIdVH; // documents handed up from processed sub-terms
  TIntH ProcTermIdH; // terms already processed
  int PrevActiveTerms=-1;
  forever{
    // count active terms, i.e. terms that still have pending sub-terms
    int ActiveTerms=0;
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if ((TermIdToSubTermIdVH.IsKey(TermId))&&
       (TermIdToSubTermIdVH.GetDat(TermId).Len()>0)){
        ActiveTerms++;
      }
    }
    // stop if no change from previous round
    printf("    Active-Terms:%d\n", ActiveTerms);
    if (ActiveTerms==PrevActiveTerms){break;}
    PrevActiveTerms=ActiveTerms;
    // process not-yet-processed terms that have no pending sub-terms
    for (int TermN=0; TermN<Terms; TermN++){
      int TermId=TermBs->GetTermId(TermN);
      if (ProcTermIdH.IsKey(TermId)){continue;}
      if ((!TermIdToSubTermIdVH.IsKey(TermId))||
       (TermIdToSubTermIdVH.GetDat(TermId).Len()==0)){
        printf("    %d/%d\r", 1+TermN, Terms);
        ProcTermIdH.AddKey(TermId);
        // collect document-ids: own documents plus those handed up by sub-terms
        TIntV TermDIdV;
        if (TermIdToDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToDIdVH.GetDat(TermId));}
        if (TermIdToSubTermDIdVH.IsKey(TermId)){
          TermDIdV.AddV(TermIdToSubTermDIdVH.GetDat(TermId));}
        // create concept-vector if any documents
        if (TermDIdV.Len()>0){
          PBowSpV ConceptSpV=
           TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, TermDIdV, CutWordWgtSumPrc);
          TermIdToConceptSpVH.AddDat(TermId, ConceptSpV);
        }
        // correct upper-terms: this term is no longer pending for them, and
        // they inherit its documents
        if (TermIdToUpTermIdVH.IsKey(TermId)){
          TIntV& UpTermIdV=TermIdToUpTermIdVH.GetDat(TermId);
          for (int UpTermIdN=0; UpTermIdN<UpTermIdV.Len(); UpTermIdN++){
            int UpTermId=UpTermIdV[UpTermIdN];
            TermIdToSubTermIdVH.GetDat(UpTermId).DelIfIn(TermId);
            if (TermDIdV.Len()>0){
              TermIdToSubTermDIdVH.AddDat(UpTermId).AddV(TermDIdV);}
          }
        }
      }
    }
  }
  printf("  Done.\n");
  // create & return classifier
  PLwOntoGround OntoGround=
   TLwOntoGround::New(LwOnto, BowDocBs, BowDocWgtBs, TermIdToConceptSpVH);
  printf("Done.\n");
  return OntoGround;
}
Ejemplo n.º 4
0
// Draws the names of the most frequent document categories onto the map.
//
// For every category the number of points carrying it and the sum of their
// coordinates are accumulated; a category's label is placed at the mean
// position of its points. Categories are visited in the order produced by
// SortByDat(false) on the counts (presumably most frequent first -- confirm).
// At most 6 labels are drawn when there are more than 100 points, otherwise 4.
// The scan stops at the first category covering less than 5% of the points.
// A category is skipped when it has no entry in CatFullNmH, lies outside the
// zoom rectangle, is closer than 30% of the smaller screen dimension to an
// already drawn label, or its text rectangle intersects an already drawn one.
//
//   Gks             drawing surface
//   KeyWdFontSize   base font size; labels use this size plus 3
//   PointNmRectV    receives (appended) the rectangles of all drawn labels
void TVizMapContext::PaintCatNms(PGks Gks, const int& KeyWdFontSize, 
        TVec<TFltRect>& PointNmRectV) {

    // per category: number of points and sum of their coordinates
    TIntH CatCountH; TIntFltPrH CatXYSumH;
    PBowDocBs KeyWdBow = VizMapFrame->GetKeyWdBow();
    const int Points = VizMapFrame->GetPoints();
    for (int PointN = 0; PointN < Points; PointN++) {
        PVizMapPoint Point = VizMapFrame->GetPoint(PointN);
        const int DocId = Point->GetDocId();
        const int DocCats = KeyWdBow->GetDocCIds(DocId);
        for (int CatN = 0; CatN < DocCats; CatN++) {
            const int CatId = KeyWdBow->GetDocCId(DocId, CatN);
            CatCountH.AddDat(CatId)++;
            TFltPr& XYSum = CatXYSumH.AddDat(CatId);
            XYSum.Val1 += Point->GetPointX();
            XYSum.Val2 += Point->GetPointY();
        }
    }
    CatCountH.SortByDat(false);

    // how many labels may be drawn
    int TopCats = 4;
    if (Points > 100) { TopCats = 6; }
    TFltRect ZoomRect = GetZoomRect();
    Gks->SetFont(TGksFont::New("ARIAL", KeyWdFontSize + 3, ColorCatNmFont));
    // rectangles and centers of the labels drawn so far
    TVec<TFltRect> DrawnRectV; TVec<TFltV> DrawnPosV;
    const int MnSize = TInt::GetMn(Gks->GetWidth(), Gks->GetHeight());
    // minimal allowed distance between two label centers
    const int MnDist = TFlt::Round(0.3 * double(MnSize));
    int Drawn = 0;
    for (int KeyId = CatCountH.FFirstKeyId(); CatCountH.FNextKeyId(KeyId); ) {
        // enough labels drawn
        if (Drawn == TopCats) { break; }
        // this category covers too few points
        if (double(CatCountH[KeyId]) / double(Points) < 0.05) { break; }
        const int CatId = CatCountH.GetKey(KeyId);
        // only categories with a known full name are drawn
        TStr ShortNm = KeyWdBow->GetCatNm(CatId);
        if (!CatFullNmH.IsKey(ShortNm)) { continue; }
        TStr CatNm = CatFullNmH.GetDat(ShortNm);
        // mean position of the category's points
        TFltPr XYSum = CatXYSumH.GetDat(CatId);
        const int CatCount = CatCountH.GetDat(CatId); IAssert(CatCount > 0);
        const double MeanX = XYSum.Val1 / double(CatCount);
        const double MeanY = XYSum.Val2 / double(CatCount);
        // ignore categories centered outside the zoom
        if (!ZoomRect.IsXYIn(MeanX, MeanY)) { continue; }
        // half of the label's extent on the screen
        const int HalfW = Gks->GetTxtWidth(CatNm) / 2;
        const int HalfH = Gks->GetTxtHeight(CatNm) / 2;
        // label center in pixels
        const int X = GetScreenCoord(MeanX , ZoomRect.GetMnX(), 
            ZoomRect.GetXLen(), Gks->GetWidth());
        const int Y = GetScreenCoord(MeanY, ZoomRect.GetMnY(), 
            ZoomRect.GetYLen(), Gks->GetHeight());
        // is it too close to any of the labels already drawn?
        TFltV Pos = TFltV::GetV(double(X), double(Y));
        int NearestDist = MnSize;
        const int DrawnPoss = DrawnPosV.Len();
        for (int PosN = 0; PosN < DrawnPoss; PosN++) {
            const double Dist = TLinAlg::EuclDist(DrawnPosV[PosN], Pos);
            NearestDist = TInt::GetMn(TFlt::Round(Dist), NearestDist);
        }
        if (NearestDist < MnDist) { continue; }
        // does it overlap with any of the labels already drawn?
        TFltRect NmRect(X - HalfW, Y - HalfH,
            X + HalfW, Y + HalfH);
        bool Overlaps = false;
        const int DrawnRects = DrawnRectV.Len();
        for (int RectN = 0; RectN < DrawnRects; RectN++) {
            if (TFltRect::Intersection(NmRect, DrawnRectV[RectN])) {
                Overlaps = true; break;
            }
        }
        if (Overlaps) { continue; }
        // draw it!
        Gks->PutTxt(CatNm, X - HalfW, Y - HalfH);
        // remember the label's area and center
        DrawnRectV.Add(NmRect); Drawn++;
        DrawnPosV.Add(Pos);
    }
    PointNmRectV.AddV(DrawnRectV);
}