Example 1
void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs,
 const uint64& MnTm, const int& MnDocs, const int& MxDocs, const int& Clusts,
 TVec<TStrFltPrV>& EntNmWgtPrVV) const {
  EntNmWgtPrVV.Clr();
  // create bow
  PBowDocBs BowDocBs=TBowDocBs::New();
  // collect documents
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs);
  if (DocIdV.Len()<MnDocs){return;}
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
    // create vector of entity-weights
    TIntFltPrV WIdWgtPrV;
    for (int EntN=0; EntN<Doc->GetEnts(); EntN++){
      int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq);
      TStr EntNm=SkyGridBs->GetEntNm(EntId);
      int EntWId=BowDocBs->AddWordStr(EntNm);
      WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq));
    }
    // create bow-document
    int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV);
    TStr DocDescStr=Doc->GetTitleStr();
    BowDocBs->PutDocDescStr(DId, DocDescStr);
  }
  // k-means clustering
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(1), // random generator
   Clusts, // number of clusters
   1, // trials per k-means
   1, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   0, // cut-word-weights percentage
   0); // minimal word frequency
  for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){
    PBowDocPartClust Clust=BowDocPart->GetClust(ClustN);
    TStrFltPrV WordStrWgtPrV;
    Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV);
    EntNmWgtPrVV.Add(WordStrWgtPrV);
  }
  //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false);
}
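
A minimal call-site sketch for the routine above. The helper name, the way the TSkyGridEnt and TSkyGridBs objects are obtained, and all parameter values are assumptions for illustration; only the GetEntClustV signature comes from the code itself.

void PrintEntClusts(const TSkyGridEnt& SkyGridEnt, const TSkyGridBs* SkyGridBs){
  // Hypothetical usage: SkyGridEnt/SkyGridBs are created elsewhere;
  // the time bound, document limits and cluster count are illustrative.
  TVec<TStrFltPrV> EntNmWgtPrVV;
  SkyGridEnt.GetEntClustV(SkyGridBs,
   0, // MnTm: no lower time bound
   10, // MnDocs: require at least 10 documents
   1000, // MxDocs: sample at most 1000 documents
   5, // Clusts: number of k-means clusters
   EntNmWgtPrVV);
  // print the entity-weight pairs of every cluster
  for (int ClustN=0; ClustN<EntNmWgtPrVV.Len(); ClustN++){
    printf("Cluster %d:\n", ClustN);
    const TStrFltPrV& WordWgtPrV=EntNmWgtPrVV[ClustN];
    for (int WordN=0; WordN<WordWgtPrV.Len(); WordN++){
      printf("  %s (%.3f)\n",
       WordWgtPrV[WordN].Val1.CStr(), double(WordWgtPrV[WordN].Val2));
    }
  }
}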
Example 2
bool test(PGraph &graph, bool followOut, bool followIn) {
  printf("\n================================\nFollowOut: %d, FollowIn: %d\n", followOut, followIn);
  int iters = 10;
  for (int k = 0; k < iters; k++) {
    TRnd rnd = TRnd((int)time(0));
    int start = graph->GetRndNId(rnd);
    rnd.PutSeed(0);
//    int target = graph->GetRndNId(rnd);
//    printf("Start node: %d, target node: %d\n", start, target);
    int target = -1;
    printf("Start node: %d\n", start);

    struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);

    /* Hybrid */
    TBreathFS<PGraph> bfs_hybrid(graph, true);
    int maxDist_hybrid = bfs_hybrid.DoBfsHybrid(start, followOut, followIn, target);

    gettimeofday(&tv2, NULL);
    double time_hybrid = timeInSeconds(tv1, tv2);

    /* Original */
    gettimeofday(&tv1, NULL);

    TBreathFS<PGraph> bfs(graph, true);
    int maxDist = bfs.DoBfs(start, followOut, followIn, target);

    gettimeofday(&tv2, NULL);
    double time = timeInSeconds(tv1, tv2);

    /* Check results */
    if (maxDist_hybrid != maxDist) {
      printf("MaxDist incorrect.\n");
      return false;
    }
    if (target == -1) {
      if (!checkResults<PGraph>(bfs_hybrid, bfs)) {
        printf("NIdDistH values incorrect!\n");
        return false;
      }
    }

    printf("Execution times: Original: %.2f, Hybrid: %.2f\n", time, time_hybrid);
  }
  return true;
}
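
A hypothetical driver for the benchmark above, assuming test() is templated on the graph type as its TBreathFS<PGraph> member suggests; the graph is random input generated with TSnap::GenRndGnm and its size is arbitrary.

int main(){
  // build a random directed graph as benchmark input (sizes illustrative)
  PNGraph Graph=TSnap::GenRndGnm<PNGraph>(10000, 100000);
  bool Ok=test(Graph, true, false); // follow out-edges only
  Ok=Ok && test(Graph, true, true); // follow edges in both directions
  printf(Ok ? "All runs matched.\n" : "Mismatch detected.\n");
  return Ok ? 0 : 1;
}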
Example 3
void TWebTravelHomeNet::StartTravel(){
  PutConstrs();
  TStrV UrlStrV(300000, 0);
  TIntIntH UserIdToDocsH(1000);
  PSIn SIn=PSIn(new TFIn(InFNm));
  TILx Lx(SIn, TFSet()|iloRetEoln);
  TChA UrlStr;
  Lx.GetSym(syInt, syEof);
  while ((Lx.Sym!=syEof)&&(Lx.SymLnN<200000)){
//  while (Lx.Sym!=syEof){
    int UserId=Lx.Int; Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    Lx.GetInt(); Lx.GetSym(syComma);
    TStr Method=Lx.GetIdStr(); Lx.GetSym(syComma);  // GET, POST
    UrlStr.Clr(); UrlStr+=Lx.GetIdStr(); Lx.GetSym(syComma); // http, ftp
    UrlStr+="://";
    UrlStr+=Lx.GetStrToCh(','); Lx.GetSym(syComma); // domain name
    UrlStr+=Lx.GetStrToEoln(); Lx.GetEoln(); // path
    if ((UserId==TgUserId)&&IsUrlOk(UrlStr)&&(Method=="GET")){
      UserIdToDocsH.AddDat(UserId)++;
      UrlStrV.Add(UrlStr);
    }
    Lx.GetSym(syInt, syEof);
    if (Lx.SymLnN%100000==0){OnNotify(TInt::GetStr(Lx.SymLnN)+ " docs");}
  }
  int UserIdToDocsP=UserIdToDocsH.FFirstKeyId();
  while (UserIdToDocsH.FNextKeyId(UserIdToDocsP)){
    int UserId=UserIdToDocsH.GetKey(UserIdToDocsP);
    int Docs=UserIdToDocsH[UserIdToDocsP];
    TStr MsgStr=TStr("User ")+TInt::GetStr(UserId)+": "+
     TInt::GetStr(Docs)+" Docs.";
    OnNotify(MsgStr);
  }
  UrlStrV.Shuffle(TRnd());
  for (int UrlStrN=0; UrlStrN<UrlStrV.Len(); UrlStrN++){
    Go(UrlStrV[UrlStrN]);
  }
}
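
Reading back from the parsing sequence above (five comma-separated integers, the method, the protocol, the domain up to the next comma, then the path up to end-of-line), each input line evidently has the following shape; the concrete values are invented for illustration:

  1234,0,0,0,0,GET,http,www.example.com,/index.html

Only lines whose UserId equals TgUserId, whose method is GET, and whose reassembled URL passes IsUrlOk contribute to UrlStrV.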
Example 4
File: mkcca.cpp Project: Accio/snap
void TMKCCASemSpace::CalcKCCA(const TVec<TBowMatrix>& BowMatrixV, const int& Dims, 
        const double& Kapa, const int& CGMxIter, const int& HorstMxIter, 
        TVec<TFltVV>& AlphaVV) {

    // some basic constants
    const int Langs = BowMatrixV.Len();
    IAssert(Langs > 1);
    const int Docs = BowMatrixV[0].GetCols();
    // reserve space for the output
    AlphaVV.Gen(Langs);
    for (int LangN = 0; LangN < Langs; LangN++) {
        IAssert(BowMatrixV[LangN].GetCols() == Docs);
        AlphaVV[LangN].Gen(Docs, Dims);
        AlphaVV[LangN].PutAll(0.0);
    }

    // a pair of vectors
    TFltV x(Docs);  // vector of length Docs
    TLAMisc::FillRnd(x, TRnd(1)); // fill it with random elements
    TLinAlg::Normalize(x); // normalize the vector
    TFltV y(BowMatrixV[1].GetRows()); // another vector
    // change an element of the matrix
    AlphaVV[1](5,4) = 4.43;
    // dot product of column 5 of AlphaVV[1] with vector x
    TLinAlg::DotProduct(AlphaVV[1], 5, x);
    // add 0.4 * the first column to the second column
    TLinAlg::AddVec(0.4, AlphaVV[1], 0, AlphaVV[1], 1);
    // add 0.4 * the first column to vector x
    TLinAlg::AddVec(0.4, AlphaVV[1], 0, x);
    // compute the norm of the fifth column
    double Norm = TLinAlg::Norm(AlphaVV[1], 5);
    // read the fourth row of the matrix
    TFltV z; AlphaVV[1].GetRow(4, z);
    // read the fourth column of the matrix
    TFltV zz; AlphaVV[1].GetCol(4, zz);
    // multiply vector x by matrix BowMatrixV[1]
    BowMatrixV[1].Multiply(x, y);
    // multiply y by the transpose of BowMatrixV[1]
    BowMatrixV[1].MultiplyT(y, x);
    // multiply column 0 of AlphaVV[1] by BowMatrixV[1]
    BowMatrixV[1].Multiply(AlphaVV[1], 0, y);
}
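
A hypothetical call site for CalcKCCA; whether the method can be called statically as written is an assumption, and the matrix contents and parameter values are placeholders.

// BowMatrixV must hold one bag-of-words matrix per language, all with
// the same number of document columns (CalcKCCA asserts this).
TVec<TBowMatrix> BowMatrixV(2);
// ... fill BowMatrixV[0] and BowMatrixV[1] with aligned documents ...
TVec<TFltVV> AlphaVV;
TMKCCASemSpace::CalcKCCA(BowMatrixV,
 50, // Dims: number of correlated dimensions to extract
 0.5, // Kapa: regularization constant
 100, // CGMxIter: conjugate-gradient iteration cap
 20, // HorstMxIter: Horst-iteration cap
 AlphaVV);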
Example 5
int main(int argc, char* argv[]){
  Try;
  // create environment
  Env=TEnv(argc, argv, TNotify::StdNotify);

  // command line parameters
  Env.PrepArgs("Bag-Of-Words K-Means");
  TStr InBowFNm=Env.GetIfArgPrefixStr("-i:", "", "Input-Bow-File");
  TStr OutPartFNm=Env.GetIfArgPrefixStr("-op:", "KMeans.BowPart", "Output-BowPartition-File");
  TStr OutTxtFNm=Env.GetIfArgPrefixStr("-ot:", "KMeans.Txt", "Output-Txt-File");
  TStr OutXmlFNm=Env.GetIfArgPrefixStr("-ox:", "KMeans.Xml", "Output-Xml-File");
  int Docs=Env.GetIfArgPrefixInt("-docs:", -1, "Documents");
  int Clusts=Env.GetIfArgPrefixInt("-clusts:", 10, "Clusters");
  int RndSeed=Env.GetIfArgPrefixInt("-rseed:", 1, "RNG-Seed");
  int ClustTrials=Env.GetIfArgPrefixInt("-ctrials:", 1, "Clustering-Trials");
  double ConvergEps=Env.GetIfArgPrefixFlt("-ceps:", 10, "Convergence-Epsilon");
  double CutWordWgtSumPrc=Env.GetIfArgPrefixFlt("-cutww:", 0.5, "Cut-Word-Weight-Sum-Percentage");
  int MnWordFq=Env.GetIfArgPrefixInt("-mnwfq:", 5, "Minimal-Word-Frequency");
  bool SaveDocNmP=Env.GetIfArgPrefixBool("-sdnm:", false, "Save-Document-Names");
  if (Env.IsEndOfRun()){return 0;}

  // load data
  if (InBowFNm.Empty()){
    TExcept::Throw("No Input-Bow-File specified!");}
  PBowDocBs BowDocBs=TBowDocBs::LoadBin(InBowFNm);

  // get doc-ids
  TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV);
  if (Docs!=-1){AllDIdV.Trunc(Docs);}

  // get document partition
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  TSecTm StartTm=TSecTm::GetCurTm(); // get start-time
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(RndSeed), // random generator
   Clusts, // number of clusters
   ClustTrials, // trials per k-means
   ConvergEps, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   CutWordWgtSumPrc, // cut-word-weights percentage
   MnWordFq, // minimal word frequency
   AllDIdV); // training documents
  TSecTm EndTm=TSecTm::GetCurTm(); // get end-time
  printf("Duration: %d secs\n", TSecTm::GetDSecs(StartTm, EndTm));

  // output partition
  if (!OutPartFNm.Empty()){
    TFOut SOut(OutPartFNm); BowDocPart->Save(SOut);}
  if (!OutTxtFNm.Empty()){
    BowDocPart->SaveTxt(OutTxtFNm, BowDocBs, true, 15, 0.5, SaveDocNmP);}
  if (!OutXmlFNm.Empty()){
    BowDocPart->SaveXml(OutXmlFNm, BowDocBs);}

  return 0;
  Catch;
  return 1;
}
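
A hypothetical command line for the tool above; the executable name and file names are placeholders, while the flags map one-to-one onto the GetIfArgPrefix* declarations in main:

  bowkmeans -i:News.Bow -op:News.BowPart -ot:News.Txt -ox:News.Xml -clusts:20 -ctrials:3

Flags left out fall back to the defaults declared above, e.g. -ceps:10 and -mnwfq:5; omitting -i: triggers the "No Input-Bow-File specified!" exception.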
Example 6
void __fastcall TContexterF::VizualizeBtClick(TObject *Sender){
  if (!State->BowDocBs.Empty()){
    // parameters
    int Clusts=TStr(VizClustsEd->Text.c_str()).GetInt(10);
    double ClustSimSumPrc=TStr(VizClustSimSumPrcEd->Text.c_str()).GetFlt(0.3)/100;

    // get doc-ids
    TIntV AllDIdV; State->BowDocBs->GetAllDIdV(AllDIdV);

    // get document partition
    PBowSim BowSim=TBowSim::New(bstCos); // similarity object
    TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
    State->TmBowDocPart=TBowClust::GetKMeansPart(
     TNotify::StdNotify, // log output
     State->BowDocBs, // document data
     BowSim, // similarity function
     TRnd(1), // random generator
     Clusts, // number of clusters
     1, // trials per k-means
     10, // convergence epsilon for k-means
     1, // min. documents per cluster
     WordWgtType, // word weighting
     0.5, // cut-word-weights percentage
     5, // minimal word frequency
     AllDIdV); // training documents

    // create graph
    PGraph Graph=TGGraph::New();

    // create vertices
    TVrtxV VrtxV;
    for (int ClustN=0; ClustN<State->TmBowDocPart->GetClusts(); ClustN++){
      // get cluster
      PBowDocPartClust Clust=State->TmBowDocPart->GetClust(ClustN);
      // get best words string
      TStrFltPrV WordStrWgtPrV;
      Clust->GetTopWordStrWgtPrV(State->BowDocBs, -1, 1.0, WordStrWgtPrV);
      TChA BestWordVChA;
      BestWordVChA+=TInt::GetStr(Clust->GetDocs())+" Docs\n";
      TStrV UcWordStrSfV;
      for (int WordN=0; WordN<WordStrWgtPrV.Len(); WordN++){
        // get word
        TStr UcWordStr=WordStrWgtPrV[WordN].Val1;
        // remove duplicates
        bool Ok=true;
        for (int WordSfN=0; WordSfN<UcWordStrSfV.Len(); WordSfN++){
          if (UcWordStrSfV[WordSfN].IsStrIn(UcWordStr)){Ok=false; break;}
          if (UcWordStr.IsStrIn(UcWordStrSfV[WordSfN])){Ok=false; break;}
        }
        if (!Ok){continue;}
        // add word
        UcWordStrSfV.Add(UcWordStr);
        BestWordVChA+=WordStrWgtPrV[WordN].Val1;
        BestWordVChA+="\n";
        // finish if limit reached
        if (UcWordStrSfV.Len()>=15){break;}
      }
      // create vertex
      TStr ClustNm=BestWordVChA;
      PVrtx Vrtx=new TGVrtx(ClustNm);
      Graph->AddVrtx(Vrtx);
      VrtxV.Add(Vrtx);
    }

    // create edges
    TFltIntIntTrV ClustSimN1N2TrV;
    State->TmBowDocPart->GetTopClustSimV(ClustSimSumPrc, ClustSimN1N2TrV);
    for (int ClustSimN=0; ClustSimN<ClustSimN1N2TrV.Len(); ClustSimN++){
      double Sim=ClustSimN1N2TrV[ClustSimN].Val1;
      int ClustN1=ClustSimN1N2TrV[ClustSimN].Val2;
      int ClustN2=ClustSimN1N2TrV[ClustSimN].Val3;
      TStr EdgeNm=TFlt::GetStr(Sim, "%.2f");
      PEdge Edge=new TGEdge(VrtxV[ClustN1], VrtxV[ClustN2], EdgeNm, false);
      Graph->AddEdge(Edge);
      Edge->PutWgt(TMath::Sqr(Sim));
    }

    // place graph
    State->TmClustGraph=Graph;
    TRnd Rnd(1);
    State->TmClustGraph->PlaceSimAnnXY(Rnd, State->TmGks);

    // get area-partition
    UpdateClustRectPrV();

    // draw graph
    State->TmGks->Clr();
    TmPbPaint(Sender);
  }
}
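
The subtlest part of the handler is the word-deduplication loop: a word is kept only if it neither contains nor is contained in any previously kept word. A standalone restatement of that rule as a hypothetical helper (only TStr::IsStrIn is taken from the snippet):

// Returns true if UcWordStr neither contains nor is contained in any
// already kept word; mirrors the duplicate check in VizualizeBtClick.
static bool IsNewWord(const TStr& UcWordStr, const TStrV& KeptWordV){
  for (int WordN=0; WordN<KeptWordV.Len(); WordN++){
    if (KeptWordV[WordN].IsStrIn(UcWordStr)){return false;} // new word inside a kept word
    if (UcWordStr.IsStrIn(KeptWordV[WordN])){return false;} // a kept word inside the new word
  }
  return true;
}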