Пример #1
0
// Extracts news items from ACM TechNews HTML pages and writes them as TCpDoc
// records to a single output file.
//   InFPath   - path handed to TFFile together with the extensions ".Html"
//               and ".Htm" (the last TFFile argument is 'true'; presumably
//               recursive traversal - TODO confirm).
//   OutCpdFNm - name of the output file the documents are saved to.
//   MxDocs    - maximal number of documents to save; -1 means no limit.
// An item is recognised as a <SPAN> whose CLASS argument is "title", followed
// by an <A> element and a <P> element; only items with a non-empty paragraph
// are saved. Documents are named by their 1-based running number.
void TCpDoc::SaveAcmTechNewsToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // MxDocs==-1 switches the document limit off
  const bool IsLimited=(MxDocs!=-1);
  // processing html files
  TFFile FFile(TStrV()+InFPath, TStrV()+".Html"+".Htm", "", true);
  TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    // once the limit is reached there is no point in opening further files
    if (IsLimited&&(Docs>=MxDocs)){break;}
    printf("Processing file '%s'\r", FNm.CStr());
    PSIn SIn=TFIn::New(FNm);
    THtmlLx Lx(SIn);
    while (Lx.GetSym()!=hsyEof){
      // Docs counts already saved documents, so '>=' keeps the total at
      // exactly MxDocs (the former '>' let one extra document through)
      if (IsLimited&&(Docs>=MxDocs)){break;}
      Lx.MoveToBTagOrEof("<SPAN>");
      if (Lx.GetArg("CLASS")!="title"){continue;}
      Lx.MoveToBTagOrEof("<A>");
      // the anchor text is consumed to advance the lexer, then discarded
      // NOTE(review): the title is deliberately(?) blanked - confirm intent
      TStr TitleStr=Lx.GetStrToETag("<A>", false); TitleStr="";
      Lx.MoveToETagOrEof("<SPAN>");
      Lx.MoveToBTagOrEof("<P>");
      TStr ParStr=Lx.GetStrToETag("<P>", false);
      if (!ParStr.Empty()){
        Docs++;
        PCpDoc CpDoc=TCpDoc::New(TInt::GetStr(Docs), TitleStr, ParStr);
        CpDoc->Save(*SOut);
      }
    }
  }
  printf("\n");
}
// Builds a bag-of-words document base from the EU project descriptions.
// Each project contributes one document: its name is the project acronym and
// its text is "<title> <description html>". N-grams are generated from the
// same texts (arguments 3, 3 and the swstEnglish523 stop-word set) and are
// also dumped to the file "NGram.Txt".
// Returns the populated document base after BowDocBs->AssertOk().
PBowDocBs TCordisEuProjBs::GetBowDocBsFromEuProjDesc() const {
  printf("Generating Bag-Of-Words...\n");
  // create document vector (one html string per project)
  const int EuProjs=GetEuProjs();
  TStrV HtmlStrV(EuProjs, 0);
  for (int EuProjN=0; EuProjN<EuProjs; EuProjN++){
    PCordisEuProj EuProj=GetEuProj(EuProjN);
    HtmlStrV.Add(EuProj->GetTitleStr()+" "+EuProj->GetEuProjDescHtmlStr());
  }
  // create ngrams
  PSwSet SwSet=TSwSet::GetSwSet(swstEnglish523);
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(HtmlStrV, 3, 3, SwSet);
  NGramBs->SaveTxt("NGram.Txt");
  // create bag-of-words
  printf("\n");
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int EuProjN=0; EuProjN<EuProjs; EuProjN++){
    if (EuProjN%100==0){printf("%d/%d\r", EuProjN, EuProjs);}
    // HtmlStrV[EuProjN] already holds this project's text; no need to
    // rebuild the title+description string a second time
    TStr DocNm=GetEuProj(EuProjN)->GetEuProjAcrStr();
    BowDocBs->AddHtmlDoc(DocNm, TStrV(), HtmlStrV[EuProjN]);
  }
  BowDocBs->AssertOk();
  // return bag-of-words
  printf("\nDone.\n");
  return BowDocBs;
}
Пример #3
0
// Converts the hits of this result set into a bag-of-words document base.
//   SwSetTypeNm, StemmerTypeNm - names used to look up the stop-word set and
//                                the stemmer.
//   MxNGramLen, MnNGramFq      - forwarded to the n-gram generator.
// Every hit becomes one document named by the hit title, with the text
// "<title>. <context>".
PBowDocBs TRSet::GetBowDocBs(
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq) const {
  // stop-words first, stemmer second
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // collect one html string per hit
  const int Hits=GetHits();
  TStrV HtmlStrV(Hits, 0);
  for (int HitN=0; HitN<Hits; HitN++){
    TStr HitTitleStr=GetHitTitleStr(HitN);
    TStr HitCtxStr=GetHitCtxStr(HitN);
    HtmlStrV.Add(HitTitleStr+". "+HitCtxStr);
  }
  // n-grams over all collected strings
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
   HtmlStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // fill the document base
  printf("Create Bag-Of-Words Base ... ");
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int HitN=0; HitN<Hits; HitN++){
    BowDocBs->AddHtmlDoc(GetHitTitleStr(HitN), TStrV(), HtmlStrV[HitN], true);
  }
  BowDocBs->AssertOk();
  printf("Done.\n");
  return BowDocBs;
}
Пример #4
0
// Builds a bag-of-words document base from the headlines of all documents
// returned by GetIdDocPrV.
//   MxNGramLen, MnNGramFq - forwarded to the n-gram generator; when both are
//                           1 no n-gram base is generated and a default
//                           (unset) PNGramBs is passed on.
// Each document is named by the decimal string of its id. The swstEn523
// stop-word set and the stmtPorter stemmer are used throughout.
PBowDocBs TSkyGridBs::GetBowDocBs(
 const int& MxNGramLen, const int& MnNGramFq) const {
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(swstEn523);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(stmtPorter);
  // fetch the (id, document) pairs once; both phases below read them
  TSkyGridIdDocPrV IdDocPrV; GetIdDocPrV(IdDocPrV);
  const int Docs=IdDocPrV.Len();
  // create ngrams
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    TStrV HtmlStrV(Docs, 0);
    for (int DocN=0; DocN<Docs; DocN++){
      PSkyGridDoc Doc=IdDocPrV[DocN].Val2;
      HtmlStrV.Add(Doc->GetHeadlineStr());
    }
    NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
     HtmlStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create bow
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  for (int DocN=0; DocN<Docs; DocN++){
    int DocId=IdDocPrV[DocN].Val1;
    PSkyGridDoc Doc=IdDocPrV[DocN].Val2;
    TStr DocStr=Doc->GetHeadlineStr();
    BowDocBs->AddHtmlDoc(TInt::GetStr(DocId), TStrV(), DocStr);
  }
  // return bow
  return BowDocBs;
}
Пример #5
0
// Builds a bag-of-words document base in which the "words" are named-object
// strings.
//   MnNmObjFq - minimal value of GetNmObjDocs() a named object must reach to
//               be included; -1 disables the filter.
// Each named object string is repeated once per occurrence (TermFq) in the
// document. Documents that end up without any word are skipped; for the
// others the date string is attached to the new document.
PBowDocBs TNmObjBs::GetBowDocBs(const int& MnNmObjFq) const {
  printf("Generating Bag-Of-Words...\n");
  PBowDocBs BowDocBs=TBowDocBs::New();
  const int Docs=GetDocs();
  for (int DocId=0; DocId<Docs; DocId++){
    // progress report every 100 documents
    if (DocId%100==0){printf("%d\r", DocId);}
    TStr DocNm=GetDocNm(DocId);
    TStr DateStr=GetDocDateStr(DocId);
    // expand named objects of the document into a word vector
    TStrV WordStrV;
    const int DocNmObjs=GetDocNmObjs(DocId);
    for (int DocNmObjN=0; DocNmObjN<DocNmObjs; DocNmObjN++){
      int NmObjId; int TermFq;
      GetDocNmObjId(DocId, DocNmObjN, NmObjId, TermFq);
      // frequency filter (short-circuits when the filter is disabled)
      const bool IsKept=(MnNmObjFq==-1)||(GetNmObjDocs(NmObjId)>=MnNmObjFq);
      if (!IsKept){continue;}
      TStr NmObjStr=GetNmObjStr(NmObjId);
      for (int TermOccN=0; TermOccN<TermFq; TermOccN++){
        WordStrV.Add(NmObjStr);
      }
    }
    // documents without words are not added
    if (WordStrV.Empty()){continue;}
    int DId=BowDocBs->AddDoc(DocNm, TStrV(), WordStrV);
    BowDocBs->PutDateStr(DId, DateStr);
  }
  // validate and return
  BowDocBs->AssertOk();
  printf("\nDone.\n");
  return BowDocBs;
}
Пример #6
0
// Returns the entries of DstNodeAttrV whose column type is FLT, in their
// original order. The result is constructed as TStrV(FltCols.Len(), 0) and
// filled with Add().
TStrV TTable::GetDstNodeFltAttrV() const {
  TStrV FltAttrV(FltCols.Len(), 0);
  const int Attrs = DstNodeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == FLT) { FltAttrV.Add(AttrNm); }
  }
  return FltAttrV;
}
Пример #7
0
// Returns the entries of DstNodeAttrV whose column type is STR, in their
// original order. The result is constructed as TStrV(StrColMaps.Len(), 0)
// and filled with Add().
TStrV TTable::GetDstNodeStrAttrV() const {
  TStrV StrAttrV(StrColMaps.Len(), 0);
  const int Attrs = DstNodeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == STR) { StrAttrV.Add(AttrNm); }
  }
  return StrAttrV;
}
Пример #8
0
// Returns the entries of EdgeAttrV whose column type is STR, in their
// original order. The result is constructed as TStrV(StrColMaps.Len(), 0)
// and filled with Add().
TStrV TTable::GetEdgeStrAttrV() const {
  TStrV StrAttrV(StrColMaps.Len(), 0);
  const int Attrs = EdgeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = EdgeAttrV[AttrN];
    if (GetColType(AttrNm) == STR) { StrAttrV.Add(AttrNm); }
  }
  return StrAttrV;
}
Пример #9
0
// Returns the entries of EdgeAttrV whose column type is FLT, in their
// original order. The result is constructed as TStrV(FltCols.Len(), 0) and
// filled with Add().
TStrV TTable::GetEdgeFltAttrV() const {
  TStrV FltAttrV(FltCols.Len(), 0);
  const int Attrs = EdgeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = EdgeAttrV[AttrN];
    if (GetColType(AttrNm) == FLT) { FltAttrV.Add(AttrNm); }
  }
  return FltAttrV;
}
Пример #10
0
// Returns the entries of EdgeAttrV whose column type is INT, in their
// original order. The result is constructed as TStrV(IntCols.Len(), 0) and
// filled with Add().
TStrV TTable::GetEdgeIntAttrV() const {
  TStrV IntAttrV(IntCols.Len(), 0);
  const int Attrs = EdgeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = EdgeAttrV[AttrN];
    if (GetColType(AttrNm) == INT) { IntAttrV.Add(AttrNm); }
  }
  return IntAttrV;
}
Пример #11
0
// Returns the entries of DstNodeAttrV whose column type is INT, in their
// original order. The result is constructed as TStrV(IntCols.Len(), 0) and
// filled with Add().
TStrV TTable::GetDstNodeIntAttrV() const {
  TStrV IntAttrV(IntCols.Len(), 0);
  const int Attrs = DstNodeAttrV.Len();
  for (int AttrN = 0; AttrN < Attrs; ++AttrN) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == INT) { IntAttrV.Add(AttrNm); }
  }
  return IntAttrV;
}
Пример #12
0
// Generates the sparse feature vector for FtrValV via GenFtrV and adds it to
// BowDocBs as a document named DocNm (with an empty TStrV as the second
// AddDoc argument).
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs,
		const TStr& DocNm, const TStrV& FtrValV) const {

    // sparse (key, dat) feature vector
    TIntFltKdV FtrSpV;
    GenFtrV(FtrValV, FtrSpV);
    // AddDoc is fed a pair vector, so copy each key/dat into a pair
    const int Ftrs = FtrSpV.Len();
    TIntFltPrV WIdWgtPrV(Ftrs, 0);
    for (int FtrN = 0; FtrN < Ftrs; ++FtrN) {
        WIdWgtPrV.Add(TIntFltPr(FtrSpV[FtrN].Key, FtrSpV[FtrN].Dat));
    }
    // store the document
    BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV);
}
Пример #13
0
void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs,
 const uint64& MnTm, const int& MnDocs, const int& MxDocs, const int& Clusts,
 TVec<TStrFltPrV>& EntNmWgtPrVV) const {
  EntNmWgtPrVV.Clr();
  // create bow
  PBowDocBs BowDocBs=TBowDocBs::New();
  // collect documents
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  DocIdV.Reverse(); DocIdV.Shuffle(TRnd(1)); DocIdV.Trunc(MxDocs);
  if (DocIdV.Len()<MnDocs){return;}
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
    // create vector of entity-weights
    TIntFltPrV WIdWgtPrV;
    for (int EntN=0; EntN<Doc->GetEnts(); EntN++){
      int EntId; int EntFq; Doc->GetEntNmFq(EntN, EntId, EntFq);
      TStr EntNm=SkyGridBs->GetEntNm(EntId);
      int EntWId=BowDocBs->AddWordStr(EntNm);
      WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq));
    }
    // create bow-document
    int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV);
    TStr DocDescStr=Doc->GetTitleStr();
    BowDocBs->PutDocDescStr(DId, DocDescStr);
  }
  // k-means clustering
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(1), // random generator
   Clusts, // number of clusters
   1, // trials per k-means
   1, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   0, // cut-word-weights percentage
   0); // minimal word frequency
  EntNmWgtPrVV.Clr();
  for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){
    PBowDocPartClust Clust=BowDocPart->GetClust(ClustN);
    TStrFltPrV WordStrWgtPrV;
    Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV);
    EntNmWgtPrVV.Add(WordStrWgtPrV);
  }
  //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false);
}
Пример #14
0
void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs,
 const int& TopWords, const double& TopWordsWgtSumPrc,
 TStrFltPrV& WordStrWgtPrV) const {
  // create bow
  PSwSet SwSet=TSwSet::GetSwSet(swstEn523);
  PStemmer Stemmer=TStemmer::GetStemmer(stmtPorter);
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NULL);
  for (int DocN=0; DocN<GetDocIds(); DocN++){
    int DocId=GetDocId(DocN);
    PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
    TStr DocStr=Doc->GetHeadlineStr();
    BowDocBs->AddHtmlDoc(TInt::GetStr(DocId), TStrV(), DocStr);
  }
  // create word-weights
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, WordWgtType, 0, 0);
  // create concept vector
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV);
  PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, AllDIdV);
  // get word-vector
  ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV);
}
Пример #15
0
// Builds a bag-of-words document base from the titles of all hits.
// N-grams are generated with the arguments 3, 3, the swstEn523 stop-word set
// and a stmtPorter stemmer created via TStemmer::New(stmtPorter, true).
// Each hit becomes one document named by the decimal hit index.
PBowDocBs TGgSchRSet::GetBowDocBs() const {
  // stop-words and stemmer
  PSwSet SwSet=TSwSet::GetSwSet(swstEn523);
  PStemmer Stemmer=TStemmer::New(stmtPorter, true);
  // gather the title of every hit
  const int Hits=GetHits();
  TStrV HtmlStrV(Hits, 0);
  for (int HitN=0; HitN<Hits; HitN++){
    HtmlStrV.Add(GetHit(HitN)->TitleStr);
  }
  // n-grams over the titles
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
   HtmlStrV, 3, 3, SwSet, Stemmer);
  // fill the document base
  printf("Create Bag-Of-Words Base ... ");
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int HitN=0; HitN<Hits; HitN++){
    BowDocBs->AddHtmlDoc(TInt::GetStr(HitN), TStrV(), HtmlStrV[HitN], true);
  }
  BowDocBs->AssertOk();
  printf("Done.\n");
  return BowDocBs;
}
Пример #16
0
// Loads documents from a text-base into a bag-of-words document base.
//   TBsFNm        - text-base file name; its base name and path are used to
//                   open the text-base read-only.
//   MxDocs        - maximal number of documents to load; -1 means no limit.
//   SwSetTypeNm   - name used to look up the stop-word set.
//   StemmerTypeNm - name used to look up the stemmer.
//   MxNGramLen, MnNGramFq - n-gram parameters; when both are 1 no n-gram
//                   base is generated and a default (unset) PNGramBs is used.
// Returns the populated document base after BowDocBs->AssertOk().
PBowDocBs TBowFl::LoadTBsTxt(
 const TStr& TBsFNm, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams
  // NOTE(review): GetNGramBsFromTBs receives the same MxDocs - confirm it
  // applies the limit to the same set of documents as the loop below
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromTBs(
     TBsFNm, MxDocs,
     MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // open input text-base
  TStr TxtBsNm=TBsFNm.GetFBase();
  TStr TxtBsFPath=TBsFNm.GetFPath();
  PTxtBs TxtBs=TTxtBs::New(TxtBsNm, TxtBsFPath, faRdOnly);
  // traverse documents
  TBlobPt TxtBsTrvBlobPt=TxtBs->FFirstDocId(); TBlobPt TxtBsDocId; int Docs=0;
  while (TxtBs->FNextDocId(TxtBsTrvBlobPt, TxtBsDocId)){
    // Docs counts documents already loaded; checking before the increment
    // loads exactly MxDocs documents (the former increment-then-'>=' check
    // stopped one document early)
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    Docs++; if (Docs%100==0){printf("%d\r", Docs);}
    // get document data
    TStr DocNm; TStr DocStr;
    TxtBs->GetDocNmStr(TxtBsDocId, DocNm, DocStr);
    // add document to bow
    BowDocBs->AddHtmlDoc(DocNm, TStrV(), DocStr, false);
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
Пример #17
0
void TAlignPair::AddSent(const TStr& Sent1, const TStr& Sent2) {
    TStr DocNm = TInt::GetStr(BowDocBs1->GetDocs());
    BowDocBs1->AddHtmlDoc(DocNm, TStrV(), Sent1, false);
    BowDocBs2->AddHtmlDoc(DocNm, TStrV(), Sent2, false);
    IAssert(BowDocBs1->GetDocs() == BowDocBs2->GetDocs());
}
Пример #18
0
/////////////////////////////////////////////////
// Google-Focused-Crawl
// Runs a focused crawl around a source URL.
//   SrcUrlStr - source URL; expanded with "related:" web searches.
//   MxCands   - maximal number of candidate pages to fetch; -1 = no limit.
//   ProxyStr  - proxy string forwarded to the searches and fetches.
// Steps: (1) search for URLs related to the source and to each of its hits,
// merging everything into DstRSet; (2) fetch the source page and up to
// MxCands candidate pages; (3) index all fetched pages in a bag-of-words
// base; (4) compute the similarity (bstCos over bwwtNrmTFIDF weights) of
// every document to the source document, stored in descending order in
// SimDIdKdV together with their sum in SumSim.
// If the source page cannot be fetched, SrcWebPg is left NULL and the source
// document is indexed with an empty body.
PGgFCrawl TGgFCrawl::GetFCrawl(
 const TStr& SrcUrlStr, const int& MxCands, const TStr& ProxyStr){
  // collect related urls
  printf("Expand source URL: %s\n", SrcUrlStr.CStr());
  PRSet SrcUrlRSet=
   TGg::WebSearch(TStr("related:")+SrcUrlStr, -1, TNotify::NullNotify, ProxyStr);
  // create & prepare focused-crawl
  PGgFCrawl FCrawl=TGgFCrawl::New();
  FCrawl->SrcUrlStr=SrcUrlStr;
  FCrawl->DstRSet=TRSet::New(SrcUrlRSet);
  // fill hits
  for (int HitN=0; HitN<SrcUrlRSet->GetHits(); HitN++){
    if ((MxCands!=-1)&&(FCrawl->DstRSet->GetHits()>MxCands)){break;}
    TStr HitUrlStr=SrcUrlRSet->GetHitUrlStr(HitN);
    printf("Expand URL: %s\n", HitUrlStr.CStr());
    PRSet RelUrlRSet=
     TGg::WebSearch(TStr("related:")+HitUrlStr, -1, TNotify::NullNotify, ProxyStr);
    FCrawl->DstRSet->Merge(RelUrlRSet);
  }
  // save related urls
  //TRSet::SaveXml(DstRSet, OutXmlUrlFNm);

  // collect related web-pages
  TGgWebFetchSaver WebFetchSaver(100);
  WebFetchSaver.PutProxyStr(ProxyStr);
  // get source-url web-page
  {bool Ok; TStr MsgStr;
  TWebFetchBlocking::GetWebPg(
   SrcUrlStr, Ok, MsgStr, FCrawl->SrcWebPg, NULL, ProxyStr);
  if (!Ok){FCrawl->SrcWebPg=NULL;}}
  // get related-urls web-page
  int FetchHits=FCrawl->DstRSet->GetHits();
  if ((MxCands!=-1)&&(MxCands<FetchHits)){FetchHits=MxCands;}
  for (int HitN=0; HitN<FetchHits; HitN++){
    TStr HitUrlStr=FCrawl->DstRSet->GetHitUrlStr(HitN);
    WebFetchSaver.FetchUrl(HitUrlStr);
  }
  TSysMsg::Loop();

  // save crawled web-pages
  for (int WebPgN=0; WebPgN<WebFetchSaver.GetWebPgs(); WebPgN++){
    PWebPg WebPg=WebFetchSaver.GetWebPg(WebPgN);
    FCrawl->UrlStrToWebPgH.AddDat(WebPg->GetUrlStr(), WebPg);
  }

  // create bag-of-words
  FCrawl->BowDocBs=TBowDocBs::New();
  // the source fetch may have failed (SrcWebPg reset to NULL above); use an
  // empty body in that case instead of dereferencing the null page
  // NOTE(review): similarities against an empty source document are
  // presumably meaningless - callers may want to inspect SrcWebPg
  TStr SrcHtmlStr;
  if (!FCrawl->SrcWebPg.Empty()){
    SrcHtmlStr=FCrawl->SrcWebPg->GetHttpBodyAsStr();
  }
  FCrawl->SrcDId=FCrawl->BowDocBs->AddHtmlDoc(
   SrcUrlStr, TStrV(), SrcHtmlStr);
  for (int WebPgN=0; WebPgN<WebFetchSaver.GetWebPgs(); WebPgN++){
    PWebPg WebPg=WebFetchSaver.GetWebPg(WebPgN);
    FCrawl->BowDocBs->AddHtmlDoc(
     WebPg->GetUrlStr(0), TStrV(), WebPg->GetHttpBodyAsStr());
  }

  // calculate similarities to the source document
  PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(FCrawl->BowDocBs, bwwtNrmTFIDF);
  PBowSim BowSim=TBowSim::New(bstCos);
  // start from an empty similarity list (was a no-op expression statement)
  FCrawl->SimDIdKdV.Clr(); FCrawl->SumSim=0;
  for (int DIdN=0; DIdN<BowDocWgtBs->GetDocs(); DIdN++){
    int DId=BowDocWgtBs->GetDId(DIdN);
    if (DId!=FCrawl->SrcDId){
      double Sim=BowSim->GetSim(
       BowDocWgtBs->GetSpV(FCrawl->SrcDId), BowDocWgtBs->GetSpV(DId));
      FCrawl->SimDIdKdV.Add(TFltIntKd(Sim, DId));
      FCrawl->SumSim+=Sim;
    }
  }
  // most similar documents first
  FCrawl->SimDIdKdV.Sort(false);
  // set crawl ok
  FCrawl->Ok=true;
  // return focused-crawl
  return FCrawl;
}