void TCpDoc::SaveAcmTechNewsToCpd( const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // processing xml files TFFile FFile(TStrV()+InFPath, TStrV()+".Html"+".Htm", "", true); TStr FNm; int Docs=0; while (FFile.Next(FNm)){ printf("Processing file '%s'\r", FNm.CStr()); PSIn SIn=TFIn::New(FNm); THtmlLx Lx(SIn); while (Lx.GetSym()!=hsyEof){ //printf("%d\r", Docs); if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} Lx.MoveToBTagOrEof("<SPAN>"); if (Lx.GetArg("CLASS")!="title"){continue;} Lx.MoveToBTagOrEof("<A>"); TStr TitleStr=Lx.GetStrToETag("<A>", false); TitleStr=""; Lx.MoveToETagOrEof("<SPAN>"); Lx.MoveToBTagOrEof("<P>"); TStr ParStr=Lx.GetStrToETag("<P>", false); if (!ParStr.Empty()){ Docs++; PCpDoc CpDoc=TCpDoc::New(TInt::GetStr(Docs), TitleStr, ParStr); CpDoc->Save(*SOut); } } } printf("\n"); }
PBowDocBs TCordisEuProjBs::GetBowDocBsFromEuProjDesc() const {
  // Build a bag-of-words over EU-project titles + descriptions using
  // up-to-3-grams (min. frequency 3) with English stop-words removed.
  printf("Generating Bag-Of-Words...\n");
  // collect one "title description" html string per project
  const int EuProjs=GetEuProjs();
  TStrV HtmlStrV;
  for (int ProjN=0; ProjN<EuProjs; ProjN++){
    PCordisEuProj EuProj=GetEuProj(ProjN);
    HtmlStrV.Add(EuProj->GetTitleStr()+" "+EuProj->GetEuProjDescHtmlStr());
  }
  // build the ngram base and dump it for inspection
  PSwSet SwSet=TSwSet::GetSwSet(swstEnglish523);
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(HtmlStrV, 3, 3, SwSet);
  NGramBs->SaveTxt("NGram.Txt");
  printf("\n");
  // add every project as one document, named by its acronym
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int ProjN=0; ProjN<EuProjs; ProjN++){
    if (ProjN%100==0){printf("%d/%d\r", ProjN, EuProjs);}
    PCordisEuProj EuProj=GetEuProj(ProjN);
    BowDocBs->AddHtmlDoc(EuProj->GetEuProjAcrStr(), TStrV(),
     EuProj->GetTitleStr()+" "+EuProj->GetEuProjDescHtmlStr());
  }
  BowDocBs->AssertOk();
  printf("\nDone.\n");
  return BowDocBs;
}
PBowDocBs TRSet::GetBowDocBs(
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq) const {
  // Build a bag-of-words over the result-set hits; each hit becomes one
  // document built from "title. context". Stop-word set, stemmer and the
  // ngram model are selected by the caller.
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // collect the per-hit "title. context" strings
  const int Hits=GetHits();
  TStrV DocHtmlStrV(Hits, 0);
  for (int HitN=0; HitN<Hits; HitN++){
    DocHtmlStrV.Add(GetHitTitleStr(HitN)+". "+GetHitCtxStr(HitN));
  }
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
   DocHtmlStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // create document-base; documents are named by their hit title
  printf("Create Bag-Of-Words Base ... ");
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int HitN=0; HitN<Hits; HitN++){
    BowDocBs->AddHtmlDoc(GetHitTitleStr(HitN), TStrV(), DocHtmlStrV[HitN], true);
  }
  BowDocBs->AssertOk();
  printf("Done.\n");
  return BowDocBs;
}
PBowDocBs TSkyGridBs::GetBowDocBs(
 const int& MxNGramLen, const int& MnNGramFq) const {
  // Build a bag-of-words over the headline of every document in the base.
  // MxNGramLen/MnNGramFq select the ngram model; (1,1) means plain unigrams
  // and skips ngram construction entirely.
  // prepare stop-words & stemmer
  PSwSet SwSet=TSwSet::GetSwSet(swstEn523);
  PStemmer Stemmer=TStemmer::GetStemmer(stmtPorter);
  // PERF: fetch the (doc-id, doc) pairs once -- the original code built
  // this vector twice (once inside the ngram branch, once after it)
  TSkyGridIdDocPrV IdDocPrV; GetIdDocPrV(IdDocPrV);
  // create ngram base only when actually requested
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    TStrV HtmlStrV;
    for (int DocN=0; DocN<IdDocPrV.Len(); DocN++){
      HtmlStrV.Add(IdDocPrV[DocN].Val2->GetHeadlineStr());
    }
    NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
     HtmlStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create bow: one html-document per headline, named by document id
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  for (int DocN=0; DocN<IdDocPrV.Len(); DocN++){
    const int DocId=IdDocPrV[DocN].Val1;
    PSkyGridDoc Doc=IdDocPrV[DocN].Val2;
    BowDocBs->AddHtmlDoc(TInt::GetStr(DocId), TStrV(), Doc->GetHeadlineStr());
  }
  return BowDocBs;
}
PBowDocBs TNmObjBs::GetBowDocBs(const int& MnNmObjFq) const {
  // Build a bag-of-words where each document's words are its named-object
  // occurrences. Named objects appearing in fewer than MnNmObjFq documents
  // are skipped; MnNmObjFq==-1 disables the filter.
  printf("Generating Bag-Of-Words...\n");
  PBowDocBs BowDocBs=TBowDocBs::New();
  for (int DocId=0; DocId<GetDocs(); DocId++){
    if (DocId%100==0){printf("%d\r", DocId);}
    const TStr DocNm=GetDocNm(DocId);
    const TStr DateStr=GetDocDateStr(DocId);
    // expand each named-object into TermFq copies of its string
    TStrV WordStrV;
    const int DocNmObjs=GetDocNmObjs(DocId);
    for (int NmObjN=0; NmObjN<DocNmObjs; NmObjN++){
      int NmObjId; int TermFq;
      GetDocNmObjId(DocId, NmObjN, NmObjId, TermFq);
      const bool FqOk=(MnNmObjFq==-1)||(GetNmObjDocs(NmObjId)>=MnNmObjFq);
      if (FqOk){
        const TStr NmObjStr=GetNmObjStr(NmObjId);
        for (int OccN=0; OccN<TermFq; OccN++){WordStrV.Add(NmObjStr);}
      }
    }
    // documents without any surviving named-object are dropped
    if (!WordStrV.Empty()){
      const int DId=BowDocBs->AddDoc(DocNm, TStrV(), WordStrV);
      BowDocBs->PutDateStr(DId, DateStr);
    }
  }
  BowDocBs->AssertOk();
  printf("\nDone.\n");
  return BowDocBs;
}
TStrV TTable::GetDstNodeFltAttrV() const {
  // Return the destination-node attributes whose column type is FLT.
  TStrV Result(FltCols.Len(), 0);
  for (int AttrN = 0; AttrN < DstNodeAttrV.Len(); AttrN++) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == FLT) { Result.Add(AttrNm); }
  }
  return Result;
}
TStrV TTable::GetDstNodeStrAttrV() const {
  // Return the destination-node attributes whose column type is STR.
  TStrV Result(StrColMaps.Len(), 0);
  for (int AttrN = 0; AttrN < DstNodeAttrV.Len(); AttrN++) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == STR) { Result.Add(AttrNm); }
  }
  return Result;
}
TStrV TTable::GetEdgeStrAttrV() const {
  // Return the edge attributes whose column type is STR.
  TStrV Result(StrColMaps.Len(), 0);
  for (int AttrN = 0; AttrN < EdgeAttrV.Len(); AttrN++) {
    const TStr& AttrNm = EdgeAttrV[AttrN];
    if (GetColType(AttrNm) == STR) { Result.Add(AttrNm); }
  }
  return Result;
}
TStrV TTable::GetEdgeFltAttrV() const {
  // Return the edge attributes whose column type is FLT.
  // (cleanup: removed a stray duplicate ';' after the initializer)
  TStrV FltEA = TStrV(FltCols.Len(), 0);
  for (int i = 0; i < EdgeAttrV.Len(); i++) {
    TStr Attr = EdgeAttrV[i];
    if (GetColType(Attr) == FLT) { FltEA.Add(Attr); }
  }
  return FltEA;
}
TStrV TTable::GetEdgeIntAttrV() const {
  // Return the edge attributes whose column type is INT.
  TStrV Result(IntCols.Len(), 0);
  for (int AttrN = 0; AttrN < EdgeAttrV.Len(); AttrN++) {
    const TStr& AttrNm = EdgeAttrV[AttrN];
    if (GetColType(AttrNm) == INT) { Result.Add(AttrNm); }
  }
  return Result;
}
TStrV TTable::GetDstNodeIntAttrV() const {
  // Return the destination-node attributes whose column type is INT.
  TStrV Result(IntCols.Len(), 0);
  for (int AttrN = 0; AttrN < DstNodeAttrV.Len(); AttrN++) {
    const TStr& AttrNm = DstNodeAttrV[AttrN];
    if (GetColType(AttrNm) == INT) { Result.Add(AttrNm); }
  }
  return Result;
}
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs, const TStr& DocNm, const TStrV& FtrValV) const { TIntFltKdV FtrSpV; GenFtrV(FtrValV, FtrSpV); // make KdV to PrV const int WIds = FtrSpV.Len(); TIntFltPrV WIdWgtPrV(WIds, 0); for (int WIdN = 0; WIdN < WIds; WIdN++) { WIdWgtPrV.Add(TIntFltPr(FtrSpV[WIdN].Key, FtrSpV[WIdN].Dat)); } // add the feature vector to trainsets BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV); }
// Cluster the documents mentioning this entity (from MnTm onward) into
// `Clusts` groups with k-means over entity-occurrence vectors, and return
// for each cluster its top entity-name/weight pairs in EntNmWgtPrVV.
// Returns with EntNmWgtPrVV empty when fewer than MnDocs documents qualify;
// at most MxDocs documents (a deterministic sample, seed 1) are clustered.
void TSkyGridEnt::GetEntClustV(const TSkyGridBs* SkyGridBs,
 const uint64& MnTm, const int& MnDocs, const int& MxDocs,
 const int& Clusts, TVec<TStrFltPrV>& EntNmWgtPrVV) const {
  EntNmWgtPrVV.Clr();
  // create bow
  PBowDocBs BowDocBs=TBowDocBs::New();
  // collect candidate documents; Reverse then seeded Shuffle then Trunc
  // picks a deterministic MxDocs-sized sample -- order of these three
  // calls determines which documents survive, do not reorder
  TIntV DocIdV; GetDocIdV(SkyGridBs, MnTm, 0, DocIdV);
  DocIdV.Reverse();
  DocIdV.Shuffle(TRnd(1));
  DocIdV.Trunc(MxDocs);
  if (DocIdV.Len()<MnDocs){return;}
  for (int DocN=0; DocN<DocIdV.Len(); DocN++){
    int DocId=DocIdV[DocN];
    PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId);
    // create vector of entity-weights: each co-occurring entity becomes a
    // bow "word" weighted by its in-document frequency
    TIntFltPrV WIdWgtPrV;
    for (int EntN=0; EntN<Doc->GetEnts(); EntN++){
      int EntId; int EntFq;
      Doc->GetEntNmFq(EntN, EntId, EntFq);
      TStr EntNm=SkyGridBs->GetEntNm(EntId);
      int EntWId=BowDocBs->AddWordStr(EntNm);
      WIdWgtPrV.Add(TIntFltPr(EntWId, EntFq));
    }
    // create bow-document, named by doc-id, described by its title
    int DId=BowDocBs->AddDoc(TInt::GetStr(DocId), TStrV(), WIdWgtPrV);
    TStr DocDescStr=Doc->GetTitleStr();
    BowDocBs->PutDocDescStr(DId, DocDescStr);
  }
  // k-means clustering
  PBowSim BowSim=TBowSim::New(bstCos); // similarity object
  TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting
  PBowDocPart BowDocPart=TBowClust::GetKMeansPart(
   TNotify::StdNotify, // log output
   BowDocBs, // document data
   BowSim, // similarity function
   TRnd(1), // random generator (fixed seed => reproducible clustering)
   Clusts, // number of clusters
   1, // trials per k-means
   1, // convergence epsilon for k-means
   1, // min. documents per cluster
   WordWgtType, // word weighting
   0, // cut-word-weights percentage
   0); // minimal word frequency
  // collect the top 25 words (up to 50% of weight mass) of each cluster
  // NOTE(review): this second Clr() is redundant -- EntNmWgtPrVV was
  // already cleared at function entry
  EntNmWgtPrVV.Clr();
  for (int ClustN=0; ClustN<BowDocPart->GetClusts(); ClustN++){
    PBowDocPartClust Clust=BowDocPart->GetClust(ClustN);
    TStrFltPrV WordStrWgtPrV;
    Clust->GetTopWordStrWgtPrV(BowDocBs, 25, 0.5, WordStrWgtPrV);
    EntNmWgtPrVV.Add(WordStrWgtPrV);
  }
  //BowDocPart->SaveTxt("Clusts.Txt", BowDocBs, true, 25, 0.5, false);
}
void TSkyGridEnt::GetDocCentroid(const TSkyGridBs* SkyGridBs, const int& TopWords, const double& TopWordsWgtSumPrc, TStrFltPrV& WordStrWgtPrV) const { // create bow PSwSet SwSet=TSwSet::GetSwSet(swstEn523); PStemmer Stemmer=TStemmer::GetStemmer(stmtPorter); PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NULL); for (int DocN=0; DocN<GetDocIds(); DocN++){ int DocId=GetDocId(DocN); PSkyGridDoc Doc=SkyGridBs->GetDoc(DocId); TStr DocStr=Doc->GetHeadlineStr(); BowDocBs->AddHtmlDoc(TInt::GetStr(DocId), TStrV(), DocStr); } // create word-weights TBowWordWgtType WordWgtType=bwwtNrmTFIDF; // define weighting PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(BowDocBs, WordWgtType, 0, 0); // create concept vector PBowSim BowSim=TBowSim::New(bstCos); // similarity object TIntV AllDIdV; BowDocBs->GetAllDIdV(AllDIdV); PBowSpV ConceptSpV=TBowClust::GetConceptSpV(BowDocWgtBs, BowSim, AllDIdV); // get word-vector ConceptSpV->GetWordStrWgtPrV(BowDocBs, TopWords, TopWordsWgtSumPrc, WordStrWgtPrV); }
PBowDocBs TGgSchRSet::GetBowDocBs() const {
  // Build a bag-of-words over the hit titles, using up-to-3-grams with
  // min. frequency 3, English stop-words and Porter stemming.
  PSwSet SwSet=TSwSet::GetSwSet(swstEn523);
  PStemmer Stemmer=TStemmer::New(stmtPorter, true);
  // collect the title of every hit
  const int Hits=GetHits();
  TStrV TitleHtmlStrV(Hits, 0);
  for (int HitN=0; HitN<Hits; HitN++){
    TitleHtmlStrV.Add(GetHit(HitN)->TitleStr);
  }
  PNGramBs NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
   TitleHtmlStrV, 3, 3, SwSet, Stemmer);
  // create document-base; documents are named by hit index
  printf("Create Bag-Of-Words Base ... ");
  PBowDocBs BowDocBs=TBowDocBs::New();
  BowDocBs->PutNGramBs(NGramBs);
  for (int HitN=0; HitN<Hits; HitN++){
    BowDocBs->AddHtmlDoc(TInt::GetStr(HitN), TStrV(), TitleHtmlStrV[HitN], true);
  }
  BowDocBs->AssertOk();
  printf("Done.\n");
  return BowDocBs;
}
PBowDocBs TBowFl::LoadTBsTxt(
 const TStr& TBsFNm, const int& MxDocs, const TStr& SwSetTypeNm,
 const TStr& StemmerTypeNm, const int& MxNGramLen, const int& MnNGramFq){
  // Load documents from the text-base TBsFNm into a new bag-of-words.
  // Stop-word set, stemmer and ngram model are selected by the caller.
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // the ngram base is only built when ngrams are actually requested
  PNGramBs NGramBs;
  const bool UnigramOnly=(MxNGramLen==1)&&(MnNGramFq==1);
  if (!UnigramOnly){
    NGramBs=TNGramBs::GetNGramBsFromTBs(
     TBsFNm, MxDocs, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // open the input text-base read-only
  PTxtBs TxtBs=TTxtBs::New(TBsFNm.GetFBase(), TBsFNm.GetFPath(), faRdOnly);
  // walk the documents
  TBlobPt TrvBlobPt=TxtBs->FFirstDocId();
  TBlobPt DocBlobId;
  int Docs=0;
  while (TxtBs->FNextDocId(TrvBlobPt, DocBlobId)){
    Docs++;
    if (Docs%100==0){printf("%d\r", Docs);}
    // NOTE(review): '>=' breaks before processing the MxDocs-th document,
    // i.e. at most MxDocs-1 documents are loaded -- confirm this is intended
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    TStr DocNm; TStr DocStr;
    TxtBs->GetDocNmStr(DocBlobId, DocNm, DocStr);
    BowDocBs->AddHtmlDoc(DocNm, TStrV(), DocStr, false);
  }
  BowDocBs->AssertOk();
  return BowDocBs;
}
void TAlignPair::AddSent(const TStr& Sent1, const TStr& Sent2) {
  // Append an aligned sentence pair; both bases use the same document name
  // (the current document count) so the pairing stays index-aligned.
  const TStr DocNm = TInt::GetStr(BowDocBs1->GetDocs());
  BowDocBs1->AddHtmlDoc(DocNm, TStrV(), Sent1, false);
  BowDocBs2->AddHtmlDoc(DocNm, TStrV(), Sent2, false);
  // both bases must stay in lock-step
  IAssert(BowDocBs1->GetDocs() == BowDocBs2->GetDocs());
}
///////////////////////////////////////////////// // Google-Focused-Crawl PGgFCrawl TGgFCrawl::GetFCrawl( const TStr& SrcUrlStr, const int& MxCands, const TStr& ProxyStr){ // collect related urls printf("Expand source URL: %s\n", SrcUrlStr.CStr()); PRSet SrcUrlRSet= TGg::WebSearch(TStr("related:")+SrcUrlStr, -1, TNotify::NullNotify, ProxyStr); // create & prepare focused-crawl PGgFCrawl FCrawl=TGgFCrawl::New(); FCrawl->SrcUrlStr=SrcUrlStr; FCrawl->DstRSet=TRSet::New(SrcUrlRSet); // fill hits for (int HitN=0; HitN<SrcUrlRSet->GetHits(); HitN++){ if ((MxCands!=-1)&&(FCrawl->DstRSet->GetHits()>MxCands)){break;} TStr HitUrlStr=SrcUrlRSet->GetHitUrlStr(HitN); printf("Expand URL: %s\n", HitUrlStr.CStr()); PRSet RelUrlRSet= TGg::WebSearch(TStr("related:")+HitUrlStr, -1, TNotify::NullNotify, ProxyStr); FCrawl->DstRSet->Merge(RelUrlRSet); } // save related urls //TRSet::SaveXml(DstRSet, OutXmlUrlFNm); // collect related web-pages TGgWebFetchSaver WebFetchSaver(100); WebFetchSaver.PutProxyStr(ProxyStr); // get source-url web-page {bool Ok; TStr MsgStr; TWebFetchBlocking::GetWebPg( SrcUrlStr, Ok, MsgStr, FCrawl->SrcWebPg, NULL, ProxyStr); if (!Ok){FCrawl->SrcWebPg=NULL;}} // get related-urls web-page int FetchHits=FCrawl->DstRSet->GetHits(); if ((MxCands!=-1)&&(MxCands<FetchHits)){FetchHits=MxCands;} for (int HitN=0; HitN<FetchHits; HitN++){ TStr HitUrlStr=FCrawl->DstRSet->GetHitUrlStr(HitN); WebFetchSaver.FetchUrl(HitUrlStr); } TSysMsg::Loop(); // save crawled web-pages for (int WebPgN=0; WebPgN<WebFetchSaver.GetWebPgs(); WebPgN++){ PWebPg WebPg=WebFetchSaver.GetWebPg(WebPgN); FCrawl->UrlStrToWebPgH.AddDat(WebPg->GetUrlStr(), WebPg); } // create bag-of-words FCrawl->BowDocBs=TBowDocBs::New(); FCrawl->SrcDId=FCrawl->BowDocBs->AddHtmlDoc( SrcUrlStr, TStrV(), FCrawl->SrcWebPg->GetHttpBodyAsStr()); for (int WebPgN=0; WebPgN<WebFetchSaver.GetWebPgs(); WebPgN++){ PWebPg WebPg=WebFetchSaver.GetWebPg(WebPgN); FCrawl->BowDocBs->AddHtmlDoc( WebPg->GetUrlStr(0), TStrV(), 
WebPg->GetHttpBodyAsStr()); } // calculate similarities to the source document PBowDocWgtBs BowDocWgtBs=TBowDocWgtBs::New(FCrawl->BowDocBs, bwwtNrmTFIDF); PBowSim BowSim=TBowSim::New(bstCos); FCrawl->SimDIdKdV; FCrawl->SumSim=0; for (int DIdN=0; DIdN<BowDocWgtBs->GetDocs(); DIdN++){ int DId=BowDocWgtBs->GetDId(DIdN); if (DId!=FCrawl->SrcDId){ double Sim=BowSim->GetSim( BowDocWgtBs->GetSpV(FCrawl->SrcDId), BowDocWgtBs->GetSpV(DId)); FCrawl->SimDIdKdV.Add(TFltIntKd(Sim, DId)); FCrawl->SumSim+=Sim; } } FCrawl->SimDIdKdV.Sort(false); // set crawl ok FCrawl->Ok=true; // return focused-crawl return FCrawl; }