void TCpDoc::SaveAcmTechNewsToCpd( const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // processing xml files TFFile FFile(TStrV()+InFPath, TStrV()+".Html"+".Htm", "", true); TStr FNm; int Docs=0; while (FFile.Next(FNm)){ printf("Processing file '%s'\r", FNm.CStr()); PSIn SIn=TFIn::New(FNm); THtmlLx Lx(SIn); while (Lx.GetSym()!=hsyEof){ //printf("%d\r", Docs); if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} Lx.MoveToBTagOrEof("<SPAN>"); if (Lx.GetArg("CLASS")!="title"){continue;} Lx.MoveToBTagOrEof("<A>"); TStr TitleStr=Lx.GetStrToETag("<A>", false); TitleStr=""; Lx.MoveToETagOrEof("<SPAN>"); Lx.MoveToBTagOrEof("<P>"); TStr ParStr=Lx.GetStrToETag("<P>", false); if (!ParStr.Empty()){ Docs++; PCpDoc CpDoc=TCpDoc::New(TInt::GetStr(Docs), TitleStr, ParStr); CpDoc->Save(*SOut); } } } printf("\n"); }
// Builds a frequent-n-gram base (max length MxNGramLen, min frequency
// MnNGramFq) from the Reuters-21578 corpus: .SGM files in FPath are scanned
// in repeated passes until the n-gram base reports it is finished.
// MxDocs==-1 means no document limit.
PNGramBs TNGramBs::GetNGramBsFromReuters21578(
 const TStr& FPath, const int& MxDocs, const int& MxNGramLen,
 const int& MnNGramFq, const PSwSet& SwSet, const PStemmer& Stemmer){
  // construct the n-gram base to be filled
  PNGramBs NGramBs=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // repeat passes over the document set until the base converges
  while (!NGramBs->IsFinished()){
    TFFile SgmFFile(FPath, ".SGM", false);
    TStr SgmFNm; int DocCnt=0;
    while (SgmFFile.Next(SgmFNm)){
      TXmlDocV SgmXmlDocV; TXmlDoc::LoadTxt(SgmFNm, SgmXmlDocV);
      for (int DocN=0; DocN<SgmXmlDocV.Len(); DocN++){
        DocCnt++;
        if ((MxDocs!=-1)&&(DocCnt>=MxDocs)){break;}
        PXmlDoc XmlDoc=SgmXmlDocV[DocN];
        PXmlTok RootTok=XmlDoc->GetTok();
        // pull the html string out of <REUTERS><TEXT>
        PXmlTok TextTok=XmlDoc->GetTagTok("REUTERS|TEXT");
        TStr HtmlStr=TextTok->GetTokStr(false);
        // tokenize the html & feed the words into the n-gram base
        _UpdateNGramBsFromHtmlStr(NGramBs, HtmlStr, SwSet, Stemmer);
      }
      if ((MxDocs!=-1)&&(DocCnt>=MxDocs)){break;}
    }
    // close the current counting pass
    NGramBs->ConcPass();
  }
  return NGramBs;
}
// Builds a frequent-n-gram base (max length MxNGramLen, min frequency
// MnNGramFq) from html files under FPath (optionally recursive), scanning
// the files in repeated passes until the n-gram base reports it is finished.
// MxDocs==-1 means no document limit.
PNGramBs TNGramBs::GetNGramBsFromHtmlFPathV(
 const TStr& FPath, const bool& RecurseDirP, const int& MxDocs,
 const int& MxNGramLen, const int& MnNGramFq,
 const PSwSet& SwSet, const PStemmer& Stemmer){
  // construct the n-gram base to be filled
  PNGramBs NGramBs=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // repeat passes over the file collection until the base converges
  while (!NGramBs->IsFinished()){
    // traverse the files under the given path
    TFFile HtmlFFile(FPath, "", RecurseDirP);
    TStr HtmlFNm; int DocCnt=0;
    while (HtmlFFile.Next(HtmlFNm)){
      DocCnt++;
      if ((MxDocs!=-1)&&(DocCnt>MxDocs)){break;}
      if (TFile::Exists(HtmlFNm)) {
        // read the raw html & feed its words into the n-gram base
        TStr HtmlStr=TStr::LoadTxt(HtmlFNm);
        _UpdateNGramBsFromHtmlStr(NGramBs, HtmlStr, SwSet, Stemmer);
      }
    }
    // close the current counting pass
    NGramBs->ConcPass();
  }
  return NGramBs;
}
void TFile::DelWc(const TStr& WcStr, const bool& RecurseDirP){ // collect file-names TStrV FNmV; TFFile FFile(WcStr, RecurseDirP); TStr FNm; while (FFile.Next(FNm)){ FNmV.Add(FNm);} // delete files for (int FNmN=0; FNmN<FNmV.Len(); FNmN++){ Del(FNmV[FNmN], false);} }
/////////////////////////////////////////////////
// SkyGrid-Document
// Converts SkyGrid news xml files under InXmlFPath (recursive) into a single
// binary file of TSkyGridBinDoc records, at most MxDocs documents
// (MxDocs==-1 means no limit). For each article it stores the swid, a date
// parsed from the swid prefix, the title, a headline cut from the body, and
// per-entity occurrence counts.
void TSkyGridBinDoc::SaveBinDocV(
 const TStr& InXmlFPath, const TStr& OutBinFNm, const int& MxDocs){
  printf("Processing SkyGrid-News-Xml files from '%s'...\n", InXmlFPath.CStr());
  TFOut SOut(OutBinFNm);
  TFFile FFile(InXmlFPath, true);
  TStr FNm; int Docs=0; int DateDocs=0; uint64 PrevTm=0;
  while (FFile.Next(FNm)){
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    //printf("  Processing '%s' ...", FNm.CStr());
    // pull the relevant fields out of <item><content>
    PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
    PXmlTok ContentTok=XmlDoc->GetTagTok("item|content");
    TStr SwIdStr=ContentTok->GetTagTok("swid")->GetArgVal("value");
    TStr UrlStr=ContentTok->GetTagTok("url")->GetTokStr(false);
    TStr TitleStr=ContentTok->GetTagTok("title")->GetTokStr(false);
    TStr FetchedValStr=ContentTok->GetTagTok("fetched")->GetArgVal("value");
    TXmlTokV EntityTokV; ContentTok->GetTagTokV("annotations|entity", EntityTokV);
    TStr BodyStr=ContentTok->GetTagTok("body")->GetTokStr(false);
    // extract date — assumes the swid starts with a YYYYMMDD prefix
    // (TODO confirm against the SkyGrid feed format)
    TStr DateStr=SwIdStr.GetSubStr(0, 7);
    TStr YearStr=DateStr.GetSubStr(0, 3);
    TStr MonthStr=DateStr.GetSubStr(4, 5);
    TStr DayStr=DateStr.GetSubStr(6, 7);
    TTm DateTm(YearStr.GetInt(), MonthStr.GetInt(), DayStr.GetInt());
    uint64 Tm=TTm::GetMSecsFromTm(DateTm);
    // extract entities: count occurrences per name, preferring the
    // canonical name over the surface text when one is present
    TStrIntH EntNmToFqH;
    for (int EntityTokN=0; EntityTokN<EntityTokV.Len(); EntityTokN++){
      PXmlTok EntityTok=EntityTokV[EntityTokN];
      if (!EntityTok->IsTag("entity")){continue;}
      TStr CanonicalNm=EntityTok->GetArgVal("canonical", "");
      TStr TextStr=EntityTok->GetArgVal("text", "");
      TStr TypeNm=EntityTok->GetArgVal("type", "");
      TStr EntNm=CanonicalNm.Empty() ? TextStr : CanonicalNm;
      EntNmToFqH.AddDat(EntNm)++;
    }
    // collect (frequency, name) pairs; Sort(false) — presumably descending
    // by frequency, verify against TVec::Sort
    TIntStrPrV FqEntNmPrV; EntNmToFqH.GetDatKeyPrV(FqEntNmPrV);
    FqEntNmPrV.Sort(false);
    // extract headline: body prefix truncated back to a word boundary
    TChA HeadlineChA=BodyStr.GetSubStr(0, 250);
    while ((HeadlineChA.Len()>0)&&(HeadlineChA.LastCh()!=' ')){
      HeadlineChA.Trunc(HeadlineChA.Len()-1);}
    HeadlineChA+="...";
    // create document
    TSkyGridBinDoc Doc(SwIdStr, Tm, TitleStr, HeadlineChA, FqEntNmPrV);
    // save document
    Doc.Save(SOut);
    // screen log: reset the per-day counter whenever the date changes
    if (PrevTm!=Tm){
      if (PrevTm!=0){printf("\n");}
      PrevTm=Tm; DateDocs=0;
    }
    Docs++; DateDocs++;
    printf(" %s [Day:%d / All:%d]\r", DateStr.CStr(), DateDocs, Docs);
  }
  printf("\nDone.\n");
}
// Loads the Reuters-21578 corpus (.SGM files in FPath, non-recursive) into a
// new bag-of-words document-base: optionally builds an n-gram base first,
// then adds each document with its TOPICS categories and assigns train/test
// membership from the LEWISSPLIT attribute. MxDocs==-1 means no limit.
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq,
 const bool& SaveDocP, const PNotify& Notify){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams — skipped for plain unigrams with min frequency 1,
  // where no n-gram pre-pass is needed
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromReuters21578(
     FPath, MxDocs, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // traverse directory
  TFFile FFile(FPath, ".SGM", false);
  TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TIntH DocWIdToFqH(100); // NOTE(review): declared but never used below
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++;
      if (Docs%100==0){printf("%d\r", Docs);}
      if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
      // get document-name from the NEWID attribute
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      TStr DocNm=DocTok->GetArgVal("NEWID");
      // get document-categories from <REUTERS><TOPICS><D> elements
      TStrV CatNmV;
      TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
      for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
        TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
        CatNmV.Add(CatNm);
      }
      // get document-contents from <REUTERS><TEXT>
      PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
      TStr DocStr=DocStrTok->GetTokStr(false);
      // add document to bow
      int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
      // train & test data: only documents marked TOPICS=YES participate
      if ((DocTok->GetArgVal("LEWISSPLIT")=="TRAIN")&&(DocTok->GetArgVal("TOPICS")=="YES")){
        BowDocBs->AddTrainDId(DId);}
      if ((DocTok->GetArgVal("LEWISSPLIT")=="TEST")&&(DocTok->GetArgVal("TOPICS")=="YES")){
        BowDocBs->AddTestDId(DId);}
    }
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
// Converts the Reuters-21578 corpus (.SGM files in InFPath, non-recursive)
// into a single .cpd file: one TCpDoc per <REUTERS> element carrying id,
// date, title, dateline, body text and TOPICS category codes.
// MxDocs==-1 means no document limit.
void TCpDoc::SaveReuters21578ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse directory with .sgm files
  TFFile FFile(InFPath, ".SGM", false);
  TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++;
      if (Docs%100==0){printf("%d\r", Docs);}
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
      // create reuters document
      PCpDoc CpDoc=TCpDoc::New();
      // load xml document
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      // document id from the NEWID attribute
      CpDoc->DocNm=DocTok->GetArgVal("NEWID");
      // date
      CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false);
      // document title (element is optional)
      PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE");
      if (!TitleTok.Empty()){ CpDoc->TitleStr=TitleTok->GetTokStr(false);}
      // dateline (element is optional)
      PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE");
      if (!DatelineTok.Empty()){ CpDoc->DatelineStr=DatelineTok->GetTokStr(false);}
      // get text string
      TStr TextStr;
      PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY");
      if (!BodyTok.Empty()){
        TextStr=BodyTok->GetTokStr(false);
      } else {
        // if <BODY> doesn't exist, take the whole <TEXT>
        PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT");
        if (!TextTok.Empty()){
          TextStr=TextTok->GetTokStr(false);
        }
      }
      CpDoc->ParStrV.Add(TextStr, 1);
      // topic categories from <REUTERS><TOPICS><D> elements
      TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV);
      for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){
        TStr CatNm=TopCatTokV[TokN]->GetTokStr(false);
        CpDoc->TopCdNmV.Add(CatNm);
      }
      // save cpd document
      CpDoc->Save(*SOut);
    }
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
  }
}
void TFFile::GetFNmV( const TStr& FPath, const TStrV& FExtV, const bool& RecurseP, TStrV& FNmV){ // prepare file-directory traversal TStrV FPathV; FPathV.Add(FPath); TFFile FFile(FPathV, FExtV, "", RecurseP); TStr FNm; // traverse directory FNmV.Clr(); while (FFile.Next(FNm)){ FNmV.Add(FNm); } }
// Loads a base of language-alignment pairs: each .xml file directly in FPath
// (non-recursive) yields one alignment pair, capped at MxSents sentences.
PAlignPairBs TAlignPairBs::LoadAcXml(const TStr& FPath, const int& MxSents) {
  // base that will collect the alignment pairs
  PAlignPairBs AlignPairBs = TAlignPairBs::New();
  // one alignment pair per xml file
  TFFile XmlFFile(FPath, ".xml", false);
  TStr XmlFNm;
  while (XmlFFile.Next(XmlFNm)) {
    AlignPairBs->AddAlignPair(TAlignPair::LoadAcXml(XmlFNm, MxSents));
  }
  return AlignPairBs;
}
// Allocates an in-memory buffer of _Size bytes for a new file entry of type
// _Type, unless an entry of that type already exists; then refreshes the
// derived bookkeeping (max file type and main series).
void FData::NewDataFile(int _Type, int _Size) {
  /* Creates a buffer for a new file */
  if(!ExistFileType(_Type)) {
    // append a fresh entry and tag it with the requested type
    Files.push_back(FFile());
    Files[Files.size()-1].Type = _Type;
    int Id = GetId(_Type);
    // NOTE(review): raw new[] — confirm Buffer is delete[]'d when the
    // entry is destroyed, otherwise this leaks
    Files[Id].Buffer = new char[_Size];
    Files[Id].SizeBuffer = _Size;
  }
  UpdateMaxFileType();
  DefSerieMain();
}
// Builds a Dzs lexicon base by traversing FPath (recursively) for
// xml/html/htm/txt files: xml files are parsed as xml documents, all other
// matched files as html, and each file becomes one TDzsBsDoc in the base.
TDzsBs::TDzsBs(const TStr& FPath, const TStr& WebAlias):
 TBook("Dzs-Base", "Dzs", "Lexicon"), DzsBsDocV(), Bix(TBix::New()){
  TStrV FPathV; FPathV.Add(FPath);
  TStrV FExtV;
  FExtV.Add("xml"); FExtV.Add("html"); FExtV.Add("htm"); FExtV.Add("txt");
  TFFile FFile(FPathV, FExtV, true);
  TStr FNm; int FNmN=0;
  while (FFile.Next(FNm)){
    printf("%d\r", ++FNmN); // progress: files processed so far
    PDzsBsDoc DzsBsDoc;
    if (FNm.GetFExt().GetUc()==".XML"){
      // xml file: parse the document tree
      PXmlDoc XmlDoc=TXmlDoc::LoadTxt(FNm);
      DzsBsDoc=TDzsBsDoc::GetDzsBsDoc(FNm, XmlDoc, FPath, WebAlias);
    } else {
      // html/htm/txt file: parse as html (hdtAll)
      PHtmlDoc HtmlDoc=THtmlDoc::LoadTxt(FNm, hdtAll, false);
      DzsBsDoc=TDzsBsDoc::GetDzsBsDoc(FNm, HtmlDoc, FPath, WebAlias);
    }
    AddDoc(DzsBsDoc);
  }
}
///////////////////////////////////////////////// // BagOfWords-Files void TBowFl::LoadHtmlTxt( PBowDocBs BowDocBs, const TStr& FPath, TIntV& NewDIdV, const bool& RecurseDirP, const int& MxDocs, const bool& SaveDocP, const PNotify& Notify) { // prepare file-directory traversal TStr LcNrFPath=TStr::GetNrFPath(FPath).GetLc(); Notify->OnStatus("Creating Bow from file-path " + FPath + " ..."); TFFile FFile(FPath, "", RecurseDirP); // traverse files TStr FNm; int Docs=0; NewDIdV.Clr(); while (FFile.Next(FNm)){ Docs++; if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} Notify->OnStatus(TStr::Fmt("%d\r", Docs)); // prepare document-name if (TFile::Exists(FNm)) { //B: TStr DocNm=FNm.GetLc(); if (DocNm.IsPrefix(LcNrFPath)){ DocNm=DocNm.GetSubStr(LcNrFPath.Len(), DocNm.Len()-1);} // categories TStrV CatNmV; TStr CatNm; if (DocNm.IsChIn('/')){ TStr Str; DocNm.SplitOnCh(CatNm, '/', Str); } else if (DocNm.IsChIn('\\')){ TStr Str; DocNm.SplitOnCh(CatNm, '\\', Str); } if (!CatNm.Empty()){ CatNmV.Add(CatNm);} // load document-content TStr DocStr=TStr::LoadTxt(FNm); // add document to bow NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP)); } } Notify->OnStatus(TStr::Fmt("%d", Docs)); // return results Notify->OnStatus("Done."); BowDocBs->AssertOk(); }
void TCpDoc::SaveHtmlToCpd( const TStr& InHtmlFPath, const TStr& OutCpdFNm, const bool& /*RecurseDirP*/, const int& MxDocs){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // prepare file-directory traversal TStrV FPathV; FPathV.Add(InHtmlFPath); TStrV FExtV; FExtV.Add("html"); FExtV.Add("htm"); FExtV.Add("xml"); TFFile FFile(FPathV, FExtV, "", true); TStr FNm; // traverse files printf("Processing '%s' ...\n", InHtmlFPath.CStr()); int Docs=0; while (FFile.Next(FNm)){ // get file-name if ((MxDocs!=-1)&&(FFile.GetFNmN()>MxDocs)){break;} Docs++; if (FFile.GetFNmN()%10==0){printf("%d\r", Docs);} // load html PCpDoc CpDoc=TCpDoc::LoadHtmlDoc(FNm); // save cpd document CpDoc->Save(*SOut); } printf("%d\nDone.\n", Docs); }
// Loads a Prolog tuple base from all files matching the wildcard FNmWc.
// Each file holds a sequence of period-terminated tuples; every parsed tuple
// is indexed under its (functor-id, arity) pair.
PPlBs TPlBs::LoadTxtPl(const TStr& FNmWc){
  PPlBs PlBs=TPlBs::New();
  printf("Loading Prolog from '%s'...\n", FNmWc.CStr());
  TFFile FFile(FNmWc);
  TStr FNm;
  while (FFile.Next(FNm)){
    printf(" ...loading '%s'\n", FNm.CStr());
    // process prolog-file
    PSIn SIn=TFIn::New(FNm);
    // lexer flags iloCmtAlw/iloCsSens/iloUniStr/iloExcept — per the flag
    // names: comment-aware, case-sensitive, unicode strings, throw on error
    TILx Lx(SIn, TFSet()|/*iloList|*/iloCmtAlw|iloCsSens|iloUniStr|iloExcept);
    Lx.GetSym(TFSet(TupExpect)|syEof);
    while (Lx.Sym!=syEof){
      // parse one tuple up to the terminating period
      TPlVal TupVal=TPlBs::ParseTup(Lx, TFSet()|syPeriod, PlBs);
      // index the tuple under its (functor, arity) pair
      int FuncId=PlBs->TupV[TupVal.GetTupId()]->GetFuncId();
      int Arity=PlBs->TupV[TupVal.GetTupId()]->GetVals();
      PlBs->FuncIdArityPrToTupIdVH.
       AddDat(TIntPr(FuncId, Arity)).Add(TupVal.GetTupId());
      // advance to the next tuple or end-of-file
      Lx.GetSym(TFSet(TupExpect)|syEof);
    }
    //break;
  }
  printf("Done.\n");
  return PlBs;
}
void TCpDoc::SaveReuters2000ToCpd( const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // processing xml files TStrStrH DocIdToDateStrH; TFFile FFile(InFPath, ".XML", true); TStr FNm; while (FFile.Next(FNm)){ if ((MxDocs!=-1)&&(FFile.GetFNmN()>=MxDocs)){break;} if ((1+FFile.GetFNmN())%100==0){ printf("Processing file '%s' (%d)\r", FNm.CStr(), 1+FFile.GetFNmN());} PXmlDoc Doc=TXmlDoc::LoadTxt(FNm); // get document PCpDoc CpDoc=TCpDoc::New(); LoadReuters2000DocFromXml(FNm, CpDoc->DocNm, CpDoc->DateStr, CpDoc->TitleStr, CpDoc->HeadlineStr, CpDoc->BylineStr, CpDoc->DatelineStr, CpDoc->ParStrV, CpDoc->TopCdNmV, CpDoc->GeoCdNmV, CpDoc->IndCdNmV); // save cpd document CpDoc->Save(*SOut); } printf("\n"); }
void TCpDoc::SaveAsfaToCpd(const TStr& InFPath, const TStr& OutCpdFNm){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // traverse files TStrH AccessionIdH; TFFile FFile(TStr::GetNrFPath(InFPath)+"*.Asfa"); TStr AsfaFNm; while (FFile.Next(AsfaFNm)){ printf("Processing file '%s'\n", AsfaFNm.CStr()); PSIn SIn=TFIn::New(AsfaFNm); TILx Lx(SIn, TFSet(iloRetEoln, iloExcept)); Lx.GetSym(syLn, syEof); while (Lx.Sym!=syEof){ // Query Line TStr QueryLnStr=Lx.Str; TStrV QueryStrV; QueryLnStr.SplitOnAllCh('\t', QueryStrV, false); IAssert(QueryStrV[0]=="Query"); // RecordNo Line Lx.GetSym(syLn); TStr RecNoLnStr=Lx.Str; TStrV RecNoStrV; RecNoLnStr.SplitOnAllCh('\t', RecNoStrV, false); IAssert(RecNoStrV[0]=="RecordNo"); //int RecN=RecNoStrV[1].GetInt(); // fields (format: Short-Name Tab Long-Name Tab Value-String) TStr TitleStr, AbstractStr, PublicationYearStr, AccessionId; TStrV AuthorNmV; TStrV TermNmV1, TermNmV2; while (true){ Lx.GetSym(syLn); TStr FldLnStr=Lx.Str; TStrV FldStrV; FldLnStr.SplitOnAllCh('\t', FldStrV, false); if (FldStrV[0]=="----"){ if (!AccessionIdH.IsKey(AccessionId)){ AccessionIdH.AddKey(AccessionId); // create & save cpd document PCpDoc CpDoc=TCpDoc::New(); CpDoc->DocNm=AccessionId; CpDoc->DateStr=PublicationYearStr; CpDoc->TitleStr=TitleStr; CpDoc->ParStrV.Add(AbstractStr); CpDoc->TopCdNmV=TermNmV1; CpDoc->GeoCdNmV=TermNmV2; CpDoc->IndCdNmV=AuthorNmV; CpDoc->Save(*SOut); } else {/*printf("[%s]", AccessionId.CStr());*/} break; } else if (FldStrV[0]=="TI"){ TitleStr=FldStrV[2]; } else if (FldStrV[0]=="TI"){ TitleStr=FldStrV[2]; } else if (FldStrV[0]=="AU"){ FldStrV[2].SplitOnAllCh(';', AuthorNmV); for (int StrN=0; StrN<AuthorNmV.Len(); StrN++){AuthorNmV[StrN].ToTrunc();} } else if (FldStrV[0]=="AB"){ AbstractStr=FldStrV[2]; } else if (FldStrV[0]=="PY"){ PublicationYearStr=FldStrV[2]; } else if (FldStrV[0]=="DE"){ FldStrV[2].SplitOnAllCh(';', TermNmV1); for (int StrN=0; StrN<TermNmV1.Len(); StrN++){TermNmV1[StrN].ToTrunc();} } else if 
(FldStrV[0]=="CL"){ FldStrV[2].SplitOnAllCh(';', TermNmV2); for (int StrN=0; StrN<TermNmV2.Len(); StrN++){TermNmV2[StrN].ToTrunc();} } else if (FldStrV[0]=="AN"){ AccessionId=FldStrV[2]; } } printf("%d\r", AccessionIdH.Len()); Lx.GetSym(syLn, syEof); } } }