// Builds an n-gram base from Reuters-21578 documents stored in ".SGM" files.
//
// The file set is scanned repeatedly: each iteration of the outer loop is one
// pass that re-enumerates the files, hands the <TEXT> element of every
// <REUTERS> document to _UpdateNGramBsFromHtmlStr(), and is closed with
// ConcPass(). Passes are repeated until the n-gram base reports IsFinished().
//
// Parameters:
//   FPath      - path handed to TFFile to enumerate files with extension ".SGM"
//   MxDocs     - maximal number of documents used in each pass; -1 = no limit
//   MxNGramLen - forwarded to TNGramBs::New()
//   MnNGramFq  - forwarded to TNGramBs::New()
//   SwSet      - stop-word set, forwarded to TNGramBs::New() and to the updater
//   Stemmer    - stemmer, forwarded to TNGramBs::New() and to the updater
//
// Returns the n-gram base after the last pass.
PNGramBs TNGramBs::GetNGramBsFromReuters21578(
 const TStr& FPath, const int& MxDocs,
 const int& MxNGramLen, const int& MnNGramFq,
 const PSwSet& SwSet, const PStemmer& Stemmer){
  // create n-gram-base
  /* printf("Generating frequent n-grams (MaxLen:%d MinFq:%d) ...\n", MxNGramLen, MnNGramFq); */
  PNGramBs NGramBs=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  const bool LimitDocsP=(MxDocs!=-1);
  // iterations over document-set (one iteration == one pass)
  while (!NGramBs->IsFinished()){
    // the document counter is restarted for every pass, so the limit is per pass
    TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
    while (FFile.Next(FNm)){
      /* printf("Processing file '%s'\n", FNm.CStr()); */
      TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
      for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
        // test the limit before counting the document, so that exactly MxDocs
        // documents are processed (counting first and testing with >= stopped
        // one document early)
        if (LimitDocsP&&(Docs>=MxDocs)){break;}
        Docs++;
        /* printf("  Pass %2d: %6d Docs\r", NGramBs->GetPassN(), Docs); */
        PXmlDoc Doc=LDocV[LDocN];
        // get html string; a document without <TEXT> contributes no n-grams
        PXmlTok Tok=Doc->GetTagTok("REUTERS|TEXT");
        if (Tok.Empty()){continue;}
        TStr HtmlStr=Tok->GetTokStr(false);
        // extract words & update n-gram-base
        _UpdateNGramBsFromHtmlStr(NGramBs, HtmlStr, SwSet, Stemmer);
      }
      // limit reached inside this file: do not open further files in this pass
      if (LimitDocsP&&(Docs>=MxDocs)){break;}
    }
    NGramBs->ConcPass();
    /* printf("  Pass %2d: %6d Docs\r", NGramBs->GetPassN(), Docs); */
  }
  /* printf("\nDone.\n"); */
  // return
  return NGramBs;
}
// Loads Reuters-21578 documents from ".SGM" files into a bag-of-words base.
//
// For every <REUTERS> document the NEWID attribute becomes the document name,
// the <D> entries under <TOPICS> become its categories and the <TEXT> element
// becomes its contents. Documents whose TOPICS attribute is "YES" are
// registered as training or testing documents according to the LEWISSPLIT
// attribute ("TRAIN" / "TEST").
//
// Parameters:
//   FPath         - path handed to TFFile to enumerate files with extension ".SGM"
//   MxDocs        - maximal number of documents to load; -1 = no limit
//   SwSetTypeNm   - stop-word set type name, resolved with TSwSet::GetSwSet()
//   StemmerTypeNm - stemmer type name, resolved with TStemmer::GetStemmer()
//   MxNGramLen    - n-gram length limit; see MnNGramFq
//   MnNGramFq     - n-gram frequency limit; an n-gram base is built unless both
//                   MxNGramLen and MnNGramFq equal 1
//   SaveDocP      - forwarded to TBowDocBs::AddHtmlDoc()
//   Notify        - not used by this implementation; progress is written to
//                   stdout with printf
//
// Returns the bag-of-words base after AssertOk() has been called on it.
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq,
 const bool& SaveDocP, const PNotify& Notify){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams (the handle stays default-constructed when both limits are 1)
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromReuters21578(
     FPath, MxDocs, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  const bool LimitDocsP=(MxDocs!=-1);
  // traverse directory
  TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      // test the limit before counting the document, so that exactly MxDocs
      // documents are loaded (counting first and testing with >= stopped one
      // document early)
      if (LimitDocsP&&(Docs>=MxDocs)){break;}
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      // get document-name
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      TStr DocNm=DocTok->GetArgVal("NEWID");
      // get document-categories
      TStrV CatNmV;
      TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
      for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
        TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
        CatNmV.Add(CatNm);
      }
      // get document-contents; a missing <TEXT> yields empty contents instead
      // of dereferencing an empty token handle
      PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
      TStr DocStr;
      if (!DocStrTok.Empty()){DocStr=DocStrTok->GetTokStr(false);}
      // add document to bow
      int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
      // train & test data; the TOPICS attribute is looked up only when the
      // split attribute is one of the two recognised values
      TStr SplitStr=DocTok->GetArgVal("LEWISSPLIT");
      const bool TrainP=(SplitStr=="TRAIN");
      const bool TestP=(SplitStr=="TEST");
      if ((TrainP||TestP)&&(DocTok->GetArgVal("TOPICS")=="YES")){
        if (TrainP){BowDocBs->AddTrainDId(DId);}
        else {BowDocBs->AddTestDId(DId);}
      }
    }
    // limit reached inside this file: do not open further files
    if (LimitDocsP&&(Docs>=MxDocs)){break;}
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
void TCpDoc::SaveReuters21578ToCpd( const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){ // create output file PSOut SOut=TFOut::New(OutCpdFNm); // traverse directory with .sgm files TFFile FFile(InFPath, ".SGM", false); TStr FNm; int Docs=0; while (FFile.Next(FNm)){ printf("Processing file '%s'\n", FNm.CStr()); TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV); for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){ Docs++; if (Docs%100==0){printf("%d\r", Docs);} if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} // create reuters document PCpDoc CpDoc=TCpDoc::New(); // load xml document PXmlDoc Doc=LDocV[LDocN]; PXmlTok DocTok=Doc->GetTok(); // document id CpDoc->DocNm=DocTok->GetArgVal("NEWID"); // date CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false); // document title PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE"); if (!TitleTok.Empty()){ CpDoc->TitleStr=TitleTok->GetTokStr(false);} // dateline PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE"); if (!DatelineTok.Empty()){ CpDoc->DatelineStr=DatelineTok->GetTokStr(false);} // get text string TStr TextStr; PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY"); if (!BodyTok.Empty()){ TextStr=BodyTok->GetTokStr(false); } else { // if <BODY> doesn't exist, take the whole <TEXT> PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT"); if (!TextTok.Empty()){ TextStr=TextTok->GetTokStr(false); } } CpDoc->ParStrV.Add(TextStr, 1); // topic categories TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV); for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){ TStr CatNm=TopCatTokV[TokN]->GetTokStr(false); CpDoc->TopCdNmV.Add(CatNm); } // save cpd document CpDoc->Save(*SOut); } if ((MxDocs!=-1)&&(Docs>MxDocs)){break;} } }