Beispiel #1
0
// Builds an n-gram base from the Reuters-21578 ".SGM" files found in FPath.
//
// The n-gram base is created with MxNGramLen, MnNGramFq, SwSet and Stemmer.
// The file set is then scanned repeatedly: every scan feeds the <TEXT>
// content of each document to _UpdateNGramBsFromHtmlStr and is closed with
// ConcPass(); scanning stops once NGramBs->IsFinished() reports true.
//
// FPath  - directory that is searched for ".SGM" files
// MxDocs - maximal number of documents processed per scan; -1 means no limit
// Returns the n-gram base after the last pass.
PNGramBs TNGramBs::GetNGramBsFromReuters21578(
 const TStr& FPath, const int& MxDocs,
 const int& MxNGramLen, const int& MnNGramFq,
 const PSwSet& SwSet, const PStemmer& Stemmer){
  // create n-gram-base
  /* printf("Generating frequent n-grams (MaxLen:%d MinFq:%d) ...\n", MxNGramLen, MnNGramFq); */
  PNGramBs NGramBs=TNGramBs::New(MxNGramLen, MnNGramFq, SwSet, Stemmer);
  // iterations over document-set (one full scan of the files per pass)
  while (!NGramBs->IsFinished()){
    // the file iterator and the document counter restart with every pass
    TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
    while (FFile.Next(FNm)){
      /* printf("Processing file '%s'\n", FNm.CStr()); */
      TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
      for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
        Docs++;
        /* printf("  Pass %2d: %6d Docs\r", NGramBs->GetPassN(), Docs); */
        // Docs already counts the current document, so '>' (not '>=') is
        // what lets exactly MxDocs documents through
        if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
        PXmlDoc Doc=LDocV[LDocN];
        // get html string; a document without <TEXT> contributes no words
        PXmlTok Tok=Doc->GetTagTok("REUTERS|TEXT");
        if (Tok.Empty()){continue;}
        TStr HtmlStr=Tok->GetTokStr(false);
        // extract words & update n-gram-base
        _UpdateNGramBsFromHtmlStr(NGramBs, HtmlStr, SwSet, Stemmer);
      }
      // propagate the inner-loop break to the file loop
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
    }
    NGramBs->ConcPass();
    /* printf("  Pass %2d: %6d Docs\r", NGramBs->GetPassN(), Docs); */
  }
  /* printf("\nDone.\n"); */
  // return
  return NGramBs;
}
Beispiel #2
0
// Loads Reuters-21578 documents from the ".SGM" files found in FPath into a
// new bag-of-words document base.
//
// For every document the NEWID attribute becomes the document name, the
// <TOPICS><D> elements become its categories and the <TEXT> content is added
// through AddHtmlDoc. Documents whose TOPICS attribute is "YES" are registered
// as training or testing documents according to their LEWISSPLIT attribute
// ("TRAIN" / "TEST").
//
// FPath         - directory that is searched for ".SGM" files
// MxDocs        - maximal number of documents to load; -1 means no limit
// SwSetTypeNm   - stop-word set name passed to TSwSet::GetSwSet
// StemmerTypeNm - stemmer name passed to TStemmer::GetStemmer
// MxNGramLen, MnNGramFq - n-gram settings; an n-gram base is built only when
//                 they are not both equal to 1
// SaveDocP      - forwarded to AddHtmlDoc
// Notify        - not used by this function
// Returns the populated document base after AssertOk() has been called on it.
PBowDocBs TBowFl::LoadReuters21578Txt(
 const TStr& FPath, const int& MxDocs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq, const bool& SaveDocP,
 const PNotify& Notify){
  // prepare stop-words
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  // prepare stemmer
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams (NGramBs stays default-constructed for the 1/1 setting)
  PNGramBs NGramBs;
  if (!((MxNGramLen==1)&&(MnNGramFq==1))){
    NGramBs=TNGramBs::GetNGramBsFromReuters21578(
     FPath, MxDocs,
     MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  // traverse directory
  TFFile FFile(FPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      // Docs already counts the current document, so '>' (not '>=') is
      // what lets exactly MxDocs documents through
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
      // get document-name
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      TStr DocNm=DocTok->GetArgVal("NEWID");
      // get document-categories
      TStrV CatNmV;
      TXmlTokV TopicsTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopicsTokV);
      for (int TopicsTokN=0; TopicsTokN<TopicsTokV.Len(); TopicsTokN++){
        TStr CatNm=TopicsTokV[TopicsTokN]->GetTokStr(false);
        CatNmV.Add(CatNm);
      }
      // get document-contents; stays empty when <TEXT> is missing
      TStr DocStr;
      PXmlTok DocStrTok=Doc->GetTagTok("REUTERS|TEXT");
      if (!DocStrTok.Empty()){
        DocStr=DocStrTok->GetTokStr(false);}
      // add document to bow
      int DId=BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocStr, SaveDocP);
      // train & test data; TOPICS is read only for TRAIN/TEST documents
      TStr SplitNm=DocTok->GetArgVal("LEWISSPLIT");
      bool TrainP=(SplitNm=="TRAIN");
      bool TestP=(SplitNm=="TEST");
      if ((TrainP||TestP)&&(DocTok->GetArgVal("TOPICS")=="YES")){
        if (TrainP){
          BowDocBs->AddTrainDId(DId);
        } else {
          BowDocBs->AddTestDId(DId);
        }
      }
    }
    // propagate the inner-loop break to the file loop
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
  }
  // return results
  BowDocBs->AssertOk();
  return BowDocBs;
}
Beispiel #3
0
void TCpDoc::SaveReuters21578ToCpd(
 const TStr& InFPath, const TStr& OutCpdFNm, const int& MxDocs){
  // create output file
  PSOut SOut=TFOut::New(OutCpdFNm);
  // traverse directory with .sgm files
  TFFile FFile(InFPath, ".SGM", false); TStr FNm; int Docs=0;
  while (FFile.Next(FNm)){
    printf("Processing file '%s'\n", FNm.CStr());
    TXmlDocV LDocV; TXmlDoc::LoadTxt(FNm, LDocV);
    for (int LDocN=0; LDocN<LDocV.Len(); LDocN++){
      Docs++; if (Docs%100==0){printf("%d\r", Docs);}
      if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
      // create reuters document
      PCpDoc CpDoc=TCpDoc::New();
      // load xml document
      PXmlDoc Doc=LDocV[LDocN];
      PXmlTok DocTok=Doc->GetTok();
      // document id
      CpDoc->DocNm=DocTok->GetArgVal("NEWID");
      // date
      CpDoc->DateStr=Doc->GetTagTok("REUTERS|DATE")->GetTokStr(false);
      // document title
      PXmlTok TitleTok=Doc->GetTagTok("REUTERS|TEXT|TITLE");
      if (!TitleTok.Empty()){
        CpDoc->TitleStr=TitleTok->GetTokStr(false);}
      // dateline
      PXmlTok DatelineTok=Doc->GetTagTok("REUTERS|TEXT|DATELINE");
      if (!DatelineTok.Empty()){
        CpDoc->DatelineStr=DatelineTok->GetTokStr(false);}
      // get text string
      TStr TextStr;
      PXmlTok BodyTok=Doc->GetTagTok("REUTERS|TEXT|BODY");
      if (!BodyTok.Empty()){
        TextStr=BodyTok->GetTokStr(false);
      } else {
        // if <BODY> doesn't exist, take the whole <TEXT>
        PXmlTok TextTok=Doc->GetTagTok("REUTERS|TEXT");
        if (!TextTok.Empty()){
          TextStr=TextTok->GetTokStr(false);
        }
      }
      CpDoc->ParStrV.Add(TextStr, 1);
      // topic categories
      TXmlTokV TopCatTokV; Doc->GetTagTokV("REUTERS|TOPICS|D", TopCatTokV);
      for (int TokN=0; TokN<TopCatTokV.Len(); TokN++){
        TStr CatNm=TopCatTokV[TokN]->GetTokStr(false);
        CpDoc->TopCdNmV.Add(CatNm);
      }
      // save cpd document
      CpDoc->Save(*SOut);
    }
    if ((MxDocs!=-1)&&(Docs>MxDocs)){break;}
  }
}