Esempio n. 1
0
// Builds a result set from the HTML text of a news-search result page.
//   UrlStr  - URL the page was fetched from; stored in the result set and
//             used as the base when resolving the relative "Next" link
//   HtmlStr - HTML text of the page
// Returns the filled result set; the hit count is -1 when the header
// number cannot be parsed as an integer.
PRSet TRSet::NewNews(const TStr& UrlStr, const TStr& HtmlStr){
  // result set filled in below
  PRSet ResultSet=TRSet::New();

  // lexer over the page text
  PSIn PageSIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(PageSIn);

  // header: after the "Results" text four bold fields are read in order;
  // the first two (from-result, to-result) are read only to advance the lexer
  Lx.MoveToStrOrEof("Results");
  Lx.GetStrInTag("<B>", true);
  Lx.GetStrInTag("<B>", true);
  TStr TotalHitsStr=Lx.GetStrInTag("<B>", true);
  TotalHitsStr.DelChAll(','); // remove commas before the integer conversion
  TStr QueryStr=Lx.GetStrInTag("<B>", true);

  // hits: one per <TABLE WIDTH="75%">; the search also stops at
  // <DIV CLASS="n">, which the footer code below looks for
  while (true){
    Lx.MoveToBTagOrEof("<TABLE>", "WIDTH", "75%", "<DIV>", "CLASS", "n");
    const bool AtHitP=(Lx.Sym==hsyBTag)&&(Lx.UcChA=="<TABLE>");
    if (!AtHitP){break;}
    // first anchor of the hit
    Lx.MoveToBTagOrEof("<A>");
    if (Lx.Sym!=hsyBTag){break;}
    TStr HitUrlStr=Lx.GetArg("HREF");
    TStr AnchorIdStr=Lx.GetArg("ID");
    // an anchor whose ID ends in 'i' is treated as an image link;
    // the hit URL is then taken from the following anchor
    const bool ImageAnchorP=(!AnchorIdStr.Empty())&&(AnchorIdStr.LastCh()=='i');
    if (ImageAnchorP){
      Lx.MoveToBTagOrEof("<A>");
      if (Lx.Sym!=hsyBTag){break;}
      HitUrlStr=Lx.GetArg("HREF");
    }
    // title is the anchor text; the source name runs up to <NOBR>
    TStr HitTitleStr=Lx.GetStrToETag("<A>", true);
    TStr HitSrcNm=Lx.GetStrToBTag("<NOBR>", true);
    if (HitSrcNm.IsSuffix(" -")){
      // cut the " -" tail off the source name
      HitSrcNm=HitSrcNm.GetSubStr(0, HitSrcNm.Len()-3);
    }
    // context is the text between </NOBR> and </TABLE>
    Lx.MoveToETagOrEof("<NOBR>");
    TStr HitCtxStr=Lx.GetStrToETag("<TABLE>", true);
    ResultSet->AddHit(HitUrlStr, HitTitleStr, HitSrcNm, HitCtxStr);
  }

  // footer: if the hit loop stopped on a <DIV>, look for the "Next" link
  // and resolve it against the page URL
  TStr NextUrlStr;
  const bool AtFooterP=(Lx.Sym==hsyBTag)&&(Lx.UcChA=="<DIV>");
  if (AtFooterP){
    TStr RelUrlStr=Lx.GetHRefBeforeStr("Next");
    if (!RelUrlStr.Empty()){
      PUrl AbsUrl=TUrl::New(RelUrlStr, UrlStr);
      if (AbsUrl->IsOk()){NextUrlStr=AbsUrl->GetUrlStr();}
    }
  }

  // store page-level fields
  ResultSet->PutUrlStr(UrlStr);
  ResultSet->PutNextUrlStr(NextUrlStr);
  ResultSet->PutQueryStr(QueryStr);
  ResultSet->PutAllHits(TotalHitsStr.GetInt(-1));

  return ResultSet;
}
Esempio n. 2
0
/////////////////////////////////////////////////
// Google-Result-Set
// Builds a result set from the HTML text of a web-search result page.
//   UrlStr  - URL the page was fetched from; stored in the result set and
//             used as the base when resolving the relative "Next" link
//   HtmlStr - HTML text of the page
// Returns the filled result set; the hit count is -1 when the header
// number cannot be parsed as an integer. Hits are added with an empty
// source name.
PRSet TRSet::NewWeb(const TStr& UrlStr, const TStr& HtmlStr){
  // result set filled in below
  PRSet ResultSet=TRSet::New();

  // lexer over the page text
  PSIn PageSIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(PageSIn);

  // header: after the "Results" text four bold fields are read in order;
  // the first two (from-result, to-result) are read only to advance the lexer
  Lx.MoveToStrOrEof("Results");
  Lx.GetStrInTag("<B>", true);
  Lx.GetStrInTag("<B>", true);
  TStr TotalHitsStr=Lx.GetStrInTag("<B>", true);
  TotalHitsStr.DelChAll(','); // remove commas before the integer conversion
  TStr QueryStr=Lx.GetStrInTag("<B>", true);

  // hits: one per <DIV CLASS="g">; the search also stops at
  // <BR CLEAR="all">, which the footer code below looks for
  while (true){
    Lx.MoveToBTagOrEof("<DIV>", "CLASS", "g", "<BR>", "CLEAR", "all");
    const bool AtHitP=(Lx.Sym==hsyBTag)&&(Lx.UcChA=="<DIV>");
    if (!AtHitP){break;}
    // the hit's anchor carries the URL and the title
    Lx.MoveToBTagOrEof("<A>");
    if (Lx.Sym!=hsyBTag){break;}
    TStr HitUrlStr=Lx.GetArg("HREF");
    TStr HitTitleStr=Lx.GetStrToETag("<A>", true);
    // context is the text from the next <DIV> up to the next <BR>
    Lx.MoveToBTagOrEof("<DIV>");
    TStr HitCtxStr=Lx.GetStrToBTag("<BR>", true);
    ResultSet->AddHit(HitUrlStr, HitTitleStr, "", HitCtxStr);
  }

  // footer: if the lexer is positioned on a <BR>, look for the "Next" link
  // and resolve it against the page URL
  TStr NextUrlStr;
  const bool AtFooterP=(Lx.Sym==hsyBTag)&&(Lx.UcChA=="<BR>");
  if (AtFooterP){
    TStr RelUrlStr=Lx.GetHRefBeforeStr("Next");
    if (!RelUrlStr.Empty()){
      PUrl AbsUrl=TUrl::New(RelUrlStr, UrlStr);
      if (AbsUrl->IsOk()){NextUrlStr=AbsUrl->GetUrlStr();}
    }
  }

  // store page-level fields
  ResultSet->PutUrlStr(UrlStr);
  ResultSet->PutNextUrlStr(NextUrlStr);
  ResultSet->PutQueryStr(QueryStr);
  ResultSet->PutAllHits(TotalHitsStr.GetInt(-1));

  return ResultSet;
}
void TTokenizerHtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	THtmlLx HtmlLx(SIn, false);
    // traverse html string symbols
	while (HtmlLx.Sym!=hsyEof){
		if (HtmlLx.Sym==hsyStr){
			TStr UcStr = HtmlLx.UcChA;
			// check if stop word
			if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA);
				if (!Stemmer.Empty()) { 
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr.GetLc());
			}
		}
		// get next symbol
		HtmlLx.GetSym();
	}

 //   // tokenize
 //   TStrV TokenStrV; Tokenizer->GetTokens(TextStr, TokenStrV);
 //   // transform words to IDs
 //   const int Tokens = TokenStrV.Len();
 //   TIntV TokenIdV(Tokens, 0);
 //   for (int TokenN = 0; TokenN < Tokens; TokenN++) {
 //       // add token to the hashtable of all tokens
 //       const int TokenId = WordH.AddKey(TokenStrV[TokenN].GetUc());
 //       // keep track of it's count
 //       WordH[TokenId]++;
 //       // and prepare a token vector for ngram base
 //       TokenIdV.Add(TokenId);
 //   }
	//// extract the n-grams
 //   TNGramDescV NGramDescV;
	//NGramBs->AddDocTokIdV(TokenIdV, StoreThreshold, NGramDescV);
 //   // get string representations of n-grams above threshold
 //   TStrH NGramH;
 //   for (int NGramDescN = 0; NGramDescN < NGramDescV.Len(); NGramDescN++) {
 //       const TNGramDesc& NGramDesc = NGramDescV[NGramDescN];
 //       // make it into a string
 //       const TIntV& NGramTokenIdV = NGramDesc.TokIdV;
 //       TChA NGramChA = WordH.GetKey(NGramTokenIdV[0]);
 //       for (int NGramTokenIdN = 1; NGramTokenIdN < NGramTokenIdV.Len(); NGramTokenIdN++) {
 //           NGramChA += ' '; NGramChA += WordH.GetKey(NGramTokenIdV[NGramTokenIdN]);
 //       }
 //       // remember the ngram, if not stopword
 //       if (!SwSet->IsIn(NGramChA)) { NGramH.AddDat(NGramChA); }
 //   }
 //   // remember n-grams above threshold
 //   int NGramKeyId = NGramH.FFirstKeyId();
 //   while (NGramH.FNextKeyId(NGramKeyId)) {
 //       const TStr& NGramStr = NGramH.GetKey(NGramKeyId);
 //       // add to the result list
 //       ConceptV.Add(TOgNewsConcept(NGramStr, EmtpyStr));
 //   }
}
Esempio n. 4
0
void TFtrGenToken::GetTokenV(const TStr& Str, TStrV& TokenStrV) const {
    THtmlLx HtmlLx(TStrIn::New(Str));
    while (HtmlLx.Sym != hsyEof){
        if (HtmlLx.Sym == hsyStr){ 
            TStr TokenStr = HtmlLx.UcChA;
            if (SwSet.Empty() || !SwSet->IsIn(TokenStr)) { 
                if (!Stemmer.Empty()) { 
                    TokenStr = Stemmer->GetStem(TokenStr); } 
                TokenStrV.Add(TokenStr);
            }
        }
        // get next symbol
        HtmlLx.GetSym();
    }
}
Esempio n. 5
0
void THtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const {
	THtmlLx HtmlLx(SIn, false);
    // traverse html string symbols
	while (HtmlLx.Sym!=hsyEof){
		if (HtmlLx.Sym==hsyStr){
			TStr UcStr = HtmlLx.UcChA;
			// check if stop word
			if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) {
				TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA);
				if (!Stemmer.Empty()) { 
					TokenStr = Stemmer->GetStem(TokenStr); }
				TokenV.Add(TokenStr.GetLc());
			}
		}
		// get next symbol
		HtmlLx.GetSym();
	}
}
Esempio n. 6
0
void TNytNGramBs::GetNGramStrV(const TStr& HtmlStr, TStrV& NGramStrV){
  NGramStrV.Clr();
  // prepare html parsing
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  // process text
  TStrV StrV;
  while (HtmlLx.GetSym()!=hsyEof){
    TStr Str=HtmlLx.ChA;
    Str.ToLc();
    switch (HtmlLx.Sym){
      case hsyUndef:
      case hsyUrl:
      case hsyMTag:
      case hsySSym:
        StrV.Clr();
        break;
      case hsyStr:
      case hsyNum:
        NGramStrV.Add(Str);
        StrV.Add(Str);
        for (int NGramLen=2; NGramLen<=4; NGramLen++){
          if (StrV.Len()<NGramLen){break;}
          TStrV TermStrV(NGramLen, 0);
          for (int StrN=StrV.Len()-NGramLen; StrN<StrV.Len(); StrN++){
            TermStrV.Add(StrV[StrN]);
          }
          int NGramId;
          if (IsNGram(TermStrV, NGramId)){
            TStr NGramStr=GetNGramStr(NGramId);
            NGramStrV.Add(NGramStr);
          }
        } 
        break;
      case hsyBTag:
      case hsyETag:
        StrV.Clr();
        break;
      case hsyEof: break;
      default: Fail;
    }
  }
}
Esempio n. 7
0
// Builds a TAmazonItem from a downloaded product page.
//   WebPg - page to parse; its URL supplies the item id and is the base for
//           resolving relative links, its HTTP body supplies the HTML
// Returns a new item holding the item id, the title, the author names and
// the item ids of the cross-sell links.
//
// The page is scanned in one forward pass:
//   1. skip to the first <FONT> with FACE="verdana,arial,helvetica" and an
//      empty SIZE argument
//   2. title = text of all symbols up to the closing </FONT>
//   3. authors = text of each <A>...</A> up to the next closing </FONT>
//   4. skip to the consecutive strings "Customers" "who" "bought"
//   5. cross-sell ids = item ids of every <A HREF> up to the closing </UL>
// Each step stops at end of input if its marker is not found; the later
// steps then find nothing.
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  THtmlLxSym Sym; TChA ChA;

  // move to title
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }
  // extract title; symbols are joined with the white-space that preceded them
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;
  // extract authors
  TStrV AuthorNmV;
  TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // collect the anchor text; the loop also ends at end of input so that
      // an anchor without a closing </A> cannot spin forever
      do {
        HtmlLx.GetSym();
        Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      } while ((Sym!=hsyEof)&&(!((Sym==hsyETag)&&(ChA=="<A>"))));
      AuthorNmV.Add(AuthorNmChA); AuthorNmChA.Clr();
      // input ended inside the anchor: nothing more to read
      if (Sym==hsyEof){break;}
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }
  // move to x-sell: wait for three consecutive strings
  // "Customers" "who" "bought"; any non-string symbol restarts the match
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      PrevStrQ.Clr();
    }
  }
  // extract x-sell pointers
  // NOTE(review): the resolved URL is passed to GetItemId without an IsOk()
  // check - confirm GetItemId tolerates URLs that failed to resolve
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }

  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
Esempio n. 8
0
/////////////////////////////////////////////////
// Google-Scholar-Result-Set
// Builds a scholar result set from the HTML text of a result page.
//   UrlStr  - URL the page was fetched from; stored in the result set and
//             used as the base for the relative "Cited by" and "Next" links
//   HtmlStr - HTML text of the page
// Returns the filled result set; the hit count is -1 when the header
// number cannot be parsed as an integer.
PGgSchRSet TGgSchRSet::NewScholar(const TStr& UrlStr, const TStr& HtmlStr){
  // result set filled in below
  PGgSchRSet ResultSet=TGgSchRSet::New();

  // lexer over the page text
  PSIn PageSIn=TStrIn::New(HtmlStr);
  THtmlLx Lx(PageSIn);

  // header: after the "Results" text four bold fields are read in order;
  // the first two (from-result, to-result) are read only to advance the lexer
  Lx.MoveToStrOrEof("Results");
  Lx.GetStrInTag("<B>", true);
  Lx.GetStrInTag("<B>", true);
  TStr TotalHitsStr=Lx.GetStrInTag("<B>", true);
  TotalHitsStr.DelChAll(','); // remove commas before the integer conversion
  TStr QueryStr=Lx.GetStrInTag("<B>", true);

  // hits: every hit starts at a <P> tag; the loop runs while the lexer is
  // positioned on one
  Lx.MoveToBTagOrEof("<P>");
  while ((Lx.Sym==hsyBTag)&&(Lx.UcChA=="<P>")){
    // the symbol after <P> must be one of two known begin-tags
    Lx.GetSym();
    if (Lx.Sym!=hsyBTag){break;}
    TStr FirstTagStr=Lx.GetFullBTagStr();
    if (FirstTagStr=="<FONT SIZE=\"-2\">"){
      // bold text inside the small font is read only to advance the lexer
      Lx.GetStrInTag("<B>", true);
    } else
    if (!(FirstTagStr=="<SPAN CLASS=\"w\">")){
      break;
    }

    // title runs to the first <BR>; a leading "[PS] " marker is removed
    TStr TitleStr=Lx.GetStrToBTag("<BR>", true).GetTrunc();
    if (TitleStr.IsPrefix("[PS] ")){
      TitleStr=TitleStr.GetSubStr(5, TitleStr.Len()).GetTrunc();
    }

    // the line up to the next <BR> holds authors and publication data
    TStr AuthNmVPubStr=Lx.GetStrToBTag("<BR>", true);
    TStrV AuthNmV; TStr PubNm; TStr PubYearStr;
    TGgSchRef::GetAuthNmVPubStr(AuthNmVPubStr, AuthNmV, PubNm, PubYearStr);

    // the first anchor before the next <P>/<DIV> is checked for a
    // "Cited by N" link
    TStr CitedByUrlStr; int Citations=0;
    Lx.MoveToBTag3OrEof("<A>", "<P>", "<DIV>");
    const bool AtAnchorP=(Lx.Sym==hsyBTag)&&(Lx.ChA=="<A>");
    if (AtAnchorP){
      TStr CitedByRelUrlStr=Lx.GetArg("HREF");
      TStr AnchorStr=Lx.GetStrToETag("<A>", true);
      if (AnchorStr.IsPrefix("Cited by ")){
        PUrl CitedByUrl=TUrl::New(CitedByRelUrlStr, UrlStr);
        if (CitedByUrl->IsOk()){
          CitedByUrlStr=CitedByUrl->GetUrlStr();
          // the number follows the "Cited by " prefix; 0 if it does not parse
          TStr CountStr=AnchorStr.GetSubStr(TStr("Cited by ").Len(), AnchorStr.Len());
          Citations=CountStr.GetInt(0);
        }
      }
      // advance to the next hit (<P>) or the footer (<DIV>)
      Lx.MoveToBTag2OrEof("<P>", "<DIV>");
    }

    PGgSchRef Ref=TGgSchRef::New(
     TitleStr, AuthNmV, PubNm, PubYearStr, Citations, CitedByUrlStr);
    ResultSet->AddHit(Ref);
  }

  // footer: if the hit loop stopped on a <DIV>, look for the "Next" link
  // and resolve it against the page URL
  TStr NextUrlStr;
  const bool AtFooterP=(Lx.Sym==hsyBTag)&&(Lx.UcChA=="<DIV>");
  if (AtFooterP){
    TStr RelUrlStr=Lx.GetHRefBeforeStr("Next");
    if (!RelUrlStr.Empty()){
      PUrl AbsUrl=TUrl::New(RelUrlStr, UrlStr);
      if (AbsUrl->IsOk()){NextUrlStr=AbsUrl->GetUrlStr();}
    }
  }

  // store page-level fields
  ResultSet->PutUrlStr(UrlStr);
  ResultSet->PutNextUrlStr(NextUrlStr);
  ResultSet->PutQueryStr(QueryStr);
  ResultSet->PutAllHits(TotalHitsStr.GetInt(-1));

  return ResultSet;
}
Esempio n. 9
0
void TNGramBs::GetNGramIdV(
 const TStr& HtmlStr, TIntV& NGramIdV, TIntPrV& NGramBEChXPrV) const {
  // create MxNGramLen queues
  TVec<TIntQ> WIdQV(MxNGramLen);
  TVec<TIntPrQ> BEChXPrQV(MxNGramLen);
  for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
    WIdQV[NGramLen].Gen(100*NGramLen, NGramLen+1);
    BEChXPrQV[NGramLen].Gen(100*NGramLen, NGramLen+1);
  }
  bool AllWIdQClrP=true;
  // extract words from text-string
  PSIn HtmlSIn=TStrIn::New(HtmlStr, false);
  THtmlLx HtmlLx(HtmlSIn);
  while (HtmlLx.Sym!=hsyEof){
    if ((HtmlLx.Sym==hsyStr)||(HtmlLx.Sym==hsyNum)){
      // get word-string & word-id
      TStr WordStr=HtmlLx.UcChA;
      int WId; int SymBChX=HtmlLx.SymBChX; int SymEChX=HtmlLx.SymEChX;
      if ((SwSet.Empty())||(!SwSet->IsIn(WordStr))){
        if (!Stemmer.Empty()){
          WordStr=Stemmer->GetStem(WordStr);}
        if (IsWord(WordStr, WId)){
          if (!IsSkipWord(WId)){
            NGramIdV.Add(0+WId); // add single word
            NGramBEChXPrV.Add(TIntPr(SymBChX, SymEChX)); // add positions
            for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
              TIntQ& WIdQ=WIdQV[NGramLen];
              TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen];
              WIdQ.Push(WId); BEChXPrQ.Push(TIntPr(SymBChX, SymEChX));
              AllWIdQClrP=false;
              // if queue full
              if (WIdQ.Len()==NGramLen+1){
                // create sequence
                TIntV WIdV; WIdQ.GetSubValVec(0, WIdQ.Len()-1, WIdV);
                TIntPrV BEChXPrV; BEChXPrQ.GetSubValVec(0, BEChXPrQ.Len()-1, BEChXPrV);
                // add ngram-id or reset queues
                int WIdVP;
                if (WIdVToFqH.IsKey(WIdV, WIdVP)){ // if sequence is frequent
                  int NGramId=GetWords()+WIdVP; // get sequence ngram-id
                  NGramIdV.Add(NGramId); // add sequence ngram-id
                  NGramBEChXPrV.Add(TIntPr(BEChXPrV[0].Val1, BEChXPrV.Last().Val2)); // add positions
                }
              }
            }
          }
        } else {
          // break queue sequences if infrequent word occures
          if (!AllWIdQClrP){
            for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){
              TIntQ& WIdQ=WIdQV[NGramLen];
              TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen];
              if (!WIdQ.Empty()){WIdQ.Clr(); BEChXPrQ.Clr();}
            }
            AllWIdQClrP=true;
          }
        }
      }
    }
    // get next symbol
    HtmlLx.GetSym();
  }
}