// Parses a news-search result page into a result set.
//   UrlStr  - URL the page was fetched from; stored on the result and used
//             as the base when resolving the relative "Next" link.
//   HtmlStr - HTML source of the page.
// Returns the populated result set (hits, query, total hit count, next-page
// URL). The total hit count is stored as -1 when it cannot be parsed.
PRSet TRSet::NewNews(const TStr& UrlStr, const TStr& HtmlStr){
  PRSet ResultSet = TRSet::New();
  PSIn PageSIn = TStrIn::New(HtmlStr);
  THtmlLx Lx(PageSIn);

  // header: after the "Results" text come four bold fields; the first two
  // (from/to result positions) are read only to advance the lexer
  Lx.MoveToStrOrEof("Results");
  Lx.GetStrInTag("<B>", true);
  Lx.GetStrInTag("<B>", true);
  TStr HitCountStr = Lx.GetStrInTag("<B>", true);
  HitCountStr.DelChAll(',');  // drop thousands separators before GetInt
  TStr QueryStr = Lx.GetStrInTag("<B>", true);

  // hits: each one starts at <TABLE WIDTH="75%">; a <DIV CLASS="n"> (or end
  // of input) ends the list
  while (true) {
    Lx.MoveToBTagOrEof("<TABLE>", "WIDTH", "75%", "<DIV>", "CLASS", "n");
    const bool AtHitTableP = (Lx.Sym == hsyBTag) && (Lx.UcChA == "<TABLE>");
    if (!AtHitTableP) { break; }

    Lx.MoveToBTagOrEof("<A>");
    if (Lx.Sym != hsyBTag) { break; }
    TStr HitUrlStr = Lx.GetArg("HREF");
    TStr AnchorIdStr = Lx.GetArg("ID");

    // an anchor whose ID ends in 'i' is treated as the image link;
    // the hit's real link is the next anchor
    const bool ImageAnchorP =
      (!AnchorIdStr.Empty()) && (AnchorIdStr.LastCh() == 'i');
    if (ImageAnchorP) {
      Lx.MoveToBTagOrEof("<A>");
      if (Lx.Sym != hsyBTag) { break; }
      HitUrlStr = Lx.GetArg("HREF");
    }

    TStr HitTitleStr = Lx.GetStrToETag("<A>", true);

    // source name runs up to <NOBR>; strip a trailing " -"
    TStr HitSrcNm = Lx.GetStrToBTag("<NOBR>", true);
    if (HitSrcNm.IsSuffix(" -")) {
      HitSrcNm = HitSrcNm.GetSubStr(0, HitSrcNm.Len() - 3);
    }

    // context is everything between </NOBR> and </TABLE>
    Lx.MoveToETagOrEof("<NOBR>");
    TStr HitCtxStr = Lx.GetStrToETag("<TABLE>", true);

    ResultSet->AddHit(HitUrlStr, HitTitleStr, HitSrcNm, HitCtxStr);
  }

  // footer: only looked for when the hit loop stopped on the <DIV> marker
  TStr NextUrlStr;
  const bool AtFooterP = (Lx.Sym == hsyBTag) && (Lx.UcChA == "<DIV>");
  if (AtFooterP) {
    TStr NextRelUrlStr = Lx.GetHRefBeforeStr("Next");
    if (!NextRelUrlStr.Empty()) {
      PUrl NextUrl = TUrl::New(NextRelUrlStr, UrlStr);
      if (NextUrl->IsOk()) {
        NextUrlStr = NextUrl->GetUrlStr();
      }
    }
  }

  // store page-level components
  ResultSet->PutUrlStr(UrlStr);
  ResultSet->PutNextUrlStr(NextUrlStr);
  ResultSet->PutQueryStr(QueryStr);
  ResultSet->PutAllHits(HitCountStr.GetInt(-1));
  return ResultSet;
}
///////////////////////////////////////////////// // Google-Result-Set PRSet TRSet::NewWeb(const TStr& UrlStr, const TStr& HtmlStr){ // prepare object PRSet RSet=TRSet::New(); // prepare html browsing PSIn HtmlSIn=TStrIn::New(HtmlStr); // HtmlStr.SaveTxt("RSet.Html"); // when testing THtmlLx HtmlLx(HtmlSIn); // extract header info HtmlLx.MoveToStrOrEof("Results"); TStr FromResultStr=HtmlLx.GetStrInTag("<B>", true); TStr ToResultStr=HtmlLx.GetStrInTag("<B>", true); TStr AllHitsStr=HtmlLx.GetStrInTag("<B>", true); AllHitsStr.DelChAll(','); TStr QueryStr=HtmlLx.GetStrInTag("<B>", true); // traverse hits forever { HtmlLx.MoveToBTagOrEof("<DIV>", "CLASS", "g", "<BR>", "CLEAR", "all"); if (!((HtmlLx.Sym==hsyBTag)&&(HtmlLx.UcChA=="<DIV>"))){break;} //HtmlLx.MoveToBTagOrEof("<P>", "CLASS", "g", "<DIV>", "CLASS", "n"); //if (!((HtmlLx.Sym==hsyBTag)&&(HtmlLx.UcChA=="<P>"))){break;} HtmlLx.MoveToBTagOrEof("<A>"); if (HtmlLx.Sym!=hsyBTag){break;} TStr HitUrlStr=HtmlLx.GetArg("HREF"); TStr HitTitleStr=HtmlLx.GetStrToETag("<A>", true); //HtmlLx.MoveToBTagOrEof("<FONT>"); HtmlLx.MoveToBTagOrEof("<DIV>"); //TStr HitCtxStr=HtmlLx.GetStrToBTag("<FONT>", "COLOR", "#008000", true); //TStr HitCtxStr=HtmlLx.GetStrToBTag("<SPAN>", "CLASS", "a", true); TStr HitCtxStr=HtmlLx.GetStrToBTag("<BR>", true); RSet->AddHit(HitUrlStr, HitTitleStr, "", HitCtxStr); } // extract footer info TStr NextUrlStr; if ((HtmlLx.Sym==hsyBTag)&&(HtmlLx.UcChA=="<BR>")){ TStr NextRelUrlStr=HtmlLx.GetHRefBeforeStr("Next"); if (!NextRelUrlStr.Empty()){ PUrl NextUrl=TUrl::New(NextRelUrlStr, UrlStr); if (NextUrl->IsOk()){ NextUrlStr=NextUrl->GetUrlStr(); } } } // put components RSet->PutUrlStr(UrlStr); RSet->PutNextUrlStr(NextUrlStr); RSet->PutQueryStr(QueryStr); RSet->PutAllHits(AllHitsStr.GetInt(-1)); // return return RSet; }
void TTokenizerHtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const { THtmlLx HtmlLx(SIn, false); // traverse html string symbols while (HtmlLx.Sym!=hsyEof){ if (HtmlLx.Sym==hsyStr){ TStr UcStr = HtmlLx.UcChA; // check if stop word if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) { TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA); if (!Stemmer.Empty()) { TokenStr = Stemmer->GetStem(TokenStr); } TokenV.Add(TokenStr.GetLc()); } } // get next symbol HtmlLx.GetSym(); } // // tokenize // TStrV TokenStrV; Tokenizer->GetTokens(TextStr, TokenStrV); // // transform words to IDs // const int Tokens = TokenStrV.Len(); // TIntV TokenIdV(Tokens, 0); // for (int TokenN = 0; TokenN < Tokens; TokenN++) { // // add token to the hashtable of all tokens // const int TokenId = WordH.AddKey(TokenStrV[TokenN].GetUc()); // // keep track of it's count // WordH[TokenId]++; // // and prepare a token vector for ngram base // TokenIdV.Add(TokenId); // } //// extract the n-grams // TNGramDescV NGramDescV; //NGramBs->AddDocTokIdV(TokenIdV, StoreThreshold, NGramDescV); // // get string representations of n-grams above threshold // TStrH NGramH; // for (int NGramDescN = 0; NGramDescN < NGramDescV.Len(); NGramDescN++) { // const TNGramDesc& NGramDesc = NGramDescV[NGramDescN]; // // make it into a string // const TIntV& NGramTokenIdV = NGramDesc.TokIdV; // TChA NGramChA = WordH.GetKey(NGramTokenIdV[0]); // for (int NGramTokenIdN = 1; NGramTokenIdN < NGramTokenIdV.Len(); NGramTokenIdN++) { // NGramChA += ' '; NGramChA += WordH.GetKey(NGramTokenIdV[NGramTokenIdN]); // } // // remember the ngram, if not stopword // if (!SwSet->IsIn(NGramChA)) { NGramH.AddDat(NGramChA); } // } // // remember n-grams above threshold // int NGramKeyId = NGramH.FFirstKeyId(); // while (NGramH.FNextKeyId(NGramKeyId)) { // const TStr& NGramStr = NGramH.GetKey(NGramKeyId); // // add to the result list // ConceptV.Add(TOgNewsConcept(NGramStr, EmtpyStr)); // } }
void TFtrGenToken::GetTokenV(const TStr& Str, TStrV& TokenStrV) const { THtmlLx HtmlLx(TStrIn::New(Str)); while (HtmlLx.Sym != hsyEof){ if (HtmlLx.Sym == hsyStr){ TStr TokenStr = HtmlLx.UcChA; if (SwSet.Empty() || !SwSet->IsIn(TokenStr)) { if (!Stemmer.Empty()) { TokenStr = Stemmer->GetStem(TokenStr); } TokenStrV.Add(TokenStr); } } // get next symbol HtmlLx.GetSym(); } }
void THtml::GetTokens(const PSIn& SIn, TStrV& TokenV) const { THtmlLx HtmlLx(SIn, false); // traverse html string symbols while (HtmlLx.Sym!=hsyEof){ if (HtmlLx.Sym==hsyStr){ TStr UcStr = HtmlLx.UcChA; // check if stop word if ((SwSet.Empty()) || (!SwSet->IsIn(UcStr))) { TStr TokenStr = ToUcP ? UcStr : TStr(HtmlLx.ChA); if (!Stemmer.Empty()) { TokenStr = Stemmer->GetStem(TokenStr); } TokenV.Add(TokenStr.GetLc()); } } // get next symbol HtmlLx.GetSym(); } }
void TNytNGramBs::GetNGramStrV(const TStr& HtmlStr, TStrV& NGramStrV){ NGramStrV.Clr(); // prepare html parsing PSIn HtmlSIn=TStrIn::New(HtmlStr); THtmlLx HtmlLx(HtmlSIn); // process text TStrV StrV; while (HtmlLx.GetSym()!=hsyEof){ TStr Str=HtmlLx.ChA; Str.ToLc(); switch (HtmlLx.Sym){ case hsyUndef: case hsyUrl: case hsyMTag: case hsySSym: StrV.Clr(); break; case hsyStr: case hsyNum: NGramStrV.Add(Str); StrV.Add(Str); for (int NGramLen=2; NGramLen<=4; NGramLen++){ if (StrV.Len()<NGramLen){break;} TStrV TermStrV(NGramLen, 0); for (int StrN=StrV.Len()-NGramLen; StrN<StrV.Len(); StrN++){ TermStrV.Add(StrV[StrN]); } int NGramId; if (IsNGram(TermStrV, NGramId)){ TStr NGramStr=GetNGramStr(NGramId); NGramStrV.Add(NGramStr); } } break; case hsyBTag: case hsyETag: StrV.Clr(); break; case hsyEof: break; default: Fail; } } }
// Builds an item description by scraping a fetched product page.
//   WebPg - fetched page; its URL supplies the item id and the base for
//           resolving relative links, its HTTP body is the HTML to scan.
// Returns a new item holding the item id, title, author names and the ids of
// the cross-sell items linked from the page.
//
// The scan is strictly sequential over the lexer's symbol stream:
//   1. skip to the first <FONT FACE="verdana,arial,helvetica"> without SIZE,
//   2. the title is everything up to the matching </FONT>,
//   3. authors are the texts of the anchors up to the next </FONT>,
//   4. skip to the consecutive words "Customers who bought",
//   5. cross-sell items are the anchors up to the next </UL>.
// Each stage simply stops if the input ends early, leaving later fields empty.
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  THtmlLxSym Sym; TChA ChA;

  // move to title
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }

  // extract title: concatenate symbols, re-inserting the whitespace that
  // preceded each one (but not before the first)
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;

  // extract authors: one name per anchor, until the enclosing </FONT>
  TStrV AuthorNmV; TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // collect the anchor text up to </A>; the loop must also stop at end
      // of input, otherwise a page ending inside an anchor never terminates
      bool AnchorClosedP=false;
      while (HtmlLx.GetSym()!=hsyEof){
        Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if ((Sym==hsyETag)&&(ChA=="<A>")){AnchorClosedP=true; break;}
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      }
      // input ended inside the anchor: the collected text is not a
      // delimited name, so discard it and stop looking for authors
      if (!AnchorClosedP){break;}
      AuthorNmV.Add(AuthorNmChA);
      AuthorNmChA.Clr();
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }

  // move to x-sell: look for three consecutive string symbols
  // "Customers" "who" "bought"; any non-string symbol resets the window
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      PrevStrQ.Clr();
    }
  }

  // extract x-sell pointers: every anchor up to </UL> contributes the item
  // id derived from its (base-resolved) HREF
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      // NOTE(review): Url is not checked with IsOk() before use; confirm
      // that GetItemId copes with an invalid URL
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }

  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
///////////////////////////////////////////////// // Google-Scholar-Result-Set PGgSchRSet TGgSchRSet::NewScholar(const TStr& UrlStr, const TStr& HtmlStr){ // prepare object PGgSchRSet RSet=TGgSchRSet::New(); // prepare html browsing PSIn HtmlSIn=TStrIn::New(HtmlStr); THtmlLx HtmlLx(HtmlSIn); // extract header info HtmlLx.MoveToStrOrEof("Results"); TStr FromResultStr=HtmlLx.GetStrInTag("<B>", true); TStr ToResultStr=HtmlLx.GetStrInTag("<B>", true); TStr AllHitsStr=HtmlLx.GetStrInTag("<B>", true); AllHitsStr.DelChAll(','); TStr QueryStr=HtmlLx.GetStrInTag("<B>", true); // traverse hits HtmlLx.MoveToBTagOrEof("<P>"); forever { if (!((HtmlLx.Sym==hsyBTag)&&(HtmlLx.UcChA=="<P>"))){break;} HtmlLx.GetSym(); if (HtmlLx.Sym==hsyBTag){ TStr FullBTagStr=HtmlLx.GetFullBTagStr(); if (FullBTagStr=="<FONT SIZE=\"-2\">"){ TStr PubTypeNm=HtmlLx.GetStrInTag("<B>", true); } else if (FullBTagStr=="<SPAN CLASS=\"w\">"){ } else { break; } TStr TitleStr=HtmlLx.GetStrToBTag("<BR>", true).GetTrunc(); if (TitleStr.IsPrefix("[PS] ")){ TitleStr=TitleStr.GetSubStr(5, TitleStr.Len()).GetTrunc();} TStr AuthNmVPubStr=HtmlLx.GetStrToBTag("<BR>", true); TStrV AuthNmV; TStr PubNm; TStr PubYearStr; TGgSchRef::GetAuthNmVPubStr(AuthNmVPubStr, AuthNmV, PubNm, PubYearStr); TStr CitedByUrlStr; int Citations=0; HtmlLx.MoveToBTag3OrEof("<A>", "<P>", "<DIV>"); if ((HtmlLx.Sym==hsyBTag)&&(HtmlLx.ChA=="<A>")){ TStr CitedByRelUrlStr=HtmlLx.GetArg("HREF"); TStr AStr=HtmlLx.GetStrToETag("<A>", true); if (AStr.IsPrefix("Cited by ")){ PUrl CitedByUrl=TUrl::New(CitedByRelUrlStr, UrlStr); if (CitedByUrl->IsOk()){ CitedByUrlStr=CitedByUrl->GetUrlStr(); Citations=AStr.GetSubStr(TStr("Cited by ").Len(), AStr.Len()).GetInt(0); } } HtmlLx.MoveToBTag2OrEof("<P>", "<DIV>"); } PGgSchRef Ref= TGgSchRef::New(TitleStr, AuthNmV, PubNm, PubYearStr, Citations, CitedByUrlStr); RSet->AddHit(Ref); //printf("%4s - Cit %d - %s (Auth %d)\n", // PubYearStr.CStr(), Citations, TitleStr.CStr(), AuthNmV.Len()); } else { break; } 
//RSet->AddHit(HitUrlStr, HitTitleStr, HitSrcNm, HitCtxStr); } // extract footer info TStr NextUrlStr; if ((HtmlLx.Sym==hsyBTag)&&(HtmlLx.UcChA=="<DIV>")){ TStr NextRelUrlStr=HtmlLx.GetHRefBeforeStr("Next"); if (!NextRelUrlStr.Empty()){ PUrl NextUrl=TUrl::New(NextRelUrlStr, UrlStr); if (NextUrl->IsOk()){ NextUrlStr=NextUrl->GetUrlStr(); } } } // put components RSet->PutUrlStr(UrlStr); RSet->PutNextUrlStr(NextUrlStr); RSet->PutQueryStr(QueryStr); RSet->PutAllHits(AllHitsStr.GetInt(-1)); // return return RSet; }
void TNGramBs::GetNGramIdV( const TStr& HtmlStr, TIntV& NGramIdV, TIntPrV& NGramBEChXPrV) const { // create MxNGramLen queues TVec<TIntQ> WIdQV(MxNGramLen); TVec<TIntPrQ> BEChXPrQV(MxNGramLen); for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ WIdQV[NGramLen].Gen(100*NGramLen, NGramLen+1); BEChXPrQV[NGramLen].Gen(100*NGramLen, NGramLen+1); } bool AllWIdQClrP=true; // extract words from text-string PSIn HtmlSIn=TStrIn::New(HtmlStr, false); THtmlLx HtmlLx(HtmlSIn); while (HtmlLx.Sym!=hsyEof){ if ((HtmlLx.Sym==hsyStr)||(HtmlLx.Sym==hsyNum)){ // get word-string & word-id TStr WordStr=HtmlLx.UcChA; int WId; int SymBChX=HtmlLx.SymBChX; int SymEChX=HtmlLx.SymEChX; if ((SwSet.Empty())||(!SwSet->IsIn(WordStr))){ if (!Stemmer.Empty()){ WordStr=Stemmer->GetStem(WordStr);} if (IsWord(WordStr, WId)){ if (!IsSkipWord(WId)){ NGramIdV.Add(0+WId); // add single word NGramBEChXPrV.Add(TIntPr(SymBChX, SymEChX)); // add positions for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; WIdQ.Push(WId); BEChXPrQ.Push(TIntPr(SymBChX, SymEChX)); AllWIdQClrP=false; // if queue full if (WIdQ.Len()==NGramLen+1){ // create sequence TIntV WIdV; WIdQ.GetSubValVec(0, WIdQ.Len()-1, WIdV); TIntPrV BEChXPrV; BEChXPrQ.GetSubValVec(0, BEChXPrQ.Len()-1, BEChXPrV); // add ngram-id or reset queues int WIdVP; if (WIdVToFqH.IsKey(WIdV, WIdVP)){ // if sequence is frequent int NGramId=GetWords()+WIdVP; // get sequence ngram-id NGramIdV.Add(NGramId); // add sequence ngram-id NGramBEChXPrV.Add(TIntPr(BEChXPrV[0].Val1, BEChXPrV.Last().Val2)); // add positions } } } } } else { // break queue sequences if infrequent word occures if (!AllWIdQClrP){ for (int NGramLen=1; NGramLen<MxNGramLen; NGramLen++){ TIntQ& WIdQ=WIdQV[NGramLen]; TIntPrQ& BEChXPrQ=BEChXPrQV[NGramLen]; if (!WIdQ.Empty()){WIdQ.Clr(); BEChXPrQ.Clr();} } AllWIdQClrP=true; } } } } // get next symbol HtmlLx.GetSym(); } }