// Parse strings of the form 2006-08-28 14:11:16 or 14:11:16 08/28/2008 // Non-numeric characters act as separators (there can be many consecutive separating characters) // Variables give indexes of the date fields TSecTm TSecTm::GetDtTmFromStr(const TChA& YmdHmsPmStr, const int& YearId, const int& MonId, const int& DayId, const int& HourId, const int& MinId, const int& SecId) { TChA Tmp = YmdHmsPmStr; TVec<char *> FldV; // get the sequences of numbers for (char *c = (char *) Tmp.CStr(); *c; c++) { if (TCh::IsNum(*c)) { FldV.Add(c); while (TCh::IsNum(*c)) { c++; } c--; } else { *c = 0; } } const int Y = atoi(FldV[YearId]); const int M = atoi(FldV[MonId]); const int D = atoi(FldV[DayId]); const int H = atoi(FldV[HourId]); const int m = atoi(FldV[MinId]); const int S = atoi(FldV[SecId]); IAssert(Y>0 && M>0 && D>0 && M<13 && D<32); IAssert(H>=0 && H<24 && m>=0 && m<60 && S>=0 && S<60); return TSecTm(Y,M,D,H,m,S); }
PSs TSs::LoadTxt( const TSsFmt& SsFmt, const TStr& FNm, const PNotify& Notify, const bool& IsExcelEoln, const int& MxY, const TIntV& AllowedColNV, const bool& IsQStr) { TNotify::OnNotify(Notify, ntInfo, TStr("Loading File ")+FNm+" ..."); PSIn SIn=TFIn::New(FNm); PSs Ss=TSs::New(); if (!SIn->Eof()) { int X=0; int Y=0; int PrevX=-1; int PrevY=-1; char Ch=SIn->GetCh(); TChA ChA; while (!SIn->Eof()) { // compose value ChA.Clr(); if (IsQStr&&(Ch=='"')) { // quoted string ('""' sequence means '"') Ch=SIn->GetCh(); forever { while ((!SIn->Eof())&&(Ch!='"')) { ChA+=Ch; Ch=SIn->GetCh(); } if (Ch=='"') { Ch=SIn->GetCh(); if (Ch=='"') { ChA+=Ch; Ch=SIn->GetCh(); } else { break; } } } } else { if (SsFmt==ssfTabSep) {
void TTokenizerUtil::Paragraphize(const PSIn& SIn, TStrV& Paragraphs) { TChA ParagraphBuf; int c; bool wasSpace = false; while (!SIn->Eof()) { c = SIn->GetCh(); // two consecutive spaces signal a new paragraph if (c == ' ' || c == '\t' || c == '\n') { if (wasSpace) { Paragraphs.Add(ParagraphBuf); ParagraphBuf.Clr(); continue; } wasSpace = true; } else { wasSpace = false; } ParagraphBuf += c; } if (ParagraphBuf.Len() > 0) { Paragraphs.Add(ParagraphBuf); } }
// http://www.ijs.si/fdfd/blah.html --> www.ijs.si TChA TStrUtil::GetDomNm(const TChA& UrlChA) { int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http:// if (EndSlash > 0) { const int BegSlash = UrlChA.SearchChBack('/', EndSlash); if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); } else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); } } else { if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); } EndSlash = UrlChA.SearchCh('/', 0); if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); } else { return TChA(UrlChA).ToLc(); } } }
// Loads documents from the line-document text file LnDocFNm into BowDocBs, one
// document per line, and collects the ids returned by AddHtmlDoc in NewDIdV.
// Line layout as read below: [name] {!category} text
//   - NamedP: when true the first space-delimited token of the line is the
//     document name; otherwise the running document counter is used as the name.
//   - tokens starting with '!' that follow are category names.
//   - the remainder of the line (up to '\r' or '\n') is the document text.
// MxDocs==-1 means no limit; otherwise the loop stops as soon as the running
// counter Docs reaches MxDocs. SaveDocP is passed through to AddHtmlDoc.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr();
  TFIn FIn(LnDocFNm);
  char Ch=' '; // one-character lookahead shared by all parsing stages below
  int Docs=0;
  while (!FIn.Eof()){
    Docs++;
    if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    printf("%d\r", Docs); // progress counter on a single console line
    // document name
    TChA DocNm;
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name is not counted as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
      DocNm = TInt::GetStr(Docs);
    }
    // categories
    TStrV CatNmV;
    forever {
      // skip the spaces in front of the next token
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break; // first token not starting with '!' begins the document text
      }
    }
    // document text
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    // NOTE(review): when NamedP is false DocNm is never empty, so this test
    // cannot fire in that mode - confirm whether that is intended.
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // return document-base
  BowDocBs->AssertOk();
  printf("\n");
}
const char* TJsonObj::ParseArrayVal(const char* JsonStr) { const char *c = JsonStr; bool Nested = false; TChA ValStr; Clr(); while (*c && TCh::IsWs(*c)) { c++; } if (*c == '"') { c = GetStr(c, ValStr); } // string else if (TCh::IsNum(*c) || (*c=='-' && TCh::IsNum(*(c+1)))) { // number while (*c && *c!=',' && *c!='}' && *c!=']' && ! TCh::IsWs(*c)) { ValStr.Push(*c); c++; } } else if (*c=='t' || *c=='f' || *c=='n') { // true, false, null while (*c && *c!=',' && *c!='}' && *c!=']') { ValStr.Push(*c); c++; } } else if (*c=='{') { // nested object EAssertR(! KeyArrayH.IsKey("key"), "JSON error: object with key 'key' already exists"); TJsonObj& Obj = KeyObjH.AddDat("key"); c = Obj.Parse(c) + 1; Nested = true; } else if (*c=='[') { // array EAssertR(! KeyArrayH.IsKey("key"), "JSON error: array with key 'key' already exists"); TVec<TJsonObj>& Array = KeyArrayH.AddDat("key"); c++; while (*c && *c!=']') { while (*c && TCh::IsWs(*c)) { c++; } Array.Add(); if (*c=='{') { c = Array.Last().Parse(c) + 1; } // nested object else { c = Array.Last().ParseArrayVal(c); } if (*c && *c==',') { c++; } } c++; Nested = true; } if (! Nested) { EAssertR(! KeyArrayH.IsKey("key"), "JSON error: object with key 'key' already exists"); KeyValH.AddDat("key", ValStr); } while (*c && TCh::IsWs(*c)) { c++; } return c; }
// remove ending /, /index.html, etc. and strip starting www. bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) { UrlOut = UrlIn; if (StripEnd(UrlIn, "/", UrlOut)) {} else if (StripEnd(UrlIn, "/index.html", UrlOut)) {} else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {} else if (StripEnd(UrlIn, "/index.php", UrlOut)) {} if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) { // if UrlIn is relative url, try combine it with BaseUrl if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) { //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr()); return false; } TChA Out; if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; } if (UrlIn[0] != '/') { Out.AddCh('/'); } Out += UrlOut; UrlOut = Out; } // http://www. --> http:// if (UrlOut.IsPrefix("http://www.")) { UrlOut = "http://"+UrlOut.GetSubStr(11, TInt::Mx); } UrlOut.ToLc(); return true; }
// Splits ChA into lines IN PLACE: every '\n' (and a '\r' directly before it) is
// overwritten with 0 and LineV receives a pointer to the start of each line, so
// the pointers stay valid only as long as ChA is neither changed nor destroyed.
// With SkipEmpty set, lines of zero length are left out of LineV; otherwise
// every line is reported, including the (possibly empty) text after the last '\n'.
// Returns the number of lines stored in LineV.
int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
  LineV.Clr(false);
  char *TxtBeg = (char *) ChA.CStr();
  char *LineBeg = TxtBeg; // start of the line currently being scanned
  for (char *c = TxtBeg; *c; c++) {
    if (*c != '\n') { continue; }
    if (c > TxtBeg && *(c-1)=='\r') {
      *(c-1)=0; // \r\n
    }
    *c=0;
    // the line is complete and terminated, so its emptiness is known now
    if (!SkipEmpty || *LineBeg != 0) { LineV.Add(LineBeg); }
    LineBeg = c+1;
  }
  // text after the last '\n' (or the whole text when there is no '\n')
  if (!SkipEmpty || *LineBeg != 0) { LineV.Add(LineBeg); }
  return LineV.Len();
}
/////////////////////////////// // Tokenizer-Utils void TTokenizerUtil::Sentencize(const PSIn& SIn, TStrV& Sentences, const bool& SplitNewLineP) { TChA SentenceBuf; int c; while (!SIn->Eof()) { c = SIn->GetCh(); switch (c) { case '\r': case '\n': { if (!SplitNewLineP) { SentenceBuf += ' '; break; } } case '"' : case '.' : case '!' : case ':' : case ';' : case '?' : case '\t': { if (SentenceBuf.Len() > 2) { Sentences.Add(SentenceBuf); printf("%s\n", SentenceBuf.CStr()); SentenceBuf.Clr(); } break; } default: SentenceBuf += c; break; } } if (SentenceBuf.Len() > 0) { Sentences.Add(SentenceBuf); } }
// get <TagNm>*</TagNm> (can be many tags inbetween bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) { if (XmlLx.GetSym() != xsySTag) { return false; } TagVal.Clr(); TagNm = XmlLx.TagNm; //const TXmlLxSym NextSym = XmlLx.GetSym(); while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) { if (TakeTagNms) { TagVal += XmlLx.TxtChA; } else if (XmlLx.Sym == xsyStr) { TagVal += XmlLx.TxtChA; } XmlLx.GetSym(); } return true; //if (NextSym == xsyStr) { // EAssertR(XmlLx.GetSym() == xsyETag, TagNm); //} else { // EAssertR(NextSym == xsyETag, TagNm); // empty tag //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr()); //} }
// Returns a copy of ChA in which the byte values 232, 200, 154, 138, 158 and
// 142 are replaced by the plain letters c, C, s, S, z and Z; every other
// character is copied unchanged.
TStr THtmlLxChDef::GetCSZFromWin1250(const TChA& ChA){
  // SrcChV[i] is replaced by DstChV[i]
  static const unsigned char SrcChV[]={232, 200, 154, 138, 158, 142};
  static const char DstChV[]={'c', 'C', 's', 'S', 'z', 'Z'};
  const int Maps=int(sizeof(SrcChV)/sizeof(SrcChV[0]));
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const unsigned char Ch=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChV[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){
      DstChA+=DstChV[MapN];
    } else {
      DstChA+=Ch;
    }
  }
  return DstChA;
}
// Returns true when the fraction of non-whitespace characters of Str that are
// alphanumeric with a positive char value is strictly greater than MinAlFrac.
// Whitespace is ignored both in the numerator and in the denominator.
bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
  int LatinChs = 0;  // alphanumeric characters with positive char value
  int NonWsChs = 0;  // all characters that are not whitespace
  const char *ChP = Str.CStr();
  while (*ChP) {
    const char Ch = *ChP++;
    if (TCh::IsWs(Ch)) { continue; }
    NonWsChs++;
    if (Ch > 0 && TCh::IsAlNum(Ch)) { LatinChs++; }
  }
  return double(LatinChs)/double(NonWsChs) > MinAlFrac;
}
// Parses "<year><DateSepCh><month><DateSepCh><day> <hour><TimeSepCh><min><TimeSepCh><sec>".
// The date part is separated from the time part by a single space. When the
// hour field is not an integer (e.g. the time part is missing) the time is
// taken as 00:00:00.
TSecTm TSecTm::GetDtTmFromYmdHmsStr(const TStr& YmdHmsPmStr,
 const char& DateSepCh, const char& TimeSepCh){
  const int StrLen=YmdHmsPmStr.Len();
  int ChN=0;
  TChA FldChA;
  // year: up to the first date separator
  for (; (ChN<StrLen)&&(YmdHmsPmStr[ChN]!=DateSepCh); ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr YearStr=FldChA;
  // month: up to the second date separator (ChN++ skips the separator)
  FldChA.Clr();
  for (ChN++; (ChN<StrLen)&&(YmdHmsPmStr[ChN]!=DateSepCh); ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr MonthStr=FldChA;
  // day: up to the space between date and time
  FldChA.Clr();
  for (ChN++; (ChN<StrLen)&&(YmdHmsPmStr[ChN]!=' '); ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr DayStr=FldChA;
  // hour: up to the first time separator
  FldChA.Clr();
  for (ChN++; (ChN<StrLen)&&(YmdHmsPmStr[ChN]!=TimeSepCh); ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr HourStr=FldChA;
  // minute: up to the second time separator
  FldChA.Clr();
  for (ChN++; (ChN<StrLen)&&(YmdHmsPmStr[ChN]!=TimeSepCh); ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr MinStr=FldChA;
  // second: the rest of the string
  FldChA.Clr();
  for (ChN++; ChN<StrLen; ChN++){
    FldChA+=YmdHmsPmStr[ChN];}
  TStr SecStr=FldChA;
  // transform to numbers
  int MonthN=MonthStr.GetInt();
  int DayN=DayStr.GetInt();
  int YearN=YearStr.GetInt();
  int HourN=0; int MinN=0; int SecN=0;
  if (HourStr.IsInt()){
    HourN=HourStr.GetInt();
    MinN=MinStr.GetInt();
    SecN=SecStr.GetInt();
  }
  // construct the time: midnight of the date plus the time-of-day offset
  TSecTm Tm=TSecTm::GetDtTm(YearN, MonthN, DayN);
  Tm.AddHours(HourN);
  Tm.AddMins(MinN);
  Tm.AddSecs(SecN);
  return Tm;
}
// Checks whether the input, starting with the current character Ch, matches the
// shape of an HTTP response status line as given by the mould "http/N.N NNN ",
// where 'N' stands for any digit and every other mould character must match the
// lower-cased input exactly.
// The characters needed for the test are read with GetCh() and then handed back
// with PutCh().
// NOTE(review): PutCh presumably pushes characters back onto the input so the
// lexer can re-read them - confirm against THttpLx::PutCh. The characters
// handed back come from TestChA, i.e. in their lower-cased form.
bool THttpLx::IsRespStatusLn(){
  static const TChA MouldChA="http/N.N NNN ";
  TChA TestChA(MouldChA);
  int TestLen=TestChA.Len();
  // not enough input left (current character plus Len()) to hold the mould
  if (1+Len()<TestLen){return false;}
  // position 0 is the current character, the rest is read from the input
  TestChA.PutCh(0, ChDef.GetLcCh(Ch));
  {for (int ChN=1; ChN<TestLen; ChN++){
    TestChA.PutCh(ChN, ChDef.GetLcCh(GetCh()));}}
  // hand the characters back, walking TestChA from index TestLen-2 down to 0
  {for (int ChN=1; ChN<TestLen; ChN++){
    PutCh(TestChA[TestLen-ChN-1]);}}
  // compare the collected characters with the mould
  {for (int ChN=0; ChN<MouldChA.Len(); ChN++){
    if (MouldChA[ChN]=='N'){
      if (!ChDef.IsDigit(TestChA[ChN])){return false;}
    } else {
      if (MouldChA[ChN]!=TestChA[ChN]){return false;}
    }
  }}
  return true;
}
// Returns a copy of ChA in which the punctuation characters
//   ~ ^ } ] | \ { [ ` @
// are replaced by the plain letters
//   c C c C d D s S z Z
// respectively; every other character is copied unchanged.
TStr THtmlLxChDef::GetCSZFromYuascii(const TChA& ChA){
  // SrcChV[i] is replaced by DstChV[i]
  static const char SrcChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const char DstChV[]={'c', 'C', 'c', 'C', 'd', 'D', 's', 'S', 'z', 'Z'};
  const int Maps=int(sizeof(SrcChV)/sizeof(SrcChV[0]));
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char Ch=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChV[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){
      DstChA+=DstChV[MapN];
    } else {
      DstChA+=Ch;
    }
  }
  return DstChA;
}
// Returns a copy of ChA in which the punctuation characters
//   ~ ^ } ] | \ { [ ` @
// are replaced by the byte values / letters
//   232 200 'c' 'C' 'd' 'D' 154 138 158 142
// respectively; every other character is copied unchanged.
TStr THtmlLxChDef::GetWin1250FromYuascii(const TChA& ChA){
  // SrcChV[i] is replaced by DstChV[i]
  static const char SrcChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const uchar DstChV[]={232, 200, 'c', 'C', 'd', 'D', 154, 138, 158, 142};
  const int Maps=int(sizeof(SrcChV)/sizeof(SrcChV[0]));
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char Ch=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChV[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){
      DstChA+=DstChV[MapN];
    } else {
      DstChA+=Ch;
    }
  }
  return DstChA;
}
// Returns a copy of ChA in which the punctuation characters
//   ~ ^ } ] | \ { [ ` @
// are replaced by the byte values
//   232 200 230 198 240 208 185 169 190 174
// respectively; every other character is copied unchanged.
TStr THtmlLxChDef::GetIsoCeFromYuascii(const TChA& ChA){
  // SrcChV[i] is replaced by DstChV[i]
  static const char SrcChV[]={'~', '^', '}', ']', '|', '\\', '{', '[', '`', '@'};
  static const uchar DstChV[]={232, 200, 230, 198, 240, 208, 185, 169, 190, 174};
  const int Maps=int(sizeof(SrcChV)/sizeof(SrcChV[0]));
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char Ch=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChV[MapN]!=Ch)){MapN++;}
    if (MapN<Maps){
      DstChA+=DstChV[MapN];
    } else {
      DstChA+=Ch;
    }
  }
  return DstChA;
}
/// For every quote, add it to corresponding bucket for each hashed x-character shingle of the quote // (Shingles by characters) void LSH::HashShingles(TQuoteBase *QuoteBase, TClusterBase *CB, TInt ShingleLen, THash<TMd5Sig, TShingleIdSet>& ShingleToQuoteIds) { Err("Hashing shingles...\n"); TIntV QuoteIds; QuoteBase->GetAllQuoteIds(QuoteIds); for (int qt = 0; qt < QuoteIds.Len(); qt++) { if (qt % 1000 == 0) { fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len()); } if (CB->IsQuoteInArchivedCluster(QuoteIds[qt])) continue; TQuote Q; QuoteBase->GetQuote(QuoteIds[qt], Q); // Put x-character (or x-word) shingles into hash table; x is specified by ShingleLen parameter TStr QContentStr; Q.GetParsedContentString(QContentStr); TChA QContentChA = TChA(QContentStr); int CurWord = 0; for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) { TChA ShingleChA = TChA(); for (int j = 0; j < ShingleLen; j++) { ShingleChA.AddCh(QContentChA.GetCh(i + j)); } TStr Shingle = TStr(ShingleChA); const TMd5Sig ShingleMd5(Shingle); TShingleIdSet ShingleQuoteIds; if (ShingleToQuoteIds.IsKey(ShingleMd5)) { ShingleQuoteIds = ShingleToQuoteIds.GetDat(ShingleMd5); } for (int j = CurWord; j > CurWord - WordWindow && j >= 0; j--) { ShingleQuoteIds.AddKey(TShingleId(QuoteIds[qt], j)); } ShingleToQuoteIds.AddDat(ShingleMd5, ShingleQuoteIds); // up the current word index if we see a space if (QContentChA.GetCh(i + ShingleLen - 1) == ' ') { CurWord++; } } } Err("Done hashing!\n"); }
// Returns a copy of Str in which every character has been passed through GetUc.
TStr TLxChDef::GetUcStr(const TStr& Str) const {
  const int Chs=Str.Len();
  TChA UcChA;
  int ChN=0;
  while (ChN<Chs){
    UcChA.AddCh(GetUc(Str.GetCh(ChN)));
    ChN++;
  }
  return UcChA;
}
// Returns LongStr unchanged when it has at most MaxLen characters; otherwise
// returns its first MaxLen characters followed by "..." (so a shortened result
// is MaxLen+3 characters long).
TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
  // '<=': a string of exactly MaxLen characters loses nothing, so it must not
  // be marked as shortened
  if (LongStr.Len() <= MaxLen) { return LongStr; }
  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
  Str += "...";
  return Str;
}
int TStrUtil::CountWords(const TChA& ChA) { return CountWords(ChA.CStr()); }
// get website (GetDomNm2 or blog url) TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) { TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr); // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539 if (DomNm == "blog.myspace.com") { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1); } // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx // http://ameblo.jp/baptism/entry-10126216277.html // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo // http://blogs.zdnet.com/hardware/?p=2391 // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php // 
http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html // http://blog.tv2.dk/ole.mork/entry254689.html // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co" || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com" || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk" || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); } // http://digg.com/submit?phase=2&url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&title=and // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show if (DomNm == "digg.com") { if (PostUrlStr.IsPrefix("http://digg.com/submit?")) { const int Url = PostUrlStr.SearchStr(";url="); if (Url != -1) { return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); } } else { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); } } // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html // 
http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/") || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); } // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640 if (DomNm=="feeds.feedburner.com") { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); } // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c if (DomNm=="groups.google.com") { return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); } // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa if (DomNm=="news.google.com") { // redirect const int UrlPos = PostUrlStr.SearchStr("&url="); if (UrlPos != -1) { return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); } } // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de if (DomNm == "bloggrevyen.no") { // redirect const int Http2 = PostUrlStr.SearchStr("/http://"); if (Http2!=-1) { return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); } } //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha if (DomNm.IsSuffix(".rd.yahoo.com")) { const int Http2 = PostUrlStr.SearchStr("/*"); if (Http2!=-1) { return 
GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); } } return DomNm; }
// Writes the characters of ChA to the stream. The line-length bookkeeping is
// updated first; the result is the sum of what UpdateLnLen and PutBf return.
int TSOut::PutStr(const TChA& ChA){
  const int LnCs=UpdateLnLen(ChA.Len());
  const int BfCs=PutBf(ChA.CStr(), ChA.Len());
  return LnCs+BfCs;
}
// get domain name and also strip starting www. TChA TStrUtil::GetDomNm2(const TChA& UrlChA) { TChA Dom = GetDomNm(UrlChA); if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); } else { return Dom; } }
// Stems the word s: the word is copied into a scratch buffer, upper-cased and
// handed to StemInPlace, whose result is returned. s itself is not modified.
TStr TPorterStemmer::Stem(const TStr& s) {
  TChA WordChA = s;
  WordChA.ToUc();
  char *WordBf = WordChA.CStr();
  return StemInPlace(WordBf);
}
/////////////////////////////////////////////////
// Roget-Base

// Walks all pages of the web-base stored at WebBaseFPath and parses those whose
// URL contains "RG.sh". For each such page the HTML tokens are consumed in order:
//   1. skip to the first <h2> begin tag,
//   2. read the category header "ddd[A|B]." and the text up to </h2> into CtgNm
//      (text in square brackets is collected separately and appended as " [..]"),
//   3. report CtgNm through Notify,
//   4. scan the word list that follows until the "PHR" marker or the last token.
// Deviations from the expected token sequence trigger IAssert.
// NOTE(review): CtgIdNm and the collected WordStr values are built but not
// stored anywhere in the code visible here - confirm whether that is intended.
void TRBase::LoadArtfl(const TStr& WebBaseFPath){
  PWebBase WebBase=PWebBase(new TWebMemBase(WebBaseFPath));
  int WebPgP=WebBase->FFirstWebPg(); int WebPgId;
  while (WebBase->FNextWebPg(WebPgP, WebPgId)){
    TStr UrlStr=WebBase->GetUrlStr(WebPgId);
    // only pages whose URL contains "RG.sh" are parsed
    static TStr RgShStr="RG.sh";
    if (!UrlStr.IsStrIn(RgShStr)){continue;}
    // if (!UrlStr.IsStrIn("RG.sh?^544\\")){continue;}
    PWebPg WebPg=WebBase->GetWebPg(WebPgId);
    PSIn SIn=TStrIn::New(WebPg->GetBodyStr());
    PHtmlDoc HtmlDoc=THtmlDoc::New(SIn, hdtAll);
    // TokN is the index of the next token to fetch; Sym/Str hold the current token
    int TokN=0; PHtmlTok Tok; THtmlLxSym Sym; TStr Str;
    // move to <h2>
    do {HtmlDoc->GetTok(TokN++, Sym, Str);
    } while (!((Sym==hlsyBTag)&&(Str==THtmlTok::H2TagNm)));
    // parse "ddd[A|B]."
    TChA CtgNm; TChA CtgIdNm;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    IAssert(Sym==hlsyNum); CtgNm+=Str; CtgIdNm+=Str;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    // optional letter suffix of the category number
    if (Sym==hlsyStr){
      IAssert((Str=='A')||(Str=='B')); CtgNm+=Str; CtgIdNm+=Str;
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    IAssert((Sym==hlsySSym)&&(Str=='.')); CtgNm+=Str;
    // parse to </h2>"
    TChA BracketStr;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    while (!((Sym==hlsyETag)&&(Str==THtmlTok::H2TagNm))){
      if ((Sym==hlsySSym)&&(Str=='[')){
        // collect everything up to the closing ']' into BracketStr
        HtmlDoc->GetTok(TokN++, Sym, Str);
        while (!((Sym==hlsySSym)&&(Str==']'))){
          if ((!BracketStr.Empty())&&(Sym==hlsyStr)){BracketStr+=' ';}
          BracketStr+=Str; HtmlDoc->GetTok(TokN++, Sym, Str);
        }
        BracketStr.Ins(0, " ["); BracketStr+=']';
      } else {
        // words are separated by a blank, other symbols are appended directly
        if (Sym==hlsyStr){CtgNm+=' ';}
        CtgNm+=Str;
      }
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    CtgNm+=BracketStr;
    TNotify::OnNotify(Notify, ntInfo, CtgNm);
    // parse words
    static TStr AdjStr="ADJ"; static TStr AdvStr="ADV";
    static TStr IntStr="INT"; static TStr PgStr="PAGE";
    static TStr PhrStr="PHR"; static TStr PrefStr="PREF";
    static TStr PronStr="PRON";
    HtmlDoc->GetTok(TokN++, Sym, Str);
    // the word list has to start with the marker "N" or "ADV"
    IAssert((Sym==hlsyStr)&&((Str=='N')||(Str==AdvStr)));
    while (TokN<HtmlDoc->GetToks()){
      if (Sym==hlsyStr){
        if (Str==PhrStr){break;} // "PHR" ends the word list
        if ((Str!='N')&&(Str!='V')&&(Str!=AdjStr)&&(Str!=AdvStr)&&
         (Str!=IntStr)&&(Str!=PrefStr)&&(Str!=PronStr)){
          // a run of consecutive string tokens forms one (multi-word) entry
          TChA WordStr;
          do {
            if (!WordStr.Empty()){WordStr+=' ';}
            WordStr+=Str;
            HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (Sym==hlsyStr);
          // TNotify::OnNotify(Notify, ntInfo, WordStr);
        } else {
          // marker token (N, V, ADJ, ...) - just move on
          HtmlDoc->GetTok(TokN++, Sym, Str);
        }
      } else
      if (Sym==hlsySSym){
        // an opening bracket or quote: skip everything up to its counterpart
        TStr ExpectStr;
        if (Str=='('){ExpectStr=')';}
        else if (Str=='['){ExpectStr=']';}
        else if (Str=='{'){ExpectStr='}';}
        else if (Str=='"'){ExpectStr='"';}
        if (!ExpectStr.Empty()){
          do {HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (!((Sym==hlsySSym)&&(Str==ExpectStr)));
        }
        HtmlDoc->GetTok(TokN++, Sym, Str);
      } else {
        HtmlDoc->GetTok(TokN++, Sym, Str);
      }
    }
  }
}
bool IsCTxtHttpResp(const PUrl& Url, const PHttpResp& HttpResp, const int& MnCTxtToks){ if (HttpResp->IsStatusCd_Ok()){ PWebPg WebPg=TWebPg::New(Url->GetUrlStr(), HttpResp); if (HttpResp->IsContType(THttp::TextHtmlFldVal)){ TMem BodyMem=HttpResp->GetBodyAsMem(); PSIn BodyMemIn=TMemIn::New(BodyMem); // prepare html-tokens PHtmlDoc HtmlDoc=THtmlDoc::New(BodyMemIn, hdtAll, false); int Toks=HtmlDoc->GetToks(); THtmlLxSym TokSym; TStr TokStr; // prepare continuous-text indicators int CTxtToks=0; TChA CTxtChA; bool CTxtP=false; // prepare script & style flag bool InScript=false; bool InStyle=false; // traverse tokens for (int TokN=0; TokN<Toks; TokN++){ // get token data HtmlDoc->GetTok(TokN, TokSym, TokStr); switch (TokSym){ case hsyStr: case hsyNum: case hsySSym: if (!InScript&&!InStyle){ // text token CTxtToks++; CTxtChA+=TokStr; CTxtChA+=' '; } break; case hsyBTag: if (!InScript&&!InStyle){ if (TokStr=="<SCRIPT>"){ // start of script InScript=true; CTxtToks=0; CTxtChA.Clr(); } else if (TokStr=="<STYLE>"){ // start of style InStyle=true; CTxtToks=0; CTxtChA.Clr(); } else { if ((TokStr=="<P>")||(TokStr=="<B>")||(TokStr=="<I>")){ // skip in-text-tags } else { // non-text-tags - break continuous-text CTxtToks=0; CTxtChA.Clr(); } } } break; case hsyETag: if (InScript||InStyle){ if (TokStr=="<SCRIPT>"){ // end of script InScript=false; } else if (TokStr=="<STYLE>"){ // end of style InStyle=false; } } break; default: // non-text-token - break continuous-text CTxtToks=0; CTxtChA.Clr(); break; } // stop if enough continuous-text if (CTxtToks>MnCTxtToks){ CTxtP=true; break; } } if (CTxtP){ printf("%s\n", Url->GetUrlStr().CStr()); } return CTxtP; } } return false; }
// Builds a TAmazonItem from a downloaded item page by scanning its HTML with a
// single lexer, stage by stage (each stage continues where the previous one
// stopped):
//   1. move to the <FONT> tag with FACE "verdana,arial,helvetica" and no SIZE,
//   2. collect the title up to the closing FONT tag,
//   3. collect author names from the <A> elements up to the next closing FONT tag,
//   4. move to the text "Customers who bought",
//   5. collect the item ids of all <A HREF=...> links up to the closing UL tag.
// NOTE(review): the page layout these stages rely on is assumed, not checked;
// a stage that hits end-of-input simply leaves its result empty.
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  // Sym/ChA mirror the lexer's current symbol and its text
  THtmlLxSym Sym; TChA ChA;
  // move to title
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }
  // extract title
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    // keep the spacing that preceded the symbol, except in front of the first one
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;
  //printf("'%s'\n", TitleStr.CStr());
  // extract authors
  TStrV AuthorNmV; TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // the string symbols inside one <A> element form one author name
      // NOTE(review): this inner loop does not test for hsyEof - confirm that
      // an unterminated <A> element cannot occur or is handled by the lexer.
      do {
        HtmlLx.GetSym();
        Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      } while (!((Sym==hsyETag)&&(ChA=="<A>")));
      AuthorNmV.Add(AuthorNmChA);
      AuthorNmChA.Clr();
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }
  // debugging aid: the print statement is disabled, the loop has no effect
  for (int AuthorNmN=0; AuthorNmN<AuthorNmV.Len(); AuthorNmN++){
    //printf("'%s'\n", AuthorNmV[AuthorNmN].CStr());
  }
  // move to x-sell
  // PrevStrQ holds the most recent consecutive string symbols (created with 3)
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      // any non-string symbol interrupts the phrase
      PrevStrQ.Clr();
    }
  }
  // extract x-sell pointers
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // resolve the link against the page URL and take the item id from it
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }
  // debugging aid: the print statement is disabled, the loop has no effect
  for (int NextItemIdN=0; NextItemIdN<NextItemIdV.Len(); NextItemIdN++){
    //printf("'%s'\n", NextItemIdV[NextItemIdN].CStr());
  }
  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
// Creates a memory input stream over a private copy of the characters of ChA.
// The buffer holds exactly ChA.Len() characters and is filled with strncpy.
TMIn::TMIn(const TChA& ChA):
  TSBase("Input-Memory"), TSIn("Input-Memory"),
  Bf(NULL), BfC(0), BfL(ChA.Len()){
  Bf=new char[BfL];
  strncpy(Bf, ChA.CStr(), BfL);
}
// Builds a bag-of-words document base from the table file FNm (format SsFmtNm).
// Rows from index 1 on become one document each (row 0 is not read as a document).
//   - document name: the value of the id field (IdFldN, or the field named
//     IdFldNm when IdFldN is -1); the row index when no id field is given,
//   - categories: the non-empty values of the fields listed in CatFldNV plus
//     the fields named in CatFldNmV,
//   - text: the values of the fields in TxtFldNV plus those named in
//     TxtFldNmV, joined with " <br> ".
// SwSetTypeNm and StemmerTypeNm select the stop-word set and the stemmer;
// n-grams are generated unless MxNGramLen and MnNGramFq are both 1.
// Recs is not used by this function.
PBowDocBs TBowFl::LoadTabTxt(
 const TStr& FNm, const TStr& SsFmtNm, const int& Recs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq,
 const int& IdFldN, const TStr& IdFldNm,
 const TIntV& CatFldNV, const TStrV& CatFldNmV,
 const TIntV& TxtFldNV, const TStrV& TxtFldNmV){
  // load table-data
  TSsFmt SsFmt=TSs::GetSsFmtFromStr(SsFmtNm);
  PSs Ss=TSs::LoadTxt(SsFmt, FNm);
  // resolve the id-field: a field name is consulted only without a field number
  int DocIdFldN=IdFldN;
  if ((DocIdFldN==-1)&&(!IdFldNm.Empty())){
    DocIdFldN=Ss->GetFldX(IdFldNm);
  }
  // resolve category-fields: given numbers plus the numbers of the given names
  TIntV AllCatFldNV=CatFldNV;
  for (int NmN=0; NmN<CatFldNmV.Len(); NmN++){
    AllCatFldNV.Add(Ss->GetFldX(CatFldNmV[NmN]));
  }
  AllCatFldNV.Sort();
  // resolve text-fields the same way
  TIntV AllTxtFldNV=TxtFldNV;
  for (int NmN=0; NmN<TxtFldNmV.Len(); NmN++){
    AllTxtFldNV.Add(Ss->GetFldX(TxtFldNmV[NmN]));
  }
  AllTxtFldNV.Sort();
  // collect name, categories and text of every document (one per row)
  TStrV DocNmV; TVec<TStrV> CatNmVV; TStrV DocStrV;
  for (int RowN=1; RowN<Ss->GetYLen(); RowN++){
    // document-name
    TStr DocNm;
    if (DocIdFldN!=-1){
      DocNm=Ss->GetVal(DocIdFldN, RowN);
    } else {
      DocNm=TInt::GetStr(RowN);
    }
    DocNmV.Add(DocNm);
    // categories - empty cells are skipped
    TStrV CatNmV;
    for (int FldN=0; FldN<AllCatFldNV.Len(); FldN++){
      TStr CatNm=Ss->GetVal(AllCatFldNV[FldN], RowN);
      if (!CatNm.Empty()){CatNmV.Add(CatNm);}
    }
    CatNmVV.Add(CatNmV);
    // text - a separator goes in front of every field once the text is non-empty
    TChA DocChA;
    for (int FldN=0; FldN<AllTxtFldNV.Len(); FldN++){
      TStr TxtStr=Ss->GetVal(AllTxtFldNV[FldN], RowN);
      if (!DocChA.Empty()){DocChA+=" <br> ";}
      if (!TxtStr.Empty()){DocChA+=TxtStr;}
    }
    DocStrV.Add(DocChA);
  }
  // prepare stop-words and stemmer
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams, not needed for plain single words
  PNGramBs NGramBs;
  if ((MxNGramLen!=1)||(MnNGramFq!=1)){
    NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
     DocStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base and fill it
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  const int Docs=DocNmV.Len();
  for (int DocN=0; DocN<Docs; DocN++){
    BowDocBs->AddHtmlDoc(DocNmV[DocN], CatNmVV[DocN], DocStrV[DocN], true);
  }
  BowDocBs->AssertOk();
  return BowDocBs;
}