Example #1
0
// Parses a date-time from strings such as "2006-08-28 14:11:16" or "14:11:16  08/28/2008".
// Every maximal run of digits is one field; any non-digit character acts as a separator
// (several consecutive separators are allowed).
// YearId, MonId, DayId, HourId, MinId and SecId are the 0-based positions of the
// corresponding fields among the digit runs found in the string.
// Asserts when a requested field is missing or a value is outside its valid range.
TSecTm TSecTm::GetDtTmFromStr(const TChA& YmdHmsPmStr, const int& YearId, const int& MonId,
 const int& DayId, const int& HourId, const int& MinId, const int& SecId) {
  // work on a private copy: separators are overwritten with 0 so that every
  // field becomes its own zero-terminated string
  TChA Tmp = YmdHmsPmStr;
  TVec<char *> FldV;
  // get the sequences of numbers
  for (char *c = (char *) Tmp.CStr(); *c; c++) {
    if (TCh::IsNum(*c)) {
      FldV.Add(c);
      while (TCh::IsNum(*c)) { c++; }
      c--; // step back so the loop increment lands on the character after the run
    } else { *c = 0; }
  }
  // make sure every requested field exists before indexing into FldV
  const int Flds = FldV.Len();
  IAssert(YearId>=0 && YearId<Flds && MonId>=0 && MonId<Flds && DayId>=0 && DayId<Flds);
  IAssert(HourId>=0 && HourId<Flds && MinId>=0 && MinId<Flds && SecId>=0 && SecId<Flds);
  const int Y = atoi(FldV[YearId]);
  const int M = atoi(FldV[MonId]);
  const int D = atoi(FldV[DayId]);
  const int H = atoi(FldV[HourId]);
  const int m = atoi(FldV[MinId]);
  const int S = atoi(FldV[SecId]);
  IAssert(Y>0 && M>0 && D>0 && M<13 && D<32);
  IAssert(H>=0 && H<24 && m>=0 && m<60 && S>=0 && S<60);
  return TSecTm(Y,M,D,H,m,S);
}
Example #2
0
PSs TSs::LoadTxt(
    const TSsFmt& SsFmt, const TStr& FNm,
    const PNotify& Notify, const bool& IsExcelEoln,
    const int& MxY, const TIntV& AllowedColNV, const bool& IsQStr) {
    TNotify::OnNotify(Notify, ntInfo, TStr("Loading File ")+FNm+" ...");
    PSIn SIn=TFIn::New(FNm);
    PSs Ss=TSs::New();
    if (!SIn->Eof()) {
        int X=0;
        int Y=0;
        int PrevX=-1;
        int PrevY=-1;
        char Ch=SIn->GetCh();
        TChA ChA;
        while (!SIn->Eof()) {
            // compose value
            ChA.Clr();
            if (IsQStr&&(Ch=='"')) {
                // quoted string ('""' sequence means '"')
                Ch=SIn->GetCh();
                forever {
                    while ((!SIn->Eof())&&(Ch!='"')) {
                        ChA+=Ch;
                        Ch=SIn->GetCh();
                    }
                    if (Ch=='"') {
                        Ch=SIn->GetCh();
                        if (Ch=='"') {
                            ChA+=Ch;
                            Ch=SIn->GetCh();
                        }
                        else {
                            break;
                        }
                    }
                }
            } else {
                if (SsFmt==ssfTabSep) {
// Splits the character stream SIn into paragraphs and appends them to Paragraphs.
// Two consecutive whitespace characters (space, tab or newline) end the current
// paragraph. The first whitespace character of such a pair stays part of the
// paragraph text; further whitespace in the same run is dropped.
// Text left in the buffer at end of input is added as the last paragraph.
void TTokenizerUtil::Paragraphize(const PSIn& SIn, TStrV& Paragraphs) {
	TChA ParagraphBuf;
	int c;
	bool wasSpace = false;
	while (!SIn->Eof()) {
		c = SIn->GetCh();
		// two consecutive spaces signal a new paragraph
		if (c == ' ' || c == '\t' || c == '\n') {
			if (wasSpace) {
				// runs of three or more whitespace characters reach this point with an
				// empty buffer; skip those so no empty paragraphs are emitted
				// (same rule as the final flush below)
				if (ParagraphBuf.Len() > 0) {
					Paragraphs.Add(ParagraphBuf);
					ParagraphBuf.Clr();
				}
				continue;
			}
			wasSpace = true;
		} else {
			wasSpace = false;
		}
		ParagraphBuf += c;
	}
	if (ParagraphBuf.Len() > 0) {
		Paragraphs.Add(ParagraphBuf);
	}
}
Example #4
0
File: util.cpp Project: pikma/Snap
// Extracts the lower-cased host part of a URL,
// e.g. http://www.ijs.si/fdfd/blah.html --> www.ijs.si
// Handles URLs with and without the leading "http://".
TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
    // last character before the first '/' found at or after position 7 (skips "http://")
    const int HostEnd = UrlChA.SearchCh('/', 7)-1;
    if (HostEnd > 0) {
        const int HostBegSlash = UrlChA.SearchChBack('/', HostEnd);
        TChA Dom;
        if (HostBegSlash > 0) {
            // scheme present: host sits between the last '/' of "//" and the path
            Dom = UrlChA.GetSubStr(HostBegSlash+1, HostEnd);
        } else {
            // no scheme: host is everything before the very first '/'
            Dom = UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1);
        }
        Dom.ToLc();
        return Dom;
    }
    // no '/' at or after position 7
    if (UrlChA.IsPrefix("http://")) {
        TChA Dom = UrlChA.GetSubStr(7, UrlChA.Len()-1);
        Dom.ToLc();
        return Dom;
    }
    const int FirstSlash = UrlChA.SearchCh('/', 0);
    TChA Dom;
    if (FirstSlash > 0) {
        Dom = UrlChA.GetSubStr(0, FirstSlash-1);
    } else {
        Dom = TChA(UrlChA);
    }
    Dom.ToLc();
    return Dom;
}
Example #5
0
File: bowfl.cpp Project: Accio/snap
// Loads documents from the line-document text file LnDocFNm (one document per line)
// and adds them to BowDocBs; the ids returned by AddHtmlDoc are collected in NewDIdV.
// Line layout as parsed below:
//   [DocName] {!Category} DocumentText
// - NamedP: when true, the first token of a line (up to a space or end of line) is the
//   document name; otherwise the running document counter is used as the name.
// - MxDocs: limit on the document counter, -1 disables the limit.
// - SaveDocP: passed through to AddHtmlDoc.
void TBowFl::LoadLnDocTxt(PBowDocBs BowDocBs, const TStr& LnDocFNm,
 TIntV& NewDIdV, const bool& NamedP, const int& MxDocs, const bool& SaveDocP) {
  // open line-doc file
  NewDIdV.Clr(); TFIn FIn(LnDocFNm); char Ch=' '; int Docs=0;
  while (!FIn.Eof()){
    // NOTE(review): Docs is incremented before the comparison, so the loop stops as soon
    // as Docs reaches MxDocs, i.e. at most MxDocs-1 documents are read - confirm intended
    Docs++; if ((MxDocs!=-1)&&(Docs>=MxDocs)){break;}
    printf("%d\r", Docs);
    // document name
    TChA DocNm;
    // first character of the line; when NamedP is false it is examined by the
    // category/text loops below
    Ch=FIn.GetCh();
    if (NamedP){
      while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
        DocNm+=Ch; Ch=FIn.GetCh();}
      DocNm.Trunc();
      // a line without a name does not count as a document
      if (DocNm.Empty()){Docs--; continue;}
    } else {
        DocNm = TInt::GetStr(Docs);
    }
    // categories
    TStrV CatNmV;
    forever {
      // skip spaces between tokens
      while ((!FIn.Eof())&&(Ch==' ')){Ch=FIn.GetCh();}
      // a token starting with '!' is a category name; the first other token starts the text
      if (Ch=='!'){
        if (!FIn.Eof()){Ch=FIn.GetCh();}
        TChA CatNm;
        while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')&&(Ch!=' ')){
          CatNm+=Ch; Ch=FIn.GetCh();}
        if (!CatNm.Empty()){CatNmV.Add(CatNm);}
      } else {
        break;
      }
    }
    // document text
    TChA DocChA;
    while ((!FIn.Eof())&&(Ch!='\r')&&(Ch!='\n')){
      DocChA+=Ch; Ch=FIn.GetCh();}
    // skip empty documents (empty lines)
    if (DocNm.Empty()&&DocChA.Empty()){
      continue;}
    // add document to document-base
    NewDIdV.Add(BowDocBs->AddHtmlDoc(DocNm, CatNmV, DocChA, SaveDocP));
  }
  // return document-base
  BowDocBs->AssertOk();
  printf("\n");
}
Example #6
0
const char* TJsonObj::ParseArrayVal(const char* JsonStr) {
  const char *c = JsonStr;
  bool Nested = false;
  TChA ValStr;
  Clr();
  while (*c && TCh::IsWs(*c)) { c++; }
  if (*c == '"') { c = GetStr(c, ValStr); } // string
  else if (TCh::IsNum(*c) || (*c=='-' &&  TCh::IsNum(*(c+1)))) {  // number
    while (*c && *c!=',' && *c!='}' && *c!=']' && ! TCh::IsWs(*c)) { ValStr.Push(*c); c++; } }
  else if (*c=='t' || *c=='f' || *c=='n') { // true, false, null
    while (*c && *c!=',' && *c!='}' && *c!=']') { ValStr.Push(*c); c++; } }
  else if (*c=='{') { // nested object
    EAssertR(! KeyArrayH.IsKey("key"), "JSON error: object with key 'key' already exists");
    TJsonObj& Obj = KeyObjH.AddDat("key");
    c = Obj.Parse(c) + 1;  Nested = true;
  }
  else if (*c=='[') { // array
    EAssertR(! KeyArrayH.IsKey("key"), "JSON error: array with key 'key' already exists");
    TVec<TJsonObj>& Array = KeyArrayH.AddDat("key");
      c++;
      while (*c && *c!=']') {
        while (*c && TCh::IsWs(*c)) { c++; }
        Array.Add();
        if (*c=='{') { c = Array.Last().Parse(c) + 1; } // nested object
        else { c = Array.Last().ParseArrayVal(c); }
        if (*c && *c==',') { c++; }
      }
      c++; Nested = true;
  }
  if (! Nested) {
    EAssertR(! KeyArrayH.IsKey("key"), "JSON error: object with key 'key' already exists");
    KeyValH.AddDat("key", ValStr); 
  }
  while (*c && TCh::IsWs(*c)) { c++; }
  return c;
}
Example #7
0
// remove ending /, /index.html, etc. and strip starting www.
bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
  UrlOut = UrlIn;
  if (StripEnd(UrlIn, "/", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
  if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
    // if UrlIn is relative url, try combine it with BaseUrl
    if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
      //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
      return false; }
    TChA Out;
    if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
    if (UrlIn[0] != '/') { Out.AddCh('/'); }
    Out += UrlOut;
    UrlOut = Out;
  }
  // http://www. --> http://
  if (UrlOut.IsPrefix("http://www.")) {
    UrlOut = "http://"+UrlOut.GetSubStr(11, TInt::Mx);
  }
  UrlOut.ToLc();
  return true;
}
Example #8
0
File: util.cpp Project: pikma/Snap
// Splits ChA into lines IN PLACE: every '\n' (and a '\r' directly before it) is
// overwritten with 0 and LineV receives pointers to the line starts inside ChA.
// The first line is always added. With SkipEmpty a new line start is only recorded
// when the line that just ended contained at least one character.
// Returns the number of entries in LineV.
int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
    LineV.Clr(false);
    LineV.Add(ChA.CStr());
    bool LineHasChs = false;
    char *Cur = (char *) ChA.CStr();
    while (*Cur) {
        if (*Cur != '\n') {
            // ordinary character (this includes a lone '\r')
            LineHasChs = true;
            Cur++;
            continue;
        }
        // end of line: cut off "\r\n" or "\n"
        if (Cur > ChA.CStr() && *(Cur-1)=='\r') {
            *(Cur-1) = 0;
        }
        *Cur = 0;
        if (!SkipEmpty || LineHasChs) {
            LineV.Add(Cur+1);
        }
        LineHasChs = false;
        Cur++;
    }
    return LineV.Len();
}
///////////////////////////////
// Tokenizer-Utils

// Splits the character stream SIn into sentences and appends them to Sentences.
// Sentence delimiters are  " . ! : ; ?  and tab; '\r' and '\n' are delimiters only
// when SplitNewLineP is true, otherwise they are replaced by a single space.
// Delimiters are never copied into the sentence. A sentence is emitted at a
// delimiter only if it is longer than 2 characters; shorter text stays in the
// buffer and is continued. Whatever is left at end of input is emitted if non-empty.
void TTokenizerUtil::Sentencize(const PSIn& SIn, TStrV& Sentences, const bool& SplitNewLineP) {
	TChA SentenceBuf;
	int c;
	while (!SIn->Eof()) {
		c = SIn->GetCh();
		const bool IsNewLine = (c == '\r') || (c == '\n');
		if (IsNewLine && !SplitNewLineP) {
			// line breaks inside a sentence become a space
			SentenceBuf += ' ';
			continue;
		}
		const bool IsDelimiter = IsNewLine || (c == '"') || (c == '.') || (c == '!') ||
			(c == ':') || (c == ';') || (c == '?') || (c == '\t');
		if (!IsDelimiter) {
			SentenceBuf += c;
			continue;
		}
		if (SentenceBuf.Len() > 2) {
			Sentences.Add(SentenceBuf);
			// NOTE(review): every emitted sentence is also written to stdout - looks
			// like debugging output; confirm whether anything relies on it
			printf("%s\n", SentenceBuf.CStr());
			SentenceBuf.Clr();
		}
	}
	if (SentenceBuf.Len() > 0) {
		Sentences.Add(SentenceBuf);
	}
}
Example #10
0
File: util.cpp Project: pikma/Snap
// Reads <TagNm> ... </TagNm> from the lexer (there can be many tags in between).
// Returns false when the next symbol is not a start tag. Otherwise TagNm receives
// the tag name and TagVal the text collected up to the matching end tag:
// with TakeTagNms the lexer text of every symbol is appended, without it only
// the text of string symbols.
// NOTE(review): the loop only stops at the matching end tag - presumably the caller
// guarantees well-formed input; verify behaviour at end of input.
bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
    if (XmlLx.GetSym() != xsySTag) {
        return false;
    }
    TagVal.Clr();
    TagNm = XmlLx.TagNm;
    for (;;) {
        // stop once the end tag carrying the same name is reached
        const bool AtClosingTag = (XmlLx.Sym == xsyETag) && !(XmlLx.TagNm != TagNm.CStr());
        if (AtClosingTag) {
            break;
        }
        if (TakeTagNms || XmlLx.Sym == xsyStr) {
            TagVal += XmlLx.TxtChA;
        }
        XmlLx.GetSym();
    }
    return true;
}
Example #11
0
File: html.cpp Project: pikma/Snap
// Returns a copy of ChA in which the byte values 232, 200, 154, 138, 158 and 142
// are replaced by the plain letters c, C, s, S, z and Z; all other characters
// are copied unchanged.
TStr THtmlLxChDef::GetCSZFromWin1250(const TChA& ChA){
  // parallel tables: SrcChs[i] is replaced by DstChs[i]
  static const unsigned char SrcChs[]={232, 200, 154, 138, 158, 142};
  static const char DstChs[]="cCsSzZ";
  const int Maps=6;
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const unsigned char SrcCh=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=SrcCh)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=SrcCh;}
  }
  return DstChA;
}
Example #12
0
File: util.cpp Project: pikma/Snap
// Returns true when the fraction of non-whitespace characters of Str that have a
// positive char value and are alphanumeric (per TCh::IsAlNum) is strictly greater
// than MinAlFrac.
// For a string without any non-whitespace character the ratio is 0/0; assuming
// IEEE floating point the comparison is then false and the function returns false.
bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
    int LatinChs = 0;
    int NonWsChs = 0;
    const char *Cur = Str.CStr();
    while (*Cur) {
        const char Ch = *Cur;
        Cur++;
        if (TCh::IsWs(Ch)) {
            continue;
        }
        NonWsChs++;
        if (Ch > 0 && TCh::IsAlNum(Ch)) {
            LatinChs++;
        }
    }
    return double(LatinChs)/double(NonWsChs) > MinAlFrac;
}
Example #13
0
// Parses a "Year<DateSepCh>Month<DateSepCh>Day Hour<TimeSepCh>Min<TimeSepCh>Sec"
// string into a TSecTm. Date and time part are separated by a single space.
// When the hour field is not an integer (e.g. the time part is missing) the time
// of day is taken as 00:00:00.
TSecTm TSecTm::GetDtTmFromYmdHmsStr(const TStr& YmdHmsPmStr,
 const char& DateSepCh, const char& TimeSepCh){
  const int StrLen=YmdHmsPmStr.Len();
  TChA FldChA; int Pos=0;
  // year: everything up to the first date separator
  for (; (Pos<StrLen)&&(YmdHmsPmStr[Pos]!=DateSepCh); Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr YearStr=FldChA;
  // month: up to the second date separator (Pos++ skips the separator itself)
  FldChA.Clr(); Pos++;
  for (; (Pos<StrLen)&&(YmdHmsPmStr[Pos]!=DateSepCh); Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr MonthStr=FldChA;
  // day: up to the space between date and time
  FldChA.Clr(); Pos++;
  for (; (Pos<StrLen)&&(YmdHmsPmStr[Pos]!=' '); Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr DayStr=FldChA;
  // hour: up to the first time separator
  FldChA.Clr(); Pos++;
  for (; (Pos<StrLen)&&(YmdHmsPmStr[Pos]!=TimeSepCh); Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr HourStr=FldChA;
  // minute: up to the second time separator
  FldChA.Clr(); Pos++;
  for (; (Pos<StrLen)&&(YmdHmsPmStr[Pos]!=TimeSepCh); Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr MinStr=FldChA;
  // second: the rest of the string
  FldChA.Clr(); Pos++;
  for (; Pos<StrLen; Pos++){
    FldChA+=YmdHmsPmStr[Pos];}
  TStr SecStr=FldChA;
  // transform to numbers
  int MonthN=MonthStr.GetInt();
  int DayN=DayStr.GetInt();
  int YearN=YearStr.GetInt();
  // the time of day defaults to midnight unless the hour field is a number
  int HourN=0; int MinN=0; int SecN=0;
  if (HourStr.IsInt()){
    HourN=HourStr.GetInt();
    MinN=MinStr.GetInt();
    SecN=SecStr.GetInt();
  }
  // construct the time: start from the date and add the time of day
  TSecTm Tm=TSecTm::GetDtTm(YearN, MonthN, DayN);
  Tm.AddHours(HourN);
  Tm.AddMins(MinN);
  Tm.AddSecs(SecN);
  return Tm;
}
// Checks whether the input at the current position looks like an HTTP response
// status line, i.e. matches the mould "http/N.N NNN " where every 'N' must be a
// digit (per ChDef.IsDigit) and every other character must match after lower-casing.
// Returns true on a match, false otherwise.
bool THttpLx::IsRespStatusLn(){
  static const TChA MouldChA="http/N.N NNN ";
  // TestChA starts as a copy of the mould and is overwritten with the actual input
  TChA TestChA(MouldChA);
  int TestLen=TestChA.Len();
  // presumably: not enough input left for a full status line - TODO confirm what Len() counts
  if (1+Len()<TestLen){return false;}
  // position 0 is the current character; the remaining TestLen-1 are read ahead
  TestChA.PutCh(0, ChDef.GetLcCh(Ch));
  {for (int ChN=1; ChN<TestLen; ChN++){
    TestChA.PutCh(ChN, ChDef.GetLcCh(GetCh()));}}
  // hand characters TestLen-2 down to 0 to the lexer's PutCh, in reverse order
  // NOTE(review): presumably this pushes the look-ahead back so the lexer position is
  // restored; the characters given back are the lower-cased copies - confirm
  {for (int ChN=1; ChN<TestLen; ChN++){
    PutCh(TestChA[TestLen-ChN-1]);}}
  // compare the collected characters against the mould
  {for (int ChN=0; ChN<MouldChA.Len(); ChN++){
    if (MouldChA[ChN]=='N'){
      if (!ChDef.IsDigit(TestChA[ChN])){return false;}
    } else {
      if (MouldChA[ChN]!=TestChA[ChN]){return false;}
    }
  }}
  return true;
}
Example #15
0
File: html.cpp Project: pikma/Snap
// Returns a copy of ChA in which the punctuation characters ~ ^ } ] | \ { [ ` @
// are replaced by the plain letters c C c C d D s S z Z; all other characters
// are copied unchanged.
TStr THtmlLxChDef::GetCSZFromYuascii(const TChA& ChA){
  // parallel tables: SrcChs[i] is replaced by DstChs[i]
  static const char SrcChs[]="~^}]|\\{[`@";
  static const char DstChs[]="cCcCdDsSzZ";
  const int Maps=10;
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char SrcCh=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=SrcCh)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=SrcCh;}
  }
  return DstChA;
}
Example #16
0
File: html.cpp Project: pikma/Snap
// Returns a copy of ChA in which the punctuation characters ~ ^ { [ ` @ are replaced
// by the byte values 232, 200, 154, 138, 158, 142 and the characters } ] | \ by the
// plain letters c C d D; all other characters are copied unchanged.
TStr THtmlLxChDef::GetWin1250FromYuascii(const TChA& ChA){
  // parallel tables: SrcChs[i] is replaced by DstChs[i]
  static const char SrcChs[]="~^}]|\\{[`@";
  static const uchar DstChs[]={232, 200, 'c', 'C', 'd', 'D', 154, 138, 158, 142};
  const int Maps=10;
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char SrcCh=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=SrcCh)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=SrcCh;}
  }
  return DstChA;
}
Example #17
0
File: html.cpp Project: pikma/Snap
// Returns a copy of ChA in which the punctuation characters ~ ^ } ] | \ { [ ` @
// are replaced by the byte values 232, 200, 230, 198, 240, 208, 185, 169, 190, 174;
// all other characters are copied unchanged.
TStr THtmlLxChDef::GetIsoCeFromYuascii(const TChA& ChA){
  // parallel tables: SrcChs[i] is replaced by DstChs[i]
  static const char SrcChs[]="~^}]|\\{[`@";
  static const uchar DstChs[]={232, 200, 230, 198, 240, 208, 185, 169, 190, 174};
  const int Maps=10;
  TChA DstChA;
  const int Chs=ChA.Len();
  for (int ChN=0; ChN<Chs; ChN++){
    const char SrcCh=ChA[ChN];
    int MapN=0;
    while ((MapN<Maps)&&(SrcChs[MapN]!=SrcCh)){MapN++;}
    if (MapN<Maps){DstChA+=DstChs[MapN];}
    else {DstChA+=SrcCh;}
  }
  return DstChA;
}
Example #18
0
/// For every quote, add it to the corresponding bucket for each hashed
/// ShingleLen-character shingle of the quote (shingles by characters).
/// Quotes for which CB->IsQuoteInArchivedCluster() is true are skipped.
/// ShingleToQuoteIds maps the MD5 signature of a shingle to the set of
/// (quote id, word index) pairs in which the shingle occurs; for every shingle the
/// current word index and up to WordWindow-1 preceding word indexes are recorded.
void LSH::HashShingles(TQuoteBase *QuoteBase, TClusterBase *CB, TInt ShingleLen,
    THash<TMd5Sig, TShingleIdSet>& ShingleToQuoteIds) {
  Err("Hashing shingles...\n");
  TIntV QuoteIds;
  QuoteBase->GetAllQuoteIds(QuoteIds);
  for (int qt = 0; qt < QuoteIds.Len(); qt++) {
    if (qt % 1000 == 0) {
      fprintf(stderr, "%d out of %d completed\n", qt, QuoteIds.Len());
    }

    if (CB->IsQuoteInArchivedCluster(QuoteIds[qt]))
      continue;
    TQuote Q;
    QuoteBase->GetQuote(QuoteIds[qt], Q);

    // Put x-character shingles into hash table; x is specified by ShingleLen parameter
    TStr QContentStr;
    Q.GetParsedContentString(QContentStr);
    TChA QContentChA = TChA(QContentStr);

    // index of the word the current shingle ends in
    int CurWord = 0;

    for (int i = 0; i < QContentChA.Len() - ShingleLen + 1; i++) {
      TChA ShingleChA = TChA();
      for (int j = 0; j < ShingleLen; j++) {
        ShingleChA.AddCh(QContentChA.GetCh(i + j));
      }
      TStr Shingle = TStr(ShingleChA);
      const TMd5Sig ShingleMd5(Shingle);
      // AddDat(Key) returns a reference to the existing bucket or creates an empty
      // one; updating it in place avoids copying the whole id set out of the hash
      // and back in for every single shingle
      TShingleIdSet& ShingleQuoteIds = ShingleToQuoteIds.AddDat(ShingleMd5);

      for (int j = CurWord; j > CurWord - WordWindow && j >= 0; j--) {
        ShingleQuoteIds.AddKey(TShingleId(QuoteIds[qt], j));
      }

      // up the current word index if we see a space
      if (QContentChA.GetCh(i + ShingleLen - 1) == ' ') {
        CurWord++;
      }
    }
  }
  Err("Done hashing!\n");
}
Example #19
0
// Returns a copy of Str in which every character is mapped through GetUc().
TStr TLxChDef::GetUcStr(const TStr& Str) const {
  const int Chs=Str.Len();
  TChA UcChA;
  int ChN=0;
  while (ChN<Chs){
    UcChA.AddCh(GetUc(Str.GetCh(ChN)));
    ChN++;
  }
  return UcChA;
}
Example #20
0
// Returns LongStr shortened for display: strings of up to MaxLen characters are
// returned unchanged, longer ones are cut to their first MaxLen characters and
// get "..." appended.
TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
  // <= : a string of exactly MaxLen characters loses nothing, so it must not get "..."
  if (LongStr.Len() <= MaxLen) { return LongStr; }
  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
  Str += "...";
  return Str;
}
Example #21
0
int TStrUtil::CountWords(const TChA& ChA) {
  return CountWords(ChA.CStr());
}
Example #22
0
// Returns the name of the website a post URL belongs to.
// For ordinary sites this is the domain as returned by GetDomNm2. For known blog
// hosting sites the part of the URL identifying the individual blog is kept as well,
// and for known redirect/aggregator URLs the embedded target URL is resolved
// recursively. All URLs are expected to start with "http://" (hence the offset 7).
TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) {
  TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
  // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
  if (DomNm == "blog.myspace.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1); 
  }
  // blog hosts where the first path component identifies the blog:
  // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
  // http://ameblo.jp/baptism/entry-10126216277.html
  // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
  // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
  // http://blogs.zdnet.com/hardware/?p=2391
  // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
  // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
  // http://blog.tv2.dk/ole.mork/entry254689.html
  // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
  // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
  // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
  // The domain comparison is against the full host name, so the guardian blogs need
  // "blogs.guardian.co.uk"; the two older spellings are kept for compatibility.
  if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" 
    || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" 
    || DomNm=="blogs.guardian.co.uk" || DomNm=="blogs.guardian.co" || DomNm=="blogs.guardian.com" 
    || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com" 
    || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk" 
    || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
      return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); 
  }
  // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
  // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
  if (DomNm == "digg.com") {
    if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
      const int Url = PostUrlStr.SearchStr(";url=");
      if (Url != -1) { 
        // the embedded url ends before the next '&', or at the end of the string
        const int AmpPos = PostUrlStr.SearchCh('&', Url+5);
        const int UrlEnd = (AmpPos < 0) ? PostUrlStr.Len()-1 : AmpPos-1;
        return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, UrlEnd)); }
    } else {
      return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
  }
  // sites that keep their blogs under /blogs/<name>/:
  // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
  // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
  if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/") 
    || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 
  }
  // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
  if (DomNm=="feeds.feedburner.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 
  }
  // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
  if (DomNm=="groups.google.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1); 
  }
  // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
  if (DomNm=="news.google.com") { // redirect
    const int UrlPos = PostUrlStr.SearchStr("&url=");
    if (UrlPos != -1) { 
      // the embedded url ends before the next '&', or at the end of the string
      const int AmpPos = PostUrlStr.SearchCh('&', UrlPos+5);
      const int UrlEnd = (AmpPos < 0) ? PostUrlStr.Len()-1 : AmpPos-1;
      return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, UrlEnd)); }
  }
  // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
  if (DomNm == "bloggrevyen.no") { // redirect
    const int Http2 = PostUrlStr.SearchStr("/http://");
    if (Http2!=-1) {
      return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
  }
  //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
  //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
  if (DomNm.IsSuffix(".rd.yahoo.com")) {
    const int Http2 = PostUrlStr.SearchStr("/*");
    if (Http2!=-1) {
      // +9 skips "/*http://"
      return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
  }
  return DomNm;
}
Example #23
0
// Writes the contents of ChA to the output stream.
// Returns the value of UpdateLnLen() for the string length plus the value of PutBf().
int TSOut::PutStr(const TChA& ChA){
  const int Chs=ChA.Len();
  // line-length bookkeeping runs before the characters themselves are written
  const int LnCs=UpdateLnLen(Chs);
  const int BfCs=PutBf(ChA.CStr(), Chs);
  return LnCs+BfCs;
}
Example #24
0
// Returns the domain name of UrlChA (see GetDomNm) without a leading "www."
TChA TStrUtil::GetDomNm2(const TChA& UrlChA) {
  TChA Dom = GetDomNm(UrlChA);
  if (! Dom.IsPrefix("www.")) { return Dom; }
  // drop the 4 characters of "www."
  return Dom.GetSubStr(4, TInt::Mx);
}
Example #25
0
// Returns the stem of s. The word is upper-cased in a private, modifiable copy
// which is then handed to StemInPlace().
TStr TPorterStemmer::Stem(const TStr& s) {
    TChA UcWord = s;
    UcWord.ToUc();
    char *WordBf = UcWord.CStr();
    return StemInPlace(WordBf);
}
Example #26
0
/////////////////////////////////////////////////
// Roget-Base
// Walks all web pages stored under WebBaseFPath, keeps those whose URL contains
// "RG.sh" and parses each page's HTML token stream: first the category heading
// inside <h2>...</h2>, then the word list that follows it.
// The category name is reported through TNotify::OnNotify; the parsed words are
// only collected in a local variable (see the commented-out notification below).
// The parser relies on IAssert for the expected page layout.
void TRBase::LoadArtfl(const TStr& WebBaseFPath){
  PWebBase WebBase=PWebBase(new TWebMemBase(WebBaseFPath));
  int WebPgP=WebBase->FFirstWebPg(); int WebPgId;
  while (WebBase->FNextWebPg(WebPgP, WebPgId)){
    TStr UrlStr=WebBase->GetUrlStr(WebPgId);
    // only pages whose URL contains "RG.sh" are processed
    static TStr RgShStr="RG.sh"; if (!UrlStr.IsStrIn(RgShStr)){continue;}
//    if (!UrlStr.IsStrIn("RG.sh?^544\\")){continue;}

    PWebPg WebPg=WebBase->GetWebPg(WebPgId);
    PSIn SIn=TStrIn::New(WebPg->GetBodyStr());
    PHtmlDoc HtmlDoc=THtmlDoc::New(SIn, hdtAll);
    // TokN is the index of the next token to fetch; Sym/Str hold the current token
    int TokN=0; PHtmlTok Tok; THtmlLxSym Sym; TStr Str;

    // move to <h2>
    // NOTE(review): no bound check on TokN here - presumably every matching page has an <h2>
    do {HtmlDoc->GetTok(TokN++, Sym, Str);
    } while (!((Sym==hlsyBTag)&&(Str==THtmlTok::H2TagNm)));

    // parse "ddd[A|B]."
    // CtgNm collects the full heading text, CtgIdNm only the number and optional letter
    TChA CtgNm; TChA CtgIdNm;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    IAssert(Sym==hlsyNum); CtgNm+=Str; CtgIdNm+=Str;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    if (Sym==hlsyStr){
      IAssert((Str=='A')||(Str=='B')); CtgNm+=Str; CtgIdNm+=Str;
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    IAssert((Sym==hlsySSym)&&(Str=='.')); CtgNm+=Str;

    // parse to </h2>"
    // text inside [...] is collected separately and appended after the rest of the heading
    TChA BracketStr;
    HtmlDoc->GetTok(TokN++, Sym, Str);
    while (!((Sym==hlsyETag)&&(Str==THtmlTok::H2TagNm))){
      if ((Sym==hlsySSym)&&(Str=='[')){
        HtmlDoc->GetTok(TokN++, Sym, Str);
        while (!((Sym==hlsySSym)&&(Str==']'))){
          if ((!BracketStr.Empty())&&(Sym==hlsyStr)){BracketStr+=' ';}
          BracketStr+=Str; HtmlDoc->GetTok(TokN++, Sym, Str);
        }
        BracketStr.Ins(0, " ["); BracketStr+=']';
      } else {
        // string tokens are separated by a space, other tokens are appended directly
        if (Sym==hlsyStr){CtgNm+=' ';}
        CtgNm+=Str;
      }
      HtmlDoc->GetTok(TokN++, Sym, Str);
    }
    CtgNm+=BracketStr;
    TNotify::OnNotify(Notify, ntInfo, CtgNm);

    // parse words
    // marker strings that are recognized in the word list and not treated as words
    static TStr AdjStr="ADJ"; static TStr AdvStr="ADV";
    static TStr IntStr="INT"; static TStr PgStr="PAGE";
    static TStr PhrStr="PHR"; static TStr PrefStr="PREF";
    static TStr PronStr="PRON";
    HtmlDoc->GetTok(TokN++, Sym, Str);
    // the word list has to start with the marker "N" or "ADV"
    IAssert((Sym==hlsyStr)&&((Str=='N')||(Str==AdvStr)));
    while (TokN<HtmlDoc->GetToks()){
      if (Sym==hlsyStr){
        // the "PHR" marker ends the word list
        if (Str==PhrStr){break;}
        if ((Str!='N')&&(Str!='V')&&(Str!=AdjStr)&&(Str!=AdvStr)&&
         (Str!=IntStr)&&(Str!=PrefStr)&&(Str!=PronStr)){
          // consecutive string tokens form one (possibly multi-word) entry
          TChA WordStr;
          do {
            if (!WordStr.Empty()){WordStr+=' ';} WordStr+=Str;
            HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (Sym==hlsyStr);
//          TNotify::OnNotify(Notify, ntInfo, WordStr);
        } else {
          HtmlDoc->GetTok(TokN++, Sym, Str);
        }
      } else
      if (Sym==hlsySSym){
        // an opening bracket or quote: skip everything up to the matching closing symbol
        TStr ExpectStr;
        if (Str=='('){ExpectStr=')';}
        else if (Str=='['){ExpectStr=']';}
        else if (Str=='{'){ExpectStr='}';}
        else if (Str=='"'){ExpectStr='"';}
        if (!ExpectStr.Empty()){
          do {HtmlDoc->GetTok(TokN++, Sym, Str);
          } while (!((Sym==hlsySSym)&&(Str==ExpectStr)));
        }
        HtmlDoc->GetTok(TokN++, Sym, Str);
      } else {
        HtmlDoc->GetTok(TokN++, Sym, Str);
      }
    }
  }
}
Example #27
0
// Returns true when the HTTP response is an OK text/html page that contains a run
// of more than MnCTxtToks consecutive text tokens (strings, numbers, symbols) that
// is not interrupted by tags other than <P>, <B>, <I>, by script/style sections or
// by other non-text tokens. The URL of every such page is printed to stdout.
// Returns false for non-OK responses and for other content types.
bool IsCTxtHttpResp(const PUrl& Url, const PHttpResp& HttpResp, const int& MnCTxtToks){
  if (HttpResp->IsStatusCd_Ok()){
    // NOTE(review): WebPg is not used in the rest of this function
    PWebPg WebPg=TWebPg::New(Url->GetUrlStr(), HttpResp);
    if (HttpResp->IsContType(THttp::TextHtmlFldVal)){
      TMem BodyMem=HttpResp->GetBodyAsMem();
      PSIn BodyMemIn=TMemIn::New(BodyMem);
      // prepare html-tokens
      PHtmlDoc HtmlDoc=THtmlDoc::New(BodyMemIn, hdtAll, false);
      int Toks=HtmlDoc->GetToks(); THtmlLxSym TokSym; TStr TokStr;
      // prepare continuous-text indicators
      // CTxtToks counts the tokens of the current text run, CTxtChA holds their text
      int CTxtToks=0; TChA CTxtChA; bool CTxtP=false;
      // prepare script & style flag
      bool InScript=false; bool InStyle=false; 
      // traverse tokens
      for (int TokN=0; TokN<Toks; TokN++){
        // get token data
        HtmlDoc->GetTok(TokN, TokSym, TokStr);
        switch (TokSym){
          case hsyStr:
          case hsyNum:
          case hsySSym:
            // text inside script or style sections is ignored
            if (!InScript&&!InStyle){
              // text token
              CTxtToks++; CTxtChA+=TokStr; CTxtChA+=' '; 
            }
            break;
          case hsyBTag:
            if (!InScript&&!InStyle){
              if (TokStr=="<SCRIPT>"){
                // start of script
                InScript=true; CTxtToks=0; CTxtChA.Clr();
              } else 
              if (TokStr=="<STYLE>"){
                // start of style
                InStyle=true; CTxtToks=0; CTxtChA.Clr();
              } else {
                if ((TokStr=="<P>")||(TokStr=="<B>")||(TokStr=="<I>")){
                  // skip in-text-tags
                } else {
                  // non-text-tags - break continuous-text
                  CTxtToks=0; CTxtChA.Clr();
                }
              }
            }
            break;
          case hsyETag:
            // end tags only matter for leaving a script/style section;
            // they do not reset the current text run
            // NOTE(review): assumes the lexer reports end tags with the same
            // "<NAME>" text as begin tags - confirm
            if (InScript||InStyle){
              if (TokStr=="<SCRIPT>"){
                // end of script
                InScript=false;
              } else
              if (TokStr=="<STYLE>"){
                // end of style
                InStyle=false;
              }
            }
            break;
          default: 
            // non-text-token - break continuous-text
            CTxtToks=0; CTxtChA.Clr();
            break;
        }
        // stop if enough continuous-text
        if (CTxtToks>MnCTxtToks){
          CTxtP=true; break;
        }
      }
      if (CTxtP){
        printf("%s\n", Url->GetUrlStr().CStr());
      }
      return CTxtP;
    }
  }
  return false;
}
Example #28
0
// Builds a TAmazonItem from a product web page by scanning its HTML with THtmlLx.
// The page is read strictly front to back in five steps: locate the title font tag,
// collect the title, collect the author names, locate the "Customers who bought"
// text and collect the item ids of the links that follow it.
// Every step simply stops at end of input, so a page with a different layout
// yields empty fields rather than an error.
PAmazonItem TAmazonItem::GetFromWebPg(const PWebPg& WebPg){
  TStr UrlStr=WebPg->GetUrlStr();
  TStr ItemId=TAmazonItem::GetItemId(WebPg->GetUrl());
  TStr HtmlStr=WebPg->GetHttpBodyAsStr();
  PSIn HtmlSIn=TStrIn::New(HtmlStr);
  THtmlLx HtmlLx(HtmlSIn);
  // Sym/ChA hold a copy of the current lexer symbol and its text
  THtmlLxSym Sym; TChA ChA;

  // move to title
  // the title is introduced by <FONT FACE="verdana,arial,helvetica"> without a SIZE argument
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<FONT>")){
      TStr FaceArg=HtmlLx.GetArg("FACE", "");
      TStr SizeArg=HtmlLx.GetArg("SIZE", "");
      if ((FaceArg=="verdana,arial,helvetica")&&(SizeArg.Empty())){break;}
    }
  }
  // extract title
  // everything up to the FONT end tag, keeping the spacing reported by the lexer
  TChA TitleChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
    if (!TitleChA.Empty()){TitleChA+=HtmlLx.GetPreSpaceStr();}
    TitleChA+=ChA;
  }
  TStr TitleStr=TitleChA;
  //printf("'%s'\n", TitleStr.CStr());
  // extract authors
  // each <A>...</A> up to the next FONT end tag contributes one author name
  TStrV AuthorNmV;
  TChA AuthorNmChA;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      // NOTE(review): this inner loop does not check for hsyEof - presumably every
      // <A> is closed on the pages this is used for
      do {
        HtmlLx.GetSym();
        Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
        if (Sym==hsyStr){
          if (!AuthorNmChA.Empty()){AuthorNmChA+=HtmlLx.GetPreSpaceStr();}
          AuthorNmChA+=ChA;
        }
      } while (!((Sym==hsyETag)&&(ChA=="<A>")));
      AuthorNmV.Add(AuthorNmChA); AuthorNmChA.Clr();
    }
    if ((Sym==hsyETag)&&(ChA=="<FONT>")){break;}
  }
  // debugging aid: the loop body is commented out
  for (int AuthorNmN=0; AuthorNmN<AuthorNmV.Len(); AuthorNmN++){
    //printf("'%s'\n", AuthorNmV[AuthorNmN].CStr());
  }
  // move to x-sell
  // PrevStrQ keeps the most recent string tokens; any non-string token clears it
  TStrQ PrevStrQ(3);
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if (Sym==hsyStr){
      PrevStrQ.Push(ChA);
      if ((PrevStrQ.Len()==3)&&(PrevStrQ[0]=="Customers")
       &&(PrevStrQ[1]=="who")&&(PrevStrQ[2]=="bought")){break;}
    } else {
      PrevStrQ.Clr();
    }
  }
  // extract x-sell pointers
  // every link up to the closing </UL> is resolved against the page URL and
  // turned into an item id
  TStrV NextItemIdV;
  while (HtmlLx.GetSym()!=hsyEof){
    Sym=HtmlLx.Sym; ChA=HtmlLx.ChA;
    if ((Sym==hsyBTag)&&(ChA=="<A>")){
      TStr RelUrlStr=HtmlLx.GetArg("HREF");
      PUrl Url=TUrl::New(RelUrlStr, UrlStr);
      TStr NextItemId=TAmazonItem::GetItemId(Url);
      NextItemIdV.Add(NextItemId);
    }
    if ((Sym==hsyETag)&&(ChA=="<UL>")){break;}
  }
  // debugging aid: the loop body is commented out
  for (int NextItemIdN=0; NextItemIdN<NextItemIdV.Len(); NextItemIdN++){
    //printf("'%s'\n", NextItemIdV[NextItemIdN].CStr());
  }

  // construct item object
  PAmazonItem AmazonItem=PAmazonItem(new
   TAmazonItem(ItemId, TitleStr, AuthorNmV, NextItemIdV));
  return AmazonItem;
}
Example #29
0
// Creates a memory input stream over a private copy of the characters of ChA.
// Exactly ChA.Len() bytes are copied; the buffer is not zero-terminated.
TMIn::TMIn(const TChA& ChA):
  TSBase("Input-Memory"), TSIn("Input-Memory"), Bf(NULL), BfC(0), BfL(0){
  // memcpy instead of strncpy: strncpy stops at the first 0 character and fills
  // the rest with zeros, which would lose data in buffers with embedded zeros
  BfL=ChA.Len(); Bf=new char[BfL]; memcpy(Bf, ChA.CStr(), BfL);
}
Example #30
0
File: bowfl.cpp Project: Accio/snap
// Builds a bag-of-words document base from a tabular text file.
// The table FNm is loaded in the spreadsheet format named by SsFmtNm; row 0 is
// skipped (presumably the header row) and every further row becomes one document.
// - IdFldN / IdFldNm: column holding the document name, given by index or by name
//   (the name is only consulted when IdFldN is -1); with neither, the row number is used.
// - CatFldNV / CatFldNmV: category columns by index and by name (both lists are merged).
// - TxtFldNV / TxtFldNmV: text columns by index and by name (both lists are merged);
//   the texts of one row are joined with " <br> ".
// - SwSetTypeNm, StemmerTypeNm: select the stop-word set and the stemmer.
// - MxNGramLen, MnNGramFq: n-gram settings; no n-gram base is built when both are 1.
// Recs is not used by this function.
PBowDocBs TBowFl::LoadTabTxt(
 const TStr& FNm, const TStr& SsFmtNm, const int& Recs,
 const TStr& SwSetTypeNm, const TStr& StemmerTypeNm,
 const int& MxNGramLen, const int& MnNGramFq,
 const int& IdFldN, const TStr& IdFldNm,
 const TIntV& CatFldNV, const TStrV& CatFldNmV,
 const TIntV& TxtFldNV, const TStrV& TxtFldNmV){
  // load table-data
  TSsFmt SsFmt=TSs::GetSsFmtFromStr(SsFmtNm);
  PSs Ss=TSs::LoadTxt(SsFmt, FNm);
  // resolve the id column
  int IdX=IdFldN;
  if ((IdX==-1)&&(!IdFldNm.Empty())){
    IdX=Ss->GetFldX(IdFldNm);
  }
  // resolve the category columns: given indexes plus the named columns, sorted
  TIntV CatXV=CatFldNV;
  for (int NmN=0; NmN<CatFldNmV.Len(); NmN++){
    CatXV.Add(Ss->GetFldX(CatFldNmV[NmN]));
  }
  CatXV.Sort();
  // resolve the text columns the same way
  TIntV TxtXV=TxtFldNV;
  for (int NmN=0; NmN<TxtFldNmV.Len(); NmN++){
    TxtXV.Add(Ss->GetFldX(TxtFldNmV[NmN]));
  }
  TxtXV.Sort();
  // collect name, categories and text of every row
  TStrV DocNmV;
  TVec<TStrV> CatNmVV;
  TStrV DocStrV;
  const int Rows=Ss->GetYLen();
  for (int RowN=1; RowN<Rows; RowN++){
    // document-name
    if (IdX==-1){
      DocNmV.Add(TInt::GetStr(RowN));
    } else {
      DocNmV.Add(Ss->GetVal(IdX, RowN));
    }
    // categories: empty cells are ignored
    TStrV RowCatNmV;
    for (int CatN=0; CatN<CatXV.Len(); CatN++){
      TStr CatNm=Ss->GetVal(CatXV[CatN], RowN);
      if (CatNm.Empty()){continue;}
      RowCatNmV.Add(CatNm);
    }
    CatNmVV.Add(RowCatNmV);
    // text: the separator is added whenever something was already collected,
    // even when the current cell is empty
    TChA RowTxtChA;
    for (int TxtN=0; TxtN<TxtXV.Len(); TxtN++){
      TStr CellStr=Ss->GetVal(TxtXV[TxtN], RowN);
      if (!RowTxtChA.Empty()){RowTxtChA+=" <br> ";}
      if (!CellStr.Empty()){RowTxtChA+=CellStr;}
    }
    DocStrV.Add(RowTxtChA);
  }
  // prepare stop-words and stemmer
  PSwSet SwSet=TSwSet::GetSwSet(SwSetTypeNm);
  PStemmer Stemmer=TStemmer::GetStemmer(StemmerTypeNm);
  // create ngrams unless plain single words without frequency limit are requested
  PNGramBs NGramBs;
  const bool PlainWordsP=(MxNGramLen==1)&&(MnNGramFq==1);
  if (!PlainWordsP){
    NGramBs=TNGramBs::GetNGramBsFromHtmlStrV(
     DocStrV, MxNGramLen, MnNGramFq, SwSet, Stemmer);
  }
  // create document-base
  PBowDocBs BowDocBs=TBowDocBs::New(SwSet, Stemmer, NGramBs);
  const int Docs=DocNmV.Len();
  for (int DocN=0; DocN<Docs; DocN++){
    BowDocBs->AddHtmlDoc(DocNmV[DocN], CatNmVV[DocN], DocStrV[DocN], true);
  }
  BowDocBs->AssertOk();
  return BowDocBs;
}