Exemplo n.º 1
0
int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
  const char *c = Url.CStr();
  int cnt = 0;
  while (*c && cnt != Count) {
    if (*c == Ch) { cnt++; }
    c++;
  }
  return int(c-Url.CStr()-1);
}
Exemplo n.º 2
0
int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
      *c = 0;
      if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
      WrdV.Add(c+1);
    }
  }
  return WrdV.Len();
}
Exemplo n.º 3
0
int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if (*c == Ch) {
      *c = 0;
      if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
      WrdV.Add(c+1);
    }
  }
  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
  return WrdV.Len();
}
Exemplo n.º 4
0
Arquivo: util.cpp Projeto: pikma/Snap
// space seprated sequence of words (includes all non-blank characters, i.e., punctuations)
TChA TStrUtil::GetCleanStr(const TChA& ChA) {
    char *b = (char *) ChA.CStr();
    while (*b && ! TCh::IsAlNum(*b)) {
        b++;
    }
    if (*b == 0) {
        return TChA();
    }
    TChA OutChA(ChA.Len());
    char *e = b;
    bool ws=false;
    while (*e) {
        while (*e && TCh::IsWs(*e)) {
            e++;
            ws=true;
        }
        if (! *e) {
            break;
        }
        if (ws) {
            OutChA.AddCh(' ');
            ws=false;
        }
        OutChA.AddCh(*e);
        e++;
    }
    //OutChA.ToLc();
    return OutChA;
}
Exemplo n.º 5
0
Arquivo: util.cpp Projeto: pikma/Snap
// space separated sequence of words, remove all punctuations, etc.
TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) {
    char *b = (char *) ChA.CStr();
    while (*b && ! TCh::IsAlNum(*b)) {
        b++;
    }
    if (*b == 0) {
        return TChA();
    }
    TChA OutChA(ChA.Len());
    char *e = b, tmp;
    while (*e) {
        b = e;
        while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) {
            e++;
        }
        if (b < e) {
            tmp = *e;
            *e=0;
            OutChA += b;
            OutChA.AddCh(' ');
            *e = tmp;
        }
        while (*e && ! TCh::IsAlNum(*e)) {
            e++;
        }
        if (! *e) {
            break;
        }
    }
    OutChA.DelLastCh();
    OutChA.ToLc();
    return OutChA;
}
Exemplo n.º 6
0
const char* TSsParserMP::DumpStr() const {
  static TChA ChA(10*1024);
  ChA.Clr();
  for (int i = 0; i < FldV.Len(); i++) {
    ChA += TStr::Fmt("  %d: '%s'\n", i, FldV[i]);
  }
  return ChA.CStr();
}
Exemplo n.º 7
0
void TNodeJsFIn::readLine(const v8::FunctionCallbackInfo<v8::Value>& Args) {
    v8::Isolate* Isolate = v8::Isolate::GetCurrent();
    v8::HandleScope HandleScope(Isolate);

    TNodeJsFIn* JsFIn = ObjectWrap::Unwrap<TNodeJsFIn>(Args.This());
    TChA LnChA; JsFIn->SIn->GetNextLnBf(LnChA);

    Args.GetReturnValue().Set(v8::String::NewFromUtf8(Isolate, LnChA.CStr()));
}
Exemplo n.º 8
0
TStr TPorterStemmer::StemX(const TStr& s){
  TChA buf = s; buf.ToUc();
  int len = buf.Len(); char *p = buf.CStr();
  if (len > 1 && p[len - 1] == 'S' && p[len - 2] == '\'')
    p[len - 2] = '\0';
  else if (len > 0 && p[len - 1] == '\'')
    p[len - 1] = '\0';
  return StemInPlace(p);
}
Exemplo n.º 9
0
bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
  int AlNumCnt=0, ChCnt=0;
  for (const char *c = Str.CStr(); *c; c++) {
    if (TCh::IsWs(*c)) { continue; }
    if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
    ChCnt++;
  }
  if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
  return false;
}
Exemplo n.º 10
0
int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
  LineV.Clr(false);
  LineV.Add(ChA.CStr());
  bool IsChs=false;
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if (*c == '\n') {
      if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
      *c=0;
      if (SkipEmpty) {
        if (IsChs) { LineV.Add(c+1); }
      } else {
        LineV.Add(c+1);
      }
      IsChs=false;
    } else {
      IsChs=true;
    }
  }
  return LineV.Len();
}
Exemplo n.º 11
0
void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) {
  TextStr.Clr();
  char *StrB, *StrE;
  // use full page html: skip till <body>
  //PageHtmlStr = "<script fdsfs>  fsdfsd </script> jure";
  /*if (UseFullHtml) {
    StrB = PageHtmlStr.CStr();
    StrE = StrB+PageHtmlStr.Len();
    char * NewB = strstr(StrB, "<body>");
    if (NewB != NULL) { StrB = NewB+6; }
    char * NewE = strstr(StrB, "body>");
    if (NewE != NULL) {
      while (true) {
        char *E=strstr(NewE+4, "body>");
        if (E == NULL) { break; }  NewE = E; }
      StrE = NewE;
    }
  } else {  // only extracted post html*/
  StrB = (char *) HtmlStr.CStr();
  StrE = (char *) StrB+HtmlStr.Len(); //}
  for (char *e = StrB; e < StrE; ) {
    char* b = e;
    while (e<StrE && *e != '<') { e++; }
    // copy text
    char tmp=*e;  *e = 0;
    TextStr+= b; TextStr.AddCh(' ');  *e = tmp;
    if (e >= StrE) { return; }
    // if start of a comment: skip
    if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
      e += 3;
      while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
      e++;  continue;
    }
    // if "<script" then skip
    if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
      e += 5;
      while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
      e++;  continue;
    }
    // skip to end of tag
    while (e < StrE && *e != '>') { e++; }
    if (e>=StrE) { return; }
    e++;
  }
}
Exemplo n.º 12
0
int TStrUtil::SplitSentences(TChA& ChA, TVec<char *>& SentenceV) {
  SentenceV.Clr();
  const char *B = ChA.CStr();
  const char *E = B+ChA.Len();
  char *c = (char *) B;
  while (*c && TCh::IsWs(*c)) { c++; }
  if (*c) { SentenceV.Add(c); } else { return 0; }
  for (; c < E; c++) {
    if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
      if (c<E && *(c+1)=='"') { *c='"';  c++; } // blah." --> blah"
      if (c>=E) { continue; }
      *c=0;  c++;
      char *e = c-1;
      while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
      while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
      if (c<E) { SentenceV.Add(c); }
    }
  }
  return SentenceV.Len();
}
Exemplo n.º 13
0
// get <TagNm>*</TagNm> (can be many tags inbetween
bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
  if (XmlLx.GetSym() != xsySTag) {
    return false; }
  TagVal.Clr();
  TagNm = XmlLx.TagNm;
  //const TXmlLxSym NextSym = XmlLx.GetSym();
  while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
    if (TakeTagNms) {
      TagVal += XmlLx.TxtChA; }
    else if (XmlLx.Sym == xsyStr) {
      TagVal += XmlLx.TxtChA; }
    XmlLx.GetSym();
  }
  return true;
  //if (NextSym == xsyStr) {
  //  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
  //} else {
  //  EAssertR(NextSym == xsyETag, TagNm); // empty tag
    //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
  //}
}
Exemplo n.º 14
0
// Parse strings of the form 2006-08-28 14:11:16 or 14:11:16  08/28/2008
// Non-numeric characters act as separators (there can be many consecutive separating characters)
// Variables give indexes of the date fields
TSecTm TSecTm::GetDtTmFromStr(const TChA& YmdHmsPmStr, const int& YearId, const int& MonId,
 const int& DayId, const int& HourId, const int& MinId, const int& SecId) {
  TChA Tmp = YmdHmsPmStr;
  TVec<char *> FldV;
  // get the sequences of numbers
  for (char *c = (char *) Tmp.CStr(); *c; c++) {
    if (TCh::IsNum(*c)) {
      FldV.Add(c);
      while (TCh::IsNum(*c)) { c++; }
      c--;
    } else { *c = 0; }
  }
  const int Y = atoi(FldV[YearId]);
  const int M = atoi(FldV[MonId]);
  const int D = atoi(FldV[DayId]);
  const int H = atoi(FldV[HourId]);
  const int m = atoi(FldV[MinId]);
  const int S = atoi(FldV[SecId]);
  IAssert(Y>0 && M>0 && D>0 && M<13 && D<32);
  IAssert(H>=0 && H<24 && m>=0 && m<60 && S>=0 && S<60);
  return TSecTm(Y,M,D,H,m,S);
}
///////////////////////////////
// Tokenizer-Utils
void TTokenizerUtil::Sentencize(const PSIn& SIn, TStrV& Sentences, const bool& SplitNewLineP) {
	TChA SentenceBuf;
	int c;
	while (!SIn->Eof()) {
		c = SIn->GetCh();
		switch (c) {
			case '\r':
			case '\n':	{
				if (!SplitNewLineP) {
					SentenceBuf += ' ';
					break;
				}
			}
			case '"' :
			case '.' :
			case '!' :
			case ':' :
			case ';' :
			case '?' :
			case '\t': {
				if (SentenceBuf.Len() > 2) {
					Sentences.Add(SentenceBuf);
					printf("%s\n", SentenceBuf.CStr());
					SentenceBuf.Clr();
				}
				break;
			}
			default: 
				SentenceBuf += c;
				break;
		}
	}
	if (SentenceBuf.Len() > 0) {
		Sentences.Add(SentenceBuf);
	}	
}
Exemplo n.º 16
0
TMIn::TMIn(const TChA& ChA):
  TSBase("Input-Memory"), TSIn("Input-Memory"), Bf(NULL), BfC(0), BfL(0){
  BfL=ChA.Len(); Bf=new char[BfL]; strncpy(Bf, ChA.CStr(), BfL);
}
Exemplo n.º 17
0
int TSOut::PutStr(const TChA& ChA){
  int Cs=UpdateLnLen(ChA.Len());
  return Cs+PutBf(ChA.CStr(), ChA.Len());
}
Exemplo n.º 18
0
TStr TPorterStemmer::Stem(const TStr& s) {
    TChA buf = s;
    buf.ToUc();
    return StemInPlace(buf.CStr());
}
Exemplo n.º 19
0
void TSkyGridBs::SaveTxt(const TStr& FNm, const uint64& CurTm){
  // time-limit
  TStr CurTmStr=TTm::GetTmFromMSecs(CurTm).GetWebLogDateTimeStr();
  uint64 CurDateTm=TTm::GetMSecsFromTm(TTm::GetTmFromWebLogDateTimeStr(TTm::GetTmFromMSecs(CurTm).GetWebLogDateStr()));
  TStr CurDateTmStr=TTm::GetTmFromMSecs(CurDateTm).GetWebLogDateTimeStr();
  TUInt64V MnTmV;
  MnTmV.Add(CurDateTm-0*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-1*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-2*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-4*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-8*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-16*TTmInfo::GetDayMSecs());
  MnTmV.Add(CurDateTm-32*TTmInfo::GetDayMSecs());

  // get bow
  //PBowDocBs BowDocBs=GetBowDocBs(3, 5);
  PBowDocBs BowDocBs=GetBowDocBs();
  PBowDocWgtBs BowDocWgtBs=GetBowDocWgtBs(BowDocBs);

  // open file
  TFOut FOut(FNm); FILE* fOut=FOut.GetFileId();
  // get docs-entities sorted vector
  TIntPrV DocsEntIdPrV; GetSorted_DocsEntIdPrV(DocsEntIdPrV);
  // traverse entities
  for (int EntN=0; EntN<DocsEntIdPrV.Len(); EntN++){
    int EntId=DocsEntIdPrV[EntN].Val2;
    TStr EntNm=GetEntNm(EntId);
    int EntDocs=DocsEntIdPrV[EntN].Val1;
    TSkyGridEnt& Ent=GetEnt(EntId);
    int LinkEnts=Ent.GetLinkEnts();
    fprintf(fOut, "'%s' [%d docs] [%d ents]\n", EntNm.CStr(), EntDocs, LinkEnts);

    // output docs over dates
    {TStrIntPrV DateStrDocsPrV; int _EntDocs;
    Ent.GetDocsPerDateV(this, DateStrDocsPrV, _EntDocs);
    fprintf(fOut, "   Docs per Date (%d docs):", _EntDocs);
    for (int DateN=0; DateN<DateStrDocsPrV.Len(); DateN++){
      TStr DateStr=DateStrDocsPrV[DateN].Val1;
      int Docs=DateStrDocsPrV[DateN].Val2;
      fprintf(fOut, " [%s:%d]", DateStr.CStr(), Docs);
    }
    fprintf(fOut, "\n");}

    fprintf(fOut, "   [Now: %s]\n", CurTmStr.CStr());
    TIntPrV PrevLinkWgtDstEntIdPrV;
    TStrFltPrV PrevWordStrWgtPrV;
    for (int MnTmN=0; MnTmN<MnTmV.Len(); MnTmN++){
      uint64 MnTm=MnTmV[MnTmN];
      double PastDays=(CurDateTm-MnTm)/double(TTmInfo::GetDayMSecs());
      TStr MnTmStr=TTm::GetTmFromMSecs(MnTm).GetWebLogDateTimeStr();
      // get linked entities
      TIntPrV LinkWgtDstEntIdPrV;
      Ent.GetSorted_LinkWgtDstEntIdPrV(MnTm, 0.9, LinkWgtDstEntIdPrV);
      // output difference between previous and current centroid
      if (MnTmN>0){
        TIntPrV NegDiffLinkWgtDstEntIdPrV; TIntPrV PosDiffLinkWgtDstEntIdPrV;
        GetLinkWgtDstEntIdPrVDiff(LinkWgtDstEntIdPrV, PrevLinkWgtDstEntIdPrV,
         NegDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrV);
        // output positive change
        TChA PosDiffLinkWgtDstEntIdPrVChA;
        GetLinkWgtDstEntIdPrVChA(PosDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrVChA);
        fprintf(fOut, "         Pos-Diff: %s\n", PosDiffLinkWgtDstEntIdPrVChA.CStr());
        // output negative change
        TChA NegDiffLinkWgtDstEntIdPrVChA;
        GetLinkWgtDstEntIdPrVChA(NegDiffLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrVChA);
        fprintf(fOut, "         Neg-Diff: %s\n", NegDiffLinkWgtDstEntIdPrVChA.CStr());
      }
      PrevLinkWgtDstEntIdPrV=LinkWgtDstEntIdPrV;
      // output linked entities
      int TopLinkEnts=LinkWgtDstEntIdPrV.Len();
      TChA LinkWgtDstEntIdPrVChA;
      GetLinkWgtDstEntIdPrVChA(LinkWgtDstEntIdPrV, LinkWgtDstEntIdPrVChA);
      fprintf(fOut, "      Entities (%d ents): %s\n",
       TopLinkEnts, LinkWgtDstEntIdPrVChA.CStr());
      // get text centroid
      int CtrDocs; TStrFltPrV WordStrWgtPrV;
      Ent.GetDocCentroid(this, BowDocBs, BowDocWgtBs, MnTm, 150, 0.9, CtrDocs, WordStrWgtPrV);
      // output difference between previous and current centroid
      if (MnTmN>0){
        TStrFltPrV NegDiffWordStrWgtPrV; TStrFltPrV PosDiffWordStrWgtPrV;
        GetWordStrWgtPrVDiff(WordStrWgtPrV, PrevWordStrWgtPrV,
         NegDiffWordStrWgtPrV, PosDiffWordStrWgtPrV);
        // output positive change
        TChA PosDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(PosDiffWordStrWgtPrV, PosDiffWordStrWgtPrVChA);
        fprintf(fOut, "         Pos-Diff: %s\n", PosDiffWordStrWgtPrVChA.CStr());
        // output negative change
        TChA NegDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(NegDiffWordStrWgtPrV, NegDiffWordStrWgtPrVChA);
        fprintf(fOut, "         Neg-Diff: %s\n", NegDiffWordStrWgtPrVChA.CStr());
      }
      PrevWordStrWgtPrV=WordStrWgtPrV;
      // output centroid
      TChA WordStrWgtPrVChA; GetWordStrWgtPrVChA(WordStrWgtPrV, WordStrWgtPrVChA);
      fprintf(fOut, "      Centroid (%d docs, %d words): %s\n",
       CtrDocs, WordStrWgtPrV.Len(), WordStrWgtPrVChA.CStr());
      // output time
      fprintf(fOut, "   [-%.1f days: %s]\n", PastDays, MnTmStr.CStr());
    }
    // entity clustering
    /*TVec<TStrFltPrV> EntNmWgtPrVV;
    Ent.GetEntClustV(this, MnTmV.Last(), 100, 1000, 10, EntNmWgtPrVV);
    for (int ClustN=0; ClustN<EntNmWgtPrVV.Len(); ClustN++){
      TStrFltPrV& EntNmWgtPrV=EntNmWgtPrVV[ClustN];
      fprintf(fOut, "   Clust-%d:", ClustN);
      for (int EntN=0; EntN<EntNmWgtPrV.Len(); EntN++){
        TStr EntNm=EntNmWgtPrV[EntN].Val1;
        double Wgt=EntNmWgtPrV[EntN].Val2;
        fprintf(fOut, " ['%s':%.3f]", EntNm.CStr(), Wgt);
      }
      fprintf(fOut, "\n");
    }*/
    fprintf(fOut, "\n");
  }
}
Exemplo n.º 20
0
int TStrUtil::CountWords(const TChA& ChA) {
  return CountWords(ChA.CStr());
}