// Returns the 0-based index of the Count-th occurrence of Ch in Url.
// The scan walks the NUL-terminated buffer returned by Url.CStr().
//  - Count == 0 yields -1 (nothing is scanned).
//  - If Ch occurs fewer than Count times, the scan runs to the terminating NUL
//    and the index of the last character of the string is returned.
int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
  const char* const Begin = Url.CStr();
  const char* Cur = Begin;
  // Cur always ends up one past the last character that was examined
  for (int Seen = 0; *Cur != 0 && Seen != Count; Cur++) {
    if (*Cur == Ch) { Seen++; }
  }
  return int(Cur - Begin - 1);
}
// Splits ChA into words IN PLACE: every separator character is overwritten with
// NUL and WrdV receives pointers into ChA's own buffer (so the pointers are only
// valid while ChA is alive and unmodified).
//  - SplitOnWs == true : the only separator is the space character ' '.
//  - SplitOnWs == false: every non-alphanumeric character is a separator.
// Empty words are never returned: runs of separators collapse, and a trailing
// separator (or an empty / all-separator input) does not produce a final empty
// word. Returns the number of words stored in WrdV.
int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    const bool IsSep = SplitOnWs ? (*c == ' ') : (! TCh::IsAlNum(*c));
    if (! IsSep) { continue; }
    *c = 0; // terminate the current word
    // the word that just ended was empty (consecutive separators): drop it
    if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
    WrdV.Add(c+1);
  }
  // the last candidate is empty when the input ends with a separator (or is empty)
  if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
  return WrdV.Len();
}
// Splits ChA on the character Ch IN PLACE: each occurrence of Ch is replaced by
// NUL and WrdV is filled with pointers into ChA's buffer.
// When SkipEmpty is set, empty fields (leading, consecutive or trailing
// separators) are removed from the result; otherwise they are kept.
// Returns the number of fields stored in WrdV.
int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  char* Cur = (char *) ChA.CStr();
  while (*Cur != 0) {
    if (*Cur == Ch) {
      *Cur = 0; // close the field that ends here
      if (SkipEmpty && ! WrdV.Empty() && WrdV.Last()[0] == 0) {
        WrdV.DelLast(); // the field that just ended had no characters
      }
      WrdV.Add(Cur + 1);
    }
    Cur++;
  }
  // the final field is empty when the input ends with Ch (or is empty)
  if (SkipEmpty && ! WrdV.Empty() && WrdV.Last()[0] == 0) {
    WrdV.DelLast();
  }
  return WrdV.Len();
}
// space seprated sequence of words (includes all non-blank characters, i.e., punctuations) TChA TStrUtil::GetCleanStr(const TChA& ChA) { char *b = (char *) ChA.CStr(); while (*b && ! TCh::IsAlNum(*b)) { b++; } if (*b == 0) { return TChA(); } TChA OutChA(ChA.Len()); char *e = b; bool ws=false; while (*e) { while (*e && TCh::IsWs(*e)) { e++; ws=true; } if (! *e) { break; } if (ws) { OutChA.AddCh(' '); ws=false; } OutChA.AddCh(*e); e++; } //OutChA.ToLc(); return OutChA; }
// space separated sequence of words, remove all punctuations, etc. TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) { char *b = (char *) ChA.CStr(); while (*b && ! TCh::IsAlNum(*b)) { b++; } if (*b == 0) { return TChA(); } TChA OutChA(ChA.Len()); char *e = b, tmp; while (*e) { b = e; while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; } if (b < e) { tmp = *e; *e=0; OutChA += b; OutChA.AddCh(' '); *e = tmp; } while (*e && ! TCh::IsAlNum(*e)) { e++; } if (! *e) { break; } } OutChA.DelLastCh(); OutChA.ToLc(); return OutChA; }
const char* TSsParserMP::DumpStr() const { static TChA ChA(10*1024); ChA.Clr(); for (int i = 0; i < FldV.Len(); i++) { ChA += TStr::Fmt(" %d: '%s'\n", i, FldV[i]); } return ChA.CStr(); }
// JavaScript binding: reads the next line from the wrapped input stream and
// returns it to the caller as a V8 string.
// The value returned by GetNextLnBf is not inspected, so whatever it left in
// the line buffer is what gets returned.
void TNodeJsFIn::readLine(const v8::FunctionCallbackInfo<v8::Value>& Args) {
  v8::Isolate* Iso = v8::Isolate::GetCurrent();
  v8::HandleScope Scope(Iso);

  // recover the native object behind the JavaScript `this`
  TNodeJsFIn* Self = ObjectWrap::Unwrap<TNodeJsFIn>(Args.This());

  TChA LineBuf;
  Self->SIn->GetNextLnBf(LineBuf);

  Args.GetReturnValue().Set(v8::String::NewFromUtf8(Iso, LineBuf.CStr()));
}
// Stems the upper-cased copy of s after stripping a possessive ending:
// a trailing "'S" is cut off entirely, otherwise a single trailing apostrophe
// is removed. The truncated buffer is then handed to StemInPlace.
TStr TPorterStemmer::StemX(const TStr& s){
  TChA UcChA = s;
  UcChA.ToUc();
  const int Chs = UcChA.Len();
  char* const Bf = UcChA.CStr();
  const bool EndsWithAposS = (Chs > 1) && (Bf[Chs - 2] == '\'') && (Bf[Chs - 1] == 'S');
  const bool EndsWithApos = (Chs > 0) && (Bf[Chs - 1] == '\'');
  if (EndsWithAposS) {
    Bf[Chs - 2] = '\0'; // drop "'S"
  } else if (EndsWithApos) {
    Bf[Chs - 1] = '\0'; // drop the lone apostrophe
  }
  return StemInPlace(Bf);
}
// Returns true when the fraction of alphanumeric characters among all
// non-whitespace characters of Str is strictly greater than MinAlFrac.
// Characters with a non-positive char value (e.g. bytes >= 0x80 where char is
// signed) are counted as characters but never as alphanumeric.
// A string with no non-whitespace characters returns false.
bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
  int AlNumCnt=0, ChCnt=0;
  for (const char *c = Str.CStr(); *c; c++) {
    if (TCh::IsWs(*c)) { continue; }
    if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
    ChCnt++;
  }
  // avoid 0.0/0.0: previously this case relied on NaN comparing false
  if (ChCnt == 0) { return false; }
  return double(AlNumCnt)/double(ChCnt) > MinAlFrac;
}
// Splits ChA into lines IN PLACE: every '\n' (and a '\r' directly preceding
// it) is overwritten with NUL and LineV receives pointers into ChA's buffer.
//  - SkipEmpty == false: every line is returned, including empty ones and the
//    empty line that follows a trailing '\n'.
//  - SkipEmpty == true : lines with no characters are omitted, wherever they
//    occur (first, middle or last). A line is judged by its own content, not
//    by the content of the line before it.
// Returns the number of lines stored in LineV.
int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
  LineV.Clr(false);
  char* const Begin = ChA.CStr();
  LineV.Add(Begin);
  for (char *c = Begin; *c; c++) {
    if (*c != '\n') { continue; }
    if (c > Begin && *(c-1) == '\r') { *(c-1) = 0; } // "\r\n" line ending
    *c = 0;
    // the line that just ended is now NUL-terminated, so its emptiness is known
    if (SkipEmpty && ! LineV.Empty() && *LineV.Last() == 0) { LineV.DelLast(); }
    LineV.Add(c+1);
  }
  // the last candidate is empty when the input ends with '\n' (or is empty)
  if (SkipEmpty && ! LineV.Empty() && *LineV.Last() == 0) { LineV.DelLast(); }
  return LineV.Len();
}
void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) { TextStr.Clr(); char *StrB, *StrE; // use full page html: skip till <body> //PageHtmlStr = "<script fdsfs> fsdfsd </script> jure"; /*if (UseFullHtml) { StrB = PageHtmlStr.CStr(); StrE = StrB+PageHtmlStr.Len(); char * NewB = strstr(StrB, "<body>"); if (NewB != NULL) { StrB = NewB+6; } char * NewE = strstr(StrB, "body>"); if (NewE != NULL) { while (true) { char *E=strstr(NewE+4, "body>"); if (E == NULL) { break; } NewE = E; } StrE = NewE; } } else { // only extracted post html*/ StrB = (char *) HtmlStr.CStr(); StrE = (char *) StrB+HtmlStr.Len(); //} for (char *e = StrB; e < StrE; ) { char* b = e; while (e<StrE && *e != '<') { e++; } // copy text char tmp=*e; *e = 0; TextStr+= b; TextStr.AddCh(' '); *e = tmp; if (e >= StrE) { return; } // if start of a comment: skip if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment e += 3; while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; } e++; continue; } // if "<script" then skip if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') { e += 5; while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; } e++; continue; } // skip to end of tag while (e < StrE && *e != '>') { e++; } if (e>=StrE) { return; } e++; } }
// Splits ChA into sentences IN PLACE and returns the number of sentences.
// SentenceV is cleared and then filled with pointers into ChA's own buffer;
// sentence ends are marked by writing NUL characters into that buffer.
//  - Leading whitespace is skipped; if nothing is left, 0 is returned.
//  - A sentence ends at '.', '!' or '?' when the following character is not
//    alphanumeric.
//  - If the terminator is directly followed by a double quote, the terminator
//    is overwritten with the quote and the cut is made one character later
//    (blah." --> blah").
//  - Going backwards from the cut, non-alphanumeric characters other than a
//    double quote are overwritten with NUL (trailing punctuation is removed).
//  - The next sentence starts at the next alphanumeric character, or at a
//    double quote that is immediately followed by an alphanumeric character.
// NOTE(review): *(c+1) is read at the last character too; this presumably hits
// the terminating NUL of the buffer -- confirm that CStr() guarantees it.
int TStrUtil::SplitSentences(TChA& ChA, TVec<char *>& SentenceV) { SentenceV.Clr(); const char *B = ChA.CStr(); const char *E = B+ChA.Len(); char *c = (char *) B; while (*c && TCh::IsWs(*c)) { c++; } if (*c) { SentenceV.Add(c); } else { return 0; } for (; c < E; c++) { if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence if (c<E && *(c+1)=='"') { *c='"'; c++; } // blah." --> blah" if (c>=E) { continue; } *c=0; c++; char *e = c-1; while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum if (c<E) { SentenceV.Add(c); } } } return SentenceV.Len(); }
// get <TagNm>*</TagNm> (can be many tags inbetween bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) { if (XmlLx.GetSym() != xsySTag) { return false; } TagVal.Clr(); TagNm = XmlLx.TagNm; //const TXmlLxSym NextSym = XmlLx.GetSym(); while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) { if (TakeTagNms) { TagVal += XmlLx.TxtChA; } else if (XmlLx.Sym == xsyStr) { TagVal += XmlLx.TxtChA; } XmlLx.GetSym(); } return true; //if (NextSym == xsyStr) { // EAssertR(XmlLx.GetSym() == xsyETag, TagNm); //} else { // EAssertR(NextSym == xsyETag, TagNm); // empty tag //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr()); //} }
// Parse strings of the form 2006-08-28 14:11:16 or 14:11:16 08/28/2008 // Non-numeric characters act as separators (there can be many consecutive separating characters) // Variables give indexes of the date fields TSecTm TSecTm::GetDtTmFromStr(const TChA& YmdHmsPmStr, const int& YearId, const int& MonId, const int& DayId, const int& HourId, const int& MinId, const int& SecId) { TChA Tmp = YmdHmsPmStr; TVec<char *> FldV; // get the sequences of numbers for (char *c = (char *) Tmp.CStr(); *c; c++) { if (TCh::IsNum(*c)) { FldV.Add(c); while (TCh::IsNum(*c)) { c++; } c--; } else { *c = 0; } } const int Y = atoi(FldV[YearId]); const int M = atoi(FldV[MonId]); const int D = atoi(FldV[DayId]); const int H = atoi(FldV[HourId]); const int m = atoi(FldV[MinId]); const int S = atoi(FldV[SecId]); IAssert(Y>0 && M>0 && D>0 && M<13 && D<32); IAssert(H>=0 && H<24 && m>=0 && m<60 && S>=0 && S<60); return TSecTm(Y,M,D,H,m,S); }
/////////////////////////////// // Tokenizer-Utils void TTokenizerUtil::Sentencize(const PSIn& SIn, TStrV& Sentences, const bool& SplitNewLineP) { TChA SentenceBuf; int c; while (!SIn->Eof()) { c = SIn->GetCh(); switch (c) { case '\r': case '\n': { if (!SplitNewLineP) { SentenceBuf += ' '; break; } } case '"' : case '.' : case '!' : case ':' : case ';' : case '?' : case '\t': { if (SentenceBuf.Len() > 2) { Sentences.Add(SentenceBuf); printf("%s\n", SentenceBuf.CStr()); SentenceBuf.Clr(); } break; } default: SentenceBuf += c; break; } } if (SentenceBuf.Len() > 0) { Sentences.Add(SentenceBuf); } }
// Creates a memory input stream holding a private copy of the characters of
// ChA. Exactly ChA.Len() bytes are copied; the copy is not NUL-terminated.
// memcpy is used rather than strncpy so that the stream contains the bytes of
// ChA unchanged even if one of them is NUL (strncpy would stop there and
// zero-fill the rest).
TMIn::TMIn(const TChA& ChA):
  TSBase("Input-Memory"), TSIn("Input-Memory"), Bf(NULL), BfC(0), BfL(0){
  BfL=ChA.Len();
  Bf=new char[BfL];
  memcpy(Bf, ChA.CStr(), BfL);
}
// Writes the characters of ChA to the stream.
// The line-length bookkeeping is updated first (UpdateLnLen) and the buffer is
// written afterwards (PutBf); the sum of both return values is returned.
int TSOut::PutStr(const TChA& ChA){
  const int Chs = ChA.Len();
  const int Cs = UpdateLnLen(Chs);
  return Cs + PutBf(ChA.CStr(), Chs);
}
// Returns the result of StemInPlace applied to an upper-cased copy of s;
// s itself is not modified.
TStr TPorterStemmer::Stem(const TStr& s) {
  TChA UcChA = s;
  UcChA.ToUc();
  char* const Bf = UcChA.CStr();
  return StemInPlace(Bf);
}
// Writes a plain-text report about all entities to the file FNm.
// CurTm is converted with TTm::GetTmFromMSecs; CurDateTm is derived from the
// date string of CurTm (presumably the start of that day -- confirm against
// GetWebLogDateStr). Seven lower time limits are used: CurDateTm minus
// 0, 1, 2, 4, 8, 16 and 32 days.
// Entities are visited in the order returned by GetSorted_DocsEntIdPrV.
// For each entity the report contains:
//  - a header line with the entity name, its document count and the number of
//    linked entities;
//  - the number of documents per date;
//  - the current time ("[Now: ...]");
//  - for every time limit: the linked entities and the text centroid obtained
//    for that limit, and (from the second limit on) the positive and negative
//    differences to the result of the previous limit, followed by a line with
//    the age of the limit in days.
// The output goes through the FILE* of a TFOut opened on FNm.
void TSkyGridBs::SaveTxt(const TStr& FNm, const uint64& CurTm){ // time-limit TStr CurTmStr=TTm::GetTmFromMSecs(CurTm).GetWebLogDateTimeStr(); uint64 CurDateTm=TTm::GetMSecsFromTm(TTm::GetTmFromWebLogDateTimeStr(TTm::GetTmFromMSecs(CurTm).GetWebLogDateStr())); TStr CurDateTmStr=TTm::GetTmFromMSecs(CurDateTm).GetWebLogDateTimeStr(); TUInt64V MnTmV; MnTmV.Add(CurDateTm-0*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-1*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-2*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-4*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-8*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-16*TTmInfo::GetDayMSecs()); MnTmV.Add(CurDateTm-32*TTmInfo::GetDayMSecs()); // get bow //PBowDocBs BowDocBs=GetBowDocBs(3, 5); PBowDocBs BowDocBs=GetBowDocBs(); PBowDocWgtBs BowDocWgtBs=GetBowDocWgtBs(BowDocBs); // open file TFOut FOut(FNm); FILE* fOut=FOut.GetFileId(); // get docs-entities sorted vector TIntPrV DocsEntIdPrV; GetSorted_DocsEntIdPrV(DocsEntIdPrV); // traverse entities for (int EntN=0; EntN<DocsEntIdPrV.Len(); EntN++){ int EntId=DocsEntIdPrV[EntN].Val2; TStr EntNm=GetEntNm(EntId); int EntDocs=DocsEntIdPrV[EntN].Val1; TSkyGridEnt& Ent=GetEnt(EntId); int LinkEnts=Ent.GetLinkEnts(); fprintf(fOut, "'%s' [%d docs] [%d ents]\n", EntNm.CStr(), EntDocs, LinkEnts); // output docs over dates {TStrIntPrV DateStrDocsPrV; int _EntDocs; Ent.GetDocsPerDateV(this, DateStrDocsPrV, _EntDocs); fprintf(fOut, " Docs per Date (%d docs):", _EntDocs); for (int DateN=0; DateN<DateStrDocsPrV.Len(); DateN++){ TStr DateStr=DateStrDocsPrV[DateN].Val1; int Docs=DateStrDocsPrV[DateN].Val2; fprintf(fOut, " [%s:%d]", DateStr.CStr(), Docs); } fprintf(fOut, "\n");} fprintf(fOut, " [Now: %s]\n", CurTmStr.CStr()); TIntPrV PrevLinkWgtDstEntIdPrV; TStrFltPrV PrevWordStrWgtPrV; for (int MnTmN=0; MnTmN<MnTmV.Len(); MnTmN++){ uint64 MnTm=MnTmV[MnTmN]; double PastDays=(CurDateTm-MnTm)/double(TTmInfo::GetDayMSecs()); TStr MnTmStr=TTm::GetTmFromMSecs(MnTm).GetWebLogDateTimeStr(); // get linked 
entities TIntPrV LinkWgtDstEntIdPrV; Ent.GetSorted_LinkWgtDstEntIdPrV(MnTm, 0.9, LinkWgtDstEntIdPrV); // output difference between previous and current centroid if (MnTmN>0){ TIntPrV NegDiffLinkWgtDstEntIdPrV; TIntPrV PosDiffLinkWgtDstEntIdPrV; GetLinkWgtDstEntIdPrVDiff(LinkWgtDstEntIdPrV, PrevLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrV); // output positive change TChA PosDiffLinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(PosDiffLinkWgtDstEntIdPrV, PosDiffLinkWgtDstEntIdPrVChA); fprintf(fOut, " Pos-Diff: %s\n", PosDiffLinkWgtDstEntIdPrVChA.CStr()); // output negative change TChA NegDiffLinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(NegDiffLinkWgtDstEntIdPrV, NegDiffLinkWgtDstEntIdPrVChA); fprintf(fOut, " Neg-Diff: %s\n", NegDiffLinkWgtDstEntIdPrVChA.CStr()); } PrevLinkWgtDstEntIdPrV=LinkWgtDstEntIdPrV; // output linked entities int TopLinkEnts=LinkWgtDstEntIdPrV.Len(); TChA LinkWgtDstEntIdPrVChA; GetLinkWgtDstEntIdPrVChA(LinkWgtDstEntIdPrV, LinkWgtDstEntIdPrVChA); fprintf(fOut, " Entities (%d ents): %s\n", TopLinkEnts, LinkWgtDstEntIdPrVChA.CStr()); // get text centroid int CtrDocs; TStrFltPrV WordStrWgtPrV; Ent.GetDocCentroid(this, BowDocBs, BowDocWgtBs, MnTm, 150, 0.9, CtrDocs, WordStrWgtPrV); // output difference between previous and current centroid if (MnTmN>0){ TStrFltPrV NegDiffWordStrWgtPrV; TStrFltPrV PosDiffWordStrWgtPrV; GetWordStrWgtPrVDiff(WordStrWgtPrV, PrevWordStrWgtPrV, NegDiffWordStrWgtPrV, PosDiffWordStrWgtPrV); // output positive change TChA PosDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(PosDiffWordStrWgtPrV, PosDiffWordStrWgtPrVChA); fprintf(fOut, " Pos-Diff: %s\n", PosDiffWordStrWgtPrVChA.CStr()); // output negative change TChA NegDiffWordStrWgtPrVChA; GetWordStrWgtPrVChA(NegDiffWordStrWgtPrV, NegDiffWordStrWgtPrVChA); fprintf(fOut, " Neg-Diff: %s\n", NegDiffWordStrWgtPrVChA.CStr()); } PrevWordStrWgtPrV=WordStrWgtPrV; // output centroid TChA WordStrWgtPrVChA; GetWordStrWgtPrVChA(WordStrWgtPrV, WordStrWgtPrVChA); 
fprintf(fOut, " Centroid (%d docs, %d words): %s\n", CtrDocs, WordStrWgtPrV.Len(), WordStrWgtPrVChA.CStr()); // output time fprintf(fOut, " [-%.1f days: %s]\n", PastDays, MnTmStr.CStr()); } // entity clustering /*TVec<TStrFltPrV> EntNmWgtPrVV; Ent.GetEntClustV(this, MnTmV.Last(), 100, 1000, 10, EntNmWgtPrVV); for (int ClustN=0; ClustN<EntNmWgtPrVV.Len(); ClustN++){ TStrFltPrV& EntNmWgtPrV=EntNmWgtPrVV[ClustN]; fprintf(fOut, " Clust-%d:", ClustN); for (int EntN=0; EntN<EntNmWgtPrV.Len(); EntN++){ TStr EntNm=EntNmWgtPrV[EntN].Val1; double Wgt=EntNmWgtPrV[EntN].Val2; fprintf(fOut, " ['%s':%.3f]", EntNm.CStr(), Wgt); } fprintf(fOut, "\n"); }*/ fprintf(fOut, "\n"); } }
int TStrUtil::CountWords(const TChA& ChA) { return CountWords(ChA.CStr()); }