// Builds a bag-of-words document base in which the "words" of each document
// are its named objects, each repeated as many times as it occurs.
//
// MnNmObjFq: minimal value of GetNmObjDocs() a named object must reach to be
//   kept; -1 switches the threshold off.
// Returns the new bag-of-words base. Documents left without any word are not
// added to it.
PBowDocBs TNmObjBs::GetBowDocBs(const int& MnNmObjFq) const {
  printf("Generating Bag-Of-Words...\n");
  PBowDocBs Result=TBowDocBs::New();
  const bool KeepAllP=(MnNmObjFq==-1);
  for (int DocId=0; DocId<GetDocs(); DocId++){
    // progress indicator
    if (DocId%100==0){printf("%d\r", DocId);}
    const TStr DocNm=GetDocNm(DocId);
    const TStr DateStr=GetDocDateStr(DocId);
    // collect one entry per named-object occurrence
    TStrV TermV;
    const int Entries=GetDocNmObjs(DocId);
    for (int EntryN=0; EntryN<Entries; EntryN++){
      int NmObjId; int Fq;
      GetDocNmObjId(DocId, EntryN, NmObjId, Fq);
      if ((!KeepAllP)&&(GetNmObjDocs(NmObjId)<MnNmObjFq)){continue;}
      const TStr NmObjStr=GetNmObjStr(NmObjId);
      for (int Left=Fq; Left>0; Left--){
        TermV.Add(NmObjStr);
      }
    }
    // documents without surviving named objects are skipped
    if (TermV.Empty()){continue;}
    const int BowDocId=Result->AddDoc(DocNm, TStrV(), TermV);
    Result->PutDateStr(BowDocId, DateStr);
  }
  Result->AssertOk();
  printf("\nDone.\n");
  return Result;
}
////////////////////////////////////// // URL-Redirect-Function TSASFunRedirect::TSASFunRedirect(const TStr& FunNm, const TStr& SettingFNm): TSAppSrvFun(FunNm, saotUndef) { printf("Loading redirects %s\n", FunNm.CStr()); TFIn FIn(SettingFNm); TStr LnStr, OrgFunNm; while (FIn.GetNextLn(LnStr)) { TStrV PartV; LnStr.SplitOnAllCh('\t', PartV, false); if (PartV.Empty()) { continue; } if (PartV[0].Empty()) { // parameters EAssert(PartV.Len() >= 3); TStr FldNm = PartV[1]; TStr FldVal = PartV[2]; if (FldVal.StartsWith("$")) { MapH.GetDat(OrgFunNm).FldNmMapH.AddDat(FldVal.Right(1), FldNm); } else { MapH.GetDat(OrgFunNm).FldNmValPrV.Add(TStrKd(FldNm, FldVal)); } } else { // new function EAssert(PartV.Len() >= 2); OrgFunNm = PartV[0]; MapH.AddDat(OrgFunNm).FunNm = PartV[1]; printf(" %s - %s\n", PartV[0].CStr(), PartV[1].CStr()); } } printf("Done\n"); }
void TGgSchRef::GetAuthNmVPubStr( const TStr& AuthNmVPubStr, TStrV& AuthNmV, TStr& PubNm, TStr& PubYearStr){ // split input string into two parts TStr AuthNmVStr; TStr PubStr; AuthNmVPubStr.SplitOnStr(AuthNmVStr, " - ", PubStr); // author-names string AuthNmVStr.SplitOnAllCh(',', AuthNmV, true); for (int AuthN=0; AuthN<AuthNmV.Len(); AuthN++){ AuthNmV[AuthN].ToTrunc(); } if ((!AuthNmV.Empty())&& ((AuthNmV.Last().IsStrIn("..."))||(AuthNmV.Last().Len()<=2))){ AuthNmV.DelLast(); } // publication-name & publication-year string TStr OriginStr; TStr LinkStr; PubStr.SplitOnStr(OriginStr, " - ", LinkStr); OriginStr.SplitOnLastCh(PubNm, ',', PubYearStr); PubNm.ToTrunc(); PubYearStr.ToTrunc(); if ((PubYearStr.Len()>=4)&&(PubYearStr.GetSubStr(0, 3).IsInt())){ PubYearStr=PubYearStr.GetSubStr(0, 3); } else if ((PubNm.Len()>=4)&&(PubNm.GetSubStr(0, 3).IsInt())){ PubYearStr=PubNm.GetSubStr(0, 3); PubNm=""; } else { PubYearStr=""; } }
// Collects the values of all command-line arguments that start with
// PrefixStr.
//
// PrefixStr: prefix identifying the argument (removed from each value).
// DfValV: values returned when no argument carries the prefix.
// DNm: descriptive name used in the printed messages.
// Returns the collected values, or DfValV when none were given. When
// Env.GetArgs()<=MnArgs only the usage line is printed (unless SilentP) and
// an empty vector is returned.
TStrV TEnv::GetIfArgPrefixStrV(
 const TStr& PrefixStr, TStrV& DfValV, const TStr& DNm) const {
  TStrV ArgValV;
  if (Env.GetArgs()<=MnArgs) {
    // 'usage' argument message
    if (!SilentP) {
      printf(" %s%s (default:", PrefixStr.CStr(), DNm.CStr());
      for (int DfValN=0; DfValN<DfValV.Len(); DfValN++) {
        if (DfValN>0) { printf(", "); }
        printf("'%s'", DfValV[DfValN].CStr());
      }
      printf(")\n");
    }
    return ArgValV;
  }
  // gather the value of every argument carrying the prefix
  for (int ArgN=0; ArgN<GetArgs(); ArgN++) {
    // get argument string
    TStr ArgStr=GetArg(ArgN);
    if (ArgStr.StartsWith(PrefixStr)) {
      // extract & add argument value
      ArgStr.DelStr(PrefixStr);
      ArgValV.Add(ArgStr);
    }
  }
  // fall back to the defaults when nothing was given
  if (ArgValV.Empty()) { ArgValV=DfValV; }
  // output argument values
  TChA MsgChA;
  MsgChA+=" "+DNm;
  MsgChA+=" (";
  MsgChA+=PrefixStr;
  MsgChA+=")=";
  for (int ArgValN=0; ArgValN<ArgValV.Len(); ArgValN++) {
    if (ArgValN>0) { MsgChA+=", "; }
    MsgChA+="'";
    MsgChA+=ArgValV[ArgValN];
    MsgChA+="'";
  }
  if (!SilentP) { TNotify::OnStatus(Notify, MsgChA); }
  return ArgValV;
}
// <last_name>_<first name innitial> TStr TStrUtil::GetStdName(TStr AuthorName) { TStr StdName; AuthorName.ToLc(); AuthorName.ChangeChAll('\n', ' '); AuthorName.ChangeChAll('.', ' '); // if there is a number in the name, remove it and everything after it int i, pos = 0; while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) { pos++; } if (pos < AuthorName.Len()) { AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); } if (AuthorName.Empty()) { return TStr::GetNullStr(); } // replace everything after '(' int b = AuthorName.SearchCh('('); if (b != -1) { AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); } // skip if contains ')' if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); } // skip if it is not a name if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1 || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) { return TStr::GetNullStr(); } // remove all non-letters (latex tags, ...) TChA NewName; for (i = 0; i < AuthorName.Len(); i++) { const char Ch = AuthorName[i]; if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; } } StdName = NewName; StdName.ToTrunc(); TStrV AuthNmV; StdName.SplitOnWs(AuthNmV); // too short -- not a name if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast(); if (AuthNmV.Len() < 2) return TStr::GetNullStr(); const TStr LastNm = AuthNmV.Last(); if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr(); IAssert(isalpha(AuthNmV[0][0])); return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]); }
// parse: // 10:16, 16 Sep 2004 // 10:20, 2004 Sep 16 // 2005-07-07 20:30:35 // 23:24:07, 2005-07-10 // 9 July 2005 14:38 // 21:16, July 9, 2005 // 06:02, 10 July 2005 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) { static TStrV MonthV1, MonthV2; if (MonthV1.Empty()) { TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1); TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2); } TChA Tmp(TmStr); Tmp.ToLc(); TVec<char *> WrdV; const char* End = Tmp.CStr()+Tmp.Len(); int Col = -1, Cols=0; for (char *b = Tmp.CStr(); b <End; ) { WrdV.Add(b); while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; } if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++; } *b=0; b++; while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; } } if (Cols == 2) { if (Col+1 >= WrdV.Len()) { return false; } WrdV.Del(Col+1); } if (Col<1) { return false; } const int Hr = atoi(WrdV[Col-1]); const int Min = atoi(WrdV[Col]); WrdV.Del(Col); WrdV.Del(Col-1); if (WrdV.Len() != 3) { return false; } int y=0,m=1,d=2, Mon=-1; if (TCh::IsAlpha(WrdV[0][0])) { y=2; m=0; d=1; } else if (TCh::IsAlpha(WrdV[1][0])) { y=2; m=1; d=0; } else if (TCh::IsAlpha(WrdV[2][0])) { y=0; m=2; d=1; } else { y=0; m=1; d=2; Mon = atoi(WrdV[m]); } int Day = atoi(WrdV[d]); if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; } if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; } if (Mon == 0) { return false; } int Year = atoi(WrdV[y]); if (Day > Year) { ::Swap(Day, Year); } //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min); Tm = TSecTm(Year, Mon, Day, Hr, Min, 0); return true; }
// Collects the values of all command-line arguments that start with
// PrefixStr; the text after the prefix is split on ',' and every piece
// becomes one value.
//
// PrefixStr: prefix identifying the argument.
// DfValV: values returned when no argument supplied any value.
// DNm: descriptive name used in the printed messages.
// Returns the collected values, or DfValV when none were found. When
// Env.GetArgs() <= MnArgs only the usage line is printed (unless SilentP) and
// an empty vector is returned.
TStrV TEnv::GetIfArgPrefixStrV(const TStr& PrefixStr, TStrV& DfValV, const TStr& DNm) const {
  TStrV ValV;

  // usage mode: print the defaults only
  if (Env.GetArgs() <= MnArgs) {
    if (!SilentP) {
      printf(" %s%s (default:", PrefixStr.CStr(), DNm.CStr());
      const int DfVals = DfValV.Len();
      for (int DfValN = 0; DfValN < DfVals; DfValN++) {
        if (DfValN > 0) { printf(", "); }
        printf("%s", DfValV[DfValN].CStr());
      }
      printf(")\n");
    }
    return ValV;
  }

  // scan the arguments for the prefix
  const int PrefixLen = PrefixStr.Len();
  TStrV PartV;
  for (int ArgN = 0; ArgN < GetArgs(); ArgN++) {
    TStr ArgStr = GetArg(ArgN);
    if (!(ArgStr.GetSubStr(0, PrefixLen - 1) == PrefixStr)) { continue; }
    // everything after the prefix is a comma-separated value list
    TStr ValListStr = ArgStr.GetSubStr(PrefixLen, ArgStr.Len());
    ValListStr.SplitOnAllCh(',', PartV);
    for (int PartN = 0; PartN < PartV.Len(); PartN++) {
      ValV.Add(PartV[PartN]);
    }
  }
  if (ValV.Empty()) { ValV = DfValV; }

  // report the values in use
  TChA MsgChA;
  MsgChA += DNm;
  MsgChA += " (";
  MsgChA += PrefixStr;
  MsgChA += ")=";
  const int Vals = ValV.Len();
  for (int ValN = 0; ValN < Vals; ValN++) {
    if (ValN > 0) { MsgChA += ", "; }
    MsgChA += ValV[ValN];
  }
  if (!SilentP) { TNotify::OnStatus(Notify, MsgChA); }
  return ValV;
}
// Returns, for each selected node, the requested quantiles of its samples.
//
// NodeNmV: names of the nodes to report; names not present in NodeNmIdH are
//   skipped. When empty, all nodes of the graph are considered.
// QuantileV: quantiles in [0.0, 1.0]; must not be empty.
// Only nodes that have a timestamp equal to 0 and a non-empty sample are
// reported. The result is a JSON object mapping the node name to an array
// with one value per quantile, each converted with
// TTm::GetUnixMSecsFromWinMSecs.
// Throws (EAssertR) when QuantileV is empty or holds a value outside
// [0.0, 1.0].
PJsonVal TGraphCascade::GetPosterior(const TStrV& NodeNmV, const TFltV& QuantileV) const {
    PJsonVal Result = TJsonVal::NewObj();

    // candidate nodes: either the whole graph or the named ones
    TIntV CandidateIdV;
    if (NodeNmV.Empty()) {
        TIntV FullNodeIdV; Graph.GetNIdV(FullNodeIdV);
        const int GraphNodes = Graph.GetNodes();
        for (int NodeN = 0; NodeN < GraphNodes; NodeN++) {
            CandidateIdV.Add(FullNodeIdV[NodeN]);
        }
    } else {
        const int Names = NodeNmV.Len();
        for (int NameN = 0; NameN < Names; NameN++) {
            const TStr& NodeNm = NodeNmV[NameN];
            if (NodeNmIdH.IsKey(NodeNm)) {
                CandidateIdV.Add(NodeNmIdH.GetDat(NodeNm));
            }
        }
    }

    // keep the zero-timestamp nodes for which samples exist
    TIntV NodeIdV;
    for (int CandN = 0; CandN < CandidateIdV.Len(); CandN++) {
        const int NodeId = CandidateIdV[CandN];
        if (!Timestamps.IsKey(NodeId)) { continue; }
        if (!Sample.IsKey(NodeId)) { continue; }
        if (Sample.GetDat(NodeId).Empty()) { continue; }
        if (!(Timestamps.GetDat(NodeId) == 0)) { continue; }
        NodeIdV.Add(NodeId);
    }

    // validate the quantiles
    EAssertR(QuantileV.Len() > 0, "TGraphCascade::GetPosterior quantiles should not be empty!");
    const int Quantiles = QuantileV.Len();
    for (int QuantileN = 0; QuantileN < Quantiles; QuantileN++) {
        EAssertR((QuantileV[QuantileN] >= 0.0) && (QuantileV[QuantileN] <= 1.0), "TGraphCascade::GetPosterior quantiles should be between 0.0 and 1.0");
    }

    // read the quantiles off the sorted samples of every kept node
    for (int NodeN = 0; NodeN < NodeIdV.Len(); NodeN++) {
        const int NodeId = NodeIdV[NodeN];
        TStr NodeNm = NodeIdNmH.GetDat(NodeId);
        TUInt64V SortedV = Sample.GetDat(NodeId);
        SortedV.Sort(true);
        const int Samples = SortedV.Len();
        PJsonVal QuantilesArr = TJsonVal::NewArr();
        for (int QuantileN = 0; QuantileN < Quantiles; QuantileN++) {
            // quantile 1.0 would index one past the end, hence the clamp
            int Idx = (int)floor(QuantileV[QuantileN] * Samples);
            Idx = MIN(Idx, Samples - 1);
            const uint64 UnixMSecs = TTm::GetUnixMSecsFromWinMSecs(SortedV[Idx]);
            QuantilesArr->AddToArr((double)UnixMSecs);
        }
        Result->AddToObj(NodeNm, QuantilesArr);
    }
    return Result;
}
void TNmObjBs::LoadCustSwSet(const PSIn& SIn){ if (SIn.Empty()){return;} TILx Lx(SIn, TFSet(iloCmtAlw, iloRetEoln, iloExcept)); // traverse lines Lx.GetSym(syLn, syEof); while (Lx.Sym!=syEof){ // get stop-phrase string TStr WordStrVStr=Lx.Str; // split phrase to words TStrV WordStrV; WordStrVStr.SplitOnWs(WordStrV); if (!WordStrV.Empty()){ // define phrase as stop-word WordStrVToNmObjAttrSetH.AddDat(WordStrV).Incl(noaIgnore); } // get next symbol Lx.GetSym(syLn, syEof); } }
void TNmObjBs::ExtrCandWordStrV( const TStr& HtmlStr, TStrV& CandWordStrV, const bool& DumpP){ // prepare named-object vector CandWordStrV.Clr(); // prepare html parsing PSIn HtmlSIn=TStrIn::New(HtmlStr); PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn, hdtAll, false); PHtmlTok Tok; THtmlLxSym Sym; TStr Str; TStr NrStr; CandWordStrV.Add(PeriodTagStr); bool InTitle=false; bool InScript=false; int LastNmObjTokN=-1; // traverse html tokens if (DumpP){printf("Tokens: ");} for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){ PHtmlTok Tok=HtmlDoc->GetTok(TokN); HtmlDoc->GetTok(TokN, Sym, Str); switch (Sym){ case hsyUndef: break; case hsyStr: case hsyNum: if (InTitle){break;} if (InScript){break;} NrStr=GetNrWordStr(Str); if (DumpP){ if (Str==NrStr){printf("%s ", Str.CStr());} else {printf("%s(%s) ", Str.CStr(), NrStr.CStr());} } if (IsFirstCapWordStr(NrStr)||IsNmObjAttr(NrStr, noaAsCapitalized)){ if ((LastNmObjTokN!=-1)&&(LastNmObjTokN<TokN-1)){ if (CandWordStrV.Last()!=PeriodTagStr){ CandWordStrV.Add(BreakTagStr); } } CandWordStrV.Add(NrStr); LastNmObjTokN=TokN; } break; case hsySSym: if (InTitle){break;} if (InScript){break;} if (DumpP){ printf("%s ", Str.CStr());} if ( (Str==".")||(Str=="!")||(Str=="?")|| (Str=="\"")||(Str=="-")||(Str=="/")|| (Str==":")||(Str==";")){ if (CandWordStrV.Last()!=PeriodTagStr){ CandWordStrV.Add(PeriodTagStr); } } break; case hsyBTag: case hsyETag: if (Str=="<TITLE>"){ InTitle=(Sym==hsyBTag); } else if (Str=="<SCRIPT>"){ InScript=(Sym==hsyBTag); } else if (Str=="<P>"){ if ((!CandWordStrV.Empty())&&(CandWordStrV.Last()!=ParagraphTagStr)){ CandWordStrV.Add(ParagraphTagStr); CandWordStrV.Add(PeriodTagStr); } } else if ((Str=="<TD>")||(Str=="<BR>")){ CandWordStrV.Add(PeriodTagStr); } break; case hsyEof: break; default: break; } } CandWordStrV.Add(EofTagStr); if (DumpP){printf("\n");} if (DumpP){ printf("Candidates: "); for (int CandWordStrN=0; CandWordStrN<CandWordStrV.Len(); CandWordStrN++){ printf("%s ", CandWordStrV[CandWordStrN].CStr());} 
printf("\n"); } }
///////////////////////////////////////////////// // NIST-score double TEvalScoreNist::Eval(const PTransCorpus& TransCorpus, const TIntV& _SentIdV) { // check if the corpus has translations IAssert(TransCorpus->IsTrans()); // ngram counts (cliped and full) TIntH ClipCountNGramH, CountNGramH; // ngram info score TIntFltH NGramInfoH; // candidate and effective reference length double FullTransLen = 0.0, FullRefLen = 0.0; // iterate over sentences TIntV SentIdV = _SentIdV; if (SentIdV.Empty()) { TransCorpus->GetSentIdV(SentIdV); } const int Sents = SentIdV.Len(); for (int SentIdN = 0; SentIdN < Sents; SentIdN++) { const int SentId = SentIdV[SentIdN]; // tokenize translation TIntV TransWIdV; Parse(TransCorpus->GetTransStr(SentId), TransWIdV); TIntH TransNGramH; GetNGramH(TransWIdV, MxNGramLen, TransNGramH); TIntH FreeTransNGramH = TransNGramH; // number of non-matched ngrams // counters for getting the closest length of reference sentences const int TransLen = TransWIdV.Len(); int RefLenSum = 0; // go over reference translations and count ngram matches TStrV RefTransStrV = TransCorpus->GetRefTransStrV(SentId); // we assume that there is at least one reference translation IAssert(!RefTransStrV.Empty()); for (int RefN = 0; RefN < RefTransStrV.Len(); RefN++) { // parse reference translation sentence TIntV RefWIdV; Parse(RefTransStrV[RefN], RefWIdV); TIntH RefNGramH; GetNGramH(RefWIdV, MxNGramLen, RefNGramH); // check for matches int TransNGramKeyId = TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { const int NGramId = TransNGramH.GetKey(TransNGramKeyId); const int FreeTransNGrams = FreeTransNGramH(NGramId); if (RefNGramH.IsKey(NGramId) && (FreeTransNGrams>0)) { // ngram match and still some free ngrams left to clip const int RefNGrams = RefNGramH(NGramId); FreeTransNGramH(NGramId) = TInt::GetMx(0, FreeTransNGrams - RefNGrams); } } // check the length difference const int RefLen = RefWIdV.Len(); RefLenSum += RefLen; } // count ngrams int 
TransNGramKeyId = TransNGramH.FFirstKeyId(); while(TransNGramH.FNextKeyId(TransNGramKeyId)) { // get ngram const int NGramId = TransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId != -1); // check if two hash tables are aligned (should be...) const int FreeNGramId = FreeTransNGramH.GetKey(TransNGramKeyId); IAssert(NGramId == FreeNGramId); // get ngram count and clip-count const int Count = TransNGramH[TransNGramKeyId]; const int ClipCount = Count - FreeTransNGramH[TransNGramKeyId]; // add ngram to the coprus ngram counts CountNGramH.AddDat(NGramId) += Count; ClipCountNGramH.AddDat(NGramId) += ClipCount; } // count length FullTransLen += double(TransLen); FullRefLen += double(RefLenSum) / double(RefTransStrV.Len()); } // calculate ngram info scores int CountKeyId = CountNGramH.FFirstKeyId(); while (CountNGramH.FNextKeyId(CountKeyId)) { // get the n-gram const int NGramId = CountNGramH.GetKey(CountKeyId); TIntV NGram = GetNGram(NGramId); // prepare counts if (NGram.Len() == 1) { // n-gram is a word const int WordCount = CountNGramH[CountKeyId]; const double NGramInfoScore = TMath::Log2(FullTransLen / double(WordCount)); NGramInfoH.AddDat(NGramId, NGramInfoScore); } else { // more then one word in the n-gram // get a n-gram with removed last element TIntV N1Gram = NGram; N1Gram.DelLast(); const int N1GramId = NGramH.GetKeyId(N1Gram); // get the counts const int NGramCount = CountNGramH(NGramId); const int N1GramCount = CountNGramH(N1GramId); // get the score const double NGramInfoScore = TMath::Log2(double(N1GramCount) / double(NGramCount)); NGramInfoH.AddDat(NGramId, NGramInfoScore); } } // calcualte ngram precisions TFltV ClipCountV(MxNGramLen); ClipCountV.PutAll(0); int ClipCountKeyId = ClipCountNGramH.FFirstKeyId(); while (ClipCountNGramH.FNextKeyId(ClipCountKeyId)) { const int NGramId = ClipCountNGramH.GetKey(ClipCountKeyId); const int NGramLen = GetNGramLen(NGramId); const double NGramInfo = NGramInfoH(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); 
const int ClipCountNGram = ClipCountNGramH[ClipCountKeyId]; ClipCountV[NGramLen-1] += double(ClipCountNGram) * NGramInfo; } TIntV CountV(MxNGramLen); CountV.PutAll(0); CountKeyId = CountNGramH.FFirstKeyId(); while (CountNGramH.FNextKeyId(CountKeyId)) { const int NGramId = CountNGramH.GetKey(CountKeyId); const int NGramLen = GetNGramLen(NGramId); IAssert(0 < NGramLen && NGramLen <= MxNGramLen); CountV[NGramLen-1] += CountNGramH[CountKeyId]; } TFltV PrecV(MxNGramLen, 0); for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { const double ClipCount = ClipCountV[NGramLen]; const int Count = CountV[NGramLen]; const double Prec = (Count > 0) ? ClipCount / double(Count) : 0.0; PrecV.Add(Prec); } // calcualte brevity penalty const double LenFrac = double(FullTransLen)/double(FullRefLen); double BP = 0.0; if (LenFrac >= 1.0) { BP = 1.0; } else if (LenFrac <= 0.0) { BP = 0.0; } else { // calculate beta const double LenFracX = 1.5, BPX = 0.5; const double Beta = log(BPX) / TMath::Sqr(log(LenFracX)); // calculate BP score BP = exp(Beta * TMath::Sqr(log(LenFrac))); } // calculate full NIST score double NistScore = 0.0; for (int NGramLen = 0; NGramLen < MxNGramLen; NGramLen++) { NistScore += PrecV[NGramLen]; } NistScore *= BP; printf("NIST Score: %.5f\n", NistScore); // done! return NistScore; }