int TGnuPlot::AddPlot(const TIntFltKdV& XYValV, const TGpSeriesTy& SeriesTy, const TStr& Label, const TStr& Style) { TFltKdV XYFltValV(XYValV.Len(), 0); for (int i = 0; i < XYValV.Len(); i++) { XYFltValV.Add(TFltKd(TFlt(XYValV[i].Key), TFlt(XYValV[i].Dat))); } return AddPlot(XYFltValV, SeriesTy, Label, Style); }
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const { // make sure we either do not have explicit values, or their dimension matches with string keys EAssertR(FltV.Empty() || (StrV.Len() == FltV.Len()), "TMultinomial::AddFtr:: String and double values not aligned"); // generate internal feature vector SpV.Gen(StrV.Len(), 0); for (int StrN = 0; StrN < StrV.Len(); StrN++) { const int FtrId = FtrGen.GetFtr(StrV[StrN]); // only use features we've seen during updates if (FtrId != -1) { const double Val = FltV.Empty() ? 1.0 : FltV[StrN].Val; if (Val > 1e-16) { SpV.Add(TIntFltKd(FtrId, Val)); } } } SpV.Sort(); // merge elements with the same id int GoodSpN = 0; for (int SpN = 1; SpN < SpV.Len(); SpN++) { if (SpV[GoodSpN].Key == SpV[SpN].Key) { // repetition of previous id, sum counts SpV[GoodSpN].Dat += SpV[SpN].Dat; } else { // increase the pointer to the next good position GoodSpN++; // and move the new value down to the good position SpV[GoodSpN] = SpV[SpN]; } } // truncate the vector SpV.Trunc(GoodSpN + 1); // replace values with 1 if needed if (IsBinary()) { for (TIntFltKd& Sp : SpV) { Sp.Dat = 1.0; } } // final normalization, if needed if (IsNormalize()) { TLinAlg::Normalize(SpV); } }
void TStrFeatureSpace::ToStr(const TIntFltKdV& FeatureIds, TChA& ChA, int k, char Sep) const { TIntSet TakenIndexes(k); int Len = TMath::Mn(FeatureIds.Len(), k); for (int i = 0; i < Len; i++) { double MxVal = TFlt::Mn; int MxIndex = 0; for (int j = 0; j < FeatureIds.Len(); j++) { const TIntFltKd& Feature = FeatureIds[j]; if (Feature.Dat > MxVal) { if (!TakenIndexes.IsKey(Feature.Key)) { MxVal = Feature.Dat; MxIndex = Feature.Key; } } } TakenIndexes.AddKey(MxIndex); ChA += ISpace.KeyFromOfs(Space[MxIndex]); ChA += ':'; ChA += TFlt::GetStr(MxVal, "%2.6f"); if (i < Len) { ChA += Sep; } } }
// Appends the transformed entries of InSpV to SpV.
// Each input entry (Id, Val) becomes (Offset + Id, FtrGen.GetFtr(Val)).
// Offset is then advanced by GetVals().
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const {
  for (int EltN = 0; EltN < InSpV.Len(); EltN++) {
    const int InId = InSpV[EltN].Key;
    const double OutVal = FtrGen.GetFtr(InSpV[EltN].Dat);
    const int OutId = Offset + InId;
    SpV.Add(TIntFltKd(OutId, OutVal));
  }
  Offset += GetVals();
}
// Rescales the Dat values of PdfV in place so that they sum to 1.
// When the sum of the values is zero or negative the vector is left untouched.
void TGUtil::Normalize(TIntFltKdV& PdfV) {
  const int Vals = PdfV.Len();
  // total mass of the distribution
  double Total = 0.0;
  for (int ValN = 0; ValN < Vals; ValN++) {
    Total += PdfV[ValN].Dat;
  }
  // nothing to scale by
  if (Total <= 0.0) {
    return;
  }
  for (int ValN = 0; ValN < Vals; ValN++) {
    PdfV[ValN].Dat /= Total;
  }
}
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TFltV& FullV, int& Offset) const { // create sparse vector TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; FullV[Offset + ValSp.Key] = ValSp.Dat; } // increase the offset by the dimension Offset += GetDim(); }
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TFltV& FullV, int& Offset) const { // generate feature TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; FullV[Offset + ValSp.Key] = ValSp.Dat; } // increase the offset by the dimension Offset += GetDim(); }
// Explains Vec in terms of its nearest stored column.
// Returns a JSON null when the model is not initialized or when no column
// could be selected. Otherwise returns an object with:
//   "nearestDat" - DatV entry of the nearest column
//   "distance"   - Norm2(Vec) - 2*DotProduct(Vec, Col) + Norm2(Col) of that column
//   "features"   - array of {id, val, nearVal, contribution} for every feature
//                  id whose contribution (squared difference divided by the
//                  distance) exceeds 1e-8
//   "oldestDat" / "newestDat" - DatV entries at NextCol and at the slot just
//                  before it (wrapping to WindowSize - 1)
PJsonVal TNearestNeighbor::Explain(const TIntFltKdV& Vec) const {
  // if not initialized, return null (JSON)
  if (!IsInit()) { return TJsonVal::NewNull(); }

  // find nearest neighbor; the Vec term is the same for all columns,
  // so it is evaluated once instead of once per column
  const double VecNorm2 = TLinAlg::Norm2(Vec);
  double NearDist = TFlt::Mx;
  int NearColN = -1;
  for (int ColN = 0; ColN < Mat.Len(); ColN++) {
    const double Dist = VecNorm2 - 2 * TLinAlg::DotProduct(Vec, Mat[ColN]) + TLinAlg::Norm2(Mat[ColN]);
    if (Dist < NearDist) { NearDist = Dist; NearColN = ColN; }
  }
  // no column was selected (e.g. Mat is empty): indexing Mat[-1] below would
  // be invalid, so report "no explanation" the same way as when uninitialized
  if (NearColN == -1) { return TJsonVal::NewNull(); }
  const TIntFltKdV& NearVec = Mat[NearColN];

  // generate JSON explanation
  PJsonVal ResVal = TJsonVal::NewObj();
  // id of the nearest element
  ResVal->AddToObj("nearestDat", DatV[NearColN]);
  ResVal->AddToObj("distance", NearDist);

  // element-wise difference: parallel walk over both sparse vectors by feature id
  PJsonVal DiffVal = TJsonVal::NewArr();
  int NearEltN = 0, EltN = 0;
  while (NearEltN < NearVec.Len() || EltN < Vec.Len()) {
    // pending feature id on each side; TInt::Mx marks an exhausted vector
    const int VecFtrId = EltN < Vec.Len() ? Vec[EltN].Key.Val : TInt::Mx;
    const int NearFtrId = NearEltN < NearVec.Len() ? NearVec[NearEltN].Key.Val : TInt::Mx;
    const int FtrId = NearFtrId < VecFtrId ? NearFtrId : VecFtrId;
    // a side that does not hold FtrId contributes 0.0
    const double VecVal = FtrId < VecFtrId ? 0.0 : Vec[EltN].Dat.Val;
    const double NearVal = FtrId < NearFtrId ? 0.0 : NearVec[NearEltN].Dat.Val;
    // share of the distance explained by this feature
    const double Diff = TMath::Sqr(NearVal - VecVal) / NearDist;
    // avoid unnecessary fields in the explanation; the JSON object is only
    // created for features that are actually reported
    if (Diff > 1e-8) {
      PJsonVal FtrVal = TJsonVal::NewObj();
      FtrVal->AddToObj("id", FtrId);
      FtrVal->AddToObj("val", VecVal);
      FtrVal->AddToObj("nearVal", NearVal);
      FtrVal->AddToObj("contribution", Diff);
      DiffVal->AddToArr(FtrVal);
    }
    // advance whichever side(s) held the current feature id
    if (VecFtrId <= NearFtrId) { EltN++; }
    if (NearFtrId <= VecFtrId) { NearEltN++; }
  }
  ResVal->AddToObj("features", DiffVal);

  // first and last record in the buffer
  ResVal->AddToObj("oldestDat", DatV[NextCol]);
  int CurCol = NextCol > 0 ? NextCol - 1 : WindowSize - 1;
  ResVal->AddToObj("newestDat", DatV[CurCol]);
  return ResVal;
}
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs, const TStr& DocNm, const TStrV& FtrValV) const { TIntFltKdV FtrSpV; GenFtrV(FtrValV, FtrSpV); // make KdV to PrV const int WIds = FtrSpV.Len(); TIntFltPrV WIdWgtPrV(WIds, 0); for (int WIdN = 0; WIdN < WIds; WIdN++) { WIdWgtPrV.Add(TIntFltPr(FtrSpV[WIdN].Key, FtrSpV[WIdN].Dat)); } // add the feature vector to trainsets BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV); }
// Reads this JSON array of [index, value] pairs into the sparse vector NumSpV.
// Asserts that this value is an array and that every element is itself an
// array of exactly two values. Pairs are appended to NumSpV (existing content
// is not cleared) and NumSpV is sorted at the end.
void TJsonVal::GetArrNumSpV(TIntFltKdV& NumSpV) const {
  EAssert(IsArr());
  const int Pairs = GetArrVals();
  for (int PairN = 0; PairN < Pairs; PairN++) {
    PJsonVal PairVal = GetArrVal(PairN);
    // each element must be a two-element array
    EAssert(PairVal->IsArr());
    EAssert(PairVal->GetArrVals() == 2);
    const int Idx = PairVal->GetArrVal(0)->GetInt();
    const double Val = PairVal->GetArrVal(1)->GetNum();
    NumSpV.Add(TIntFltKd(Idx, Val));
  }
  NumSpV.Sort();
}
void TBagOfWords::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // tokenize TStrV TokenStrV(Val.Len() / 5, 0); GetFtr(Val, TokenStrV); // create sparse vector TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat)); } // increase the offset by the dimension Offset += GetDim(); }
// interpolate effective diameter double CalcEffDiam(const TIntFltKdV& DistNbrsCdfV, const double& Percentile) { const double EffPairs = Percentile * DistNbrsCdfV.Last().Dat; int ValN; for (ValN = 0; ValN < DistNbrsCdfV.Len(); ValN++) { if (DistNbrsCdfV[ValN].Dat() > EffPairs) { break; } } if (ValN >= DistNbrsCdfV.Len()) return DistNbrsCdfV.Last().Key; if (ValN == 0) return 1; // interpolate const double DeltaNbrs = DistNbrsCdfV[ValN].Dat - DistNbrsCdfV[ValN-1].Dat; if (DeltaNbrs == 0) return DistNbrsCdfV[ValN].Key; return DistNbrsCdfV[ValN-1].Key + (EffPairs - DistNbrsCdfV[ValN-1].Dat)/DeltaNbrs; }
void TFtrGenToken::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // step (1): tokenize TStrV TokenStrV; GetTokenV(Val, TokenStrV); // step (2): aggregate token counts TIntH TokenFqH; for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) { const TStr& TokenStr = TokenStrV[TokenStrN]; if (TokenH.IsKey(TokenStr)) { const int TokenId = TokenH.GetKeyId(TokenStr); TokenFqH.AddDat(TokenId)++; } } // step (3): make a sparse vector out of it TIntFltKdV ValSpV(TokenFqH.Len(), 0); int KeyId = TokenFqH.FFirstKeyId(); while (TokenFqH.FNextKeyId(KeyId)) { const int TokenId = TokenFqH.GetKey(KeyId); const int TokenFq = TokenFqH[KeyId]; const int TokenDocFq = TokenH[TokenId]; const double IDF = log(double(Docs) / double(TokenDocFq)); ValSpV.Add(TIntFltKd(TokenId, double(TokenFq) * IDF)); } ValSpV.Sort(); TLinAlg::NormalizeL1(ValSpV); // step (4): add the sparse vector to the final feature vector for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const int Key = ValSpV[ValSpN].Key + Offset; const double Dat = ValSpV[ValSpN].Dat; SpV.Add(TIntFltKd(Key, Dat)); } Offset += TokenH.Len(); }
// Appends the feature of a single string key to SpV.
// When FtrGen.GetFtr(Str) yields an id other than -1, the entry
// (Offset + id, 1.0) is added. Offset is advanced by GetDim() in either case.
void TMultinomial::AddFtr(const TStr& Str, TIntFltKdV& SpV, int& Offset) const {
  const int FtrId = FtrGen.GetFtr(Str);
  const bool KnownP = (FtrId != -1);
  if (KnownP) {
    SpV.Add(TIntFltKd(Offset + FtrId, 1.0));
  }
  Offset += GetDim();
}
// Appends one numeric feature to SpV.
// Val is converted with GetFlt, passed through Trans and stored under the key
// Offset; Offset is then advanced by one.
void TFtrGenNumeric::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
  const double RawFlt = GetFlt(Val);
  SpV.Add(TIntFltKd(Offset, Trans(RawFlt)));
  Offset += 1;
}
// Appends a window of WndSize consecutive features to SpV.
// The window starts at key Offset + GetFtr(Val) and every entry gets the
// weight Wgt. Offset is then advanced by GetDim().
void TDateWnd::AddFtr(const TTm& Val, TIntFltKdV& SpV, int& Offset) const {
  const int Ftr = GetFtr(Val);
  // key of the first feature in the window
  const int FirstKey = Offset + Ftr;
  for (int WndN = 0; WndN < WndSize; WndN++) {
    SpV.Add(TIntFltKd(FirstKey + WndN, Wgt));
  }
  Offset += GetDim();
}
// Saves KdV to the text file FNm, one "<key>\t<value>" line per entry.
// When HeadLn is not empty it is written first as a "# <HeadLn>" line.
// Asserts that the file could be opened.
void TGnuPlot::SaveTs(const TIntFltKdV& KdV, const TStr& FNm, const TStr& HeadLn) {
  FILE *OutF = fopen(FNm.CStr(), "wt");
  EAssert(OutF);
  // optional header line
  const bool HeaderP = !HeadLn.Empty();
  if (HeaderP) {
    fprintf(OutF, "# %s\n", HeadLn.CStr());
  }
  // one row per entry
  const int Rows = KdV.Len();
  for (int RowN = 0; RowN < Rows; RowN++) {
    fprintf(OutF, "%d\t%g\n", KdV[RowN].Key(), KdV[RowN].Dat());
  }
  fclose(OutF);
}
void TCategorical::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // get dimension to set to 1.0 const int Dim = GetFtr(Val); // set to 1.0 if we get a dimension if (Dim != -1) { SpV.Add(TIntFltKd(Offset + Dim, 1.0)); } // update offset Offset += GetDim(); }
// Appends the nominal feature of Val to SpV.
// When Val is a key of ValH, the entry (Offset + key id, 1.0) is added.
// Offset is advanced by ValH.Len() in either case.
void TFtrGenNominal::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
  const bool KnownP = ValH.IsKey(Val);
  if (KnownP) {
    const int ValKey = Offset + ValH.GetKeyId(Val);
    SpV.Add(TIntFltKd(ValKey, 1.0));
  }
  Offset += ValH.Len();
}
// Returns the Dat-weighted average of the keys of DistNbrsPdfV,
// i.e. sum(Key * Dat) / sum(Dat). No check is made for a zero sum of Dat.
double CalcAvgDiamPdf(const TIntFltKdV& DistNbrsPdfV) {
  double LenSum = 0;
  double PathCnt = 0;
  const int Vals = DistNbrsPdfV.Len();
  for (int ValN = 0; ValN < Vals; ValN++) {
    const double Nbrs = DistNbrsPdfV[ValN].Dat;
    LenSum += DistNbrsPdfV[ValN].Key * Nbrs;
    PathCnt += Nbrs;
  }
  return LenSum/PathCnt;
}
void TStrFeatureSpace::FromAddStr(const TStr& Serialized, TIntFltKdV& Vec, char Sep) { TStrV Toks; Serialized.SplitOnAllCh(Sep, Toks, true); Vec.Gen(Toks.Len()); for (int i = 0; i < Toks.Len(); i++) { TStr Key, Value; Toks[i].SplitOnCh(Key, ':', Value); int FeatureId = GetAddId(Key); double FeatureWgt; if (Value.IsFlt(FeatureWgt)) { Vec[i].Key = FeatureId; Vec[i].Dat = FeatureWgt; } else { EFailR((Value + TStr(" is not a valid floating point number.")).CStr()); } } Vec.Sort(); }
/////////////////////////////////////////////////////////////////////// // Logistic-Regression-Model double TLogRegMd::GetCfy(const TIntFltKdV& AttrV) { int len = AttrV.Len(); double res = bb.Last(); for (int i = 0; i < len; i++) { if (AttrV[i].Key < bb.Len()) res += AttrV[i].Dat * bb[AttrV[i].Key]; } double mu = 1/(1 + exp(-res)); return mu; }
// Formats a sparse vector as text.
// Each entry is written as <Key><FieldDelimiterStr><Dat>, with Dat formatted
// by TFlt::GetStr using FmtStr; entries are joined with DelimiterStr.
TStr TStrUtil::GetStr(const TIntFltKdV& IntFltKdV, const TStr& FieldDelimiterStr, const TStr& DelimiterStr, const TStr& FmtStr) {
  TChA ResChA;
  const int Elts = IntFltKdV.Len();
  for (int EltN = 0; EltN < Elts; EltN++) {
    // delimiter goes in front of every entry once something has been written
    if (!ResChA.Empty()) {
      ResChA += DelimiterStr;
    }
    ResChA += IntFltKdV[EltN].Key.GetStr();
    ResChA += FieldDelimiterStr;
    ResChA += TFlt::GetStr(IntFltKdV[EltN].Dat, FmtStr);
  }
  return ResChA;
}
// Maps a query string from the query language to the target language.
// The alignment pair for the two language ids is fetched and Def() is called
// on it; the query is turned into a sparse vector, mapped by Map using the two
// language matrices, and turned back into a string by GetSpVStr, which is
// given the query vector length times TransQueryMtpy, and MxWgtPrc.
TStr TAlignPairBs::MapQuery(const TAlignPairMap& Map, const TStr& QueryStr,
    const int& QueryLangId, const int& TargetLangId,
    const int& TransQueryMtpy, const double& MxWgtPrc) {
  // alignment corpus for this language pair
  PAlignPair AlignPair = GetAlignPair(QueryLangId, TargetLangId);
  AlignPair->Def();
  // language names
  const TStr& QueryLang = LangH.GetKey(QueryLangId);
  const TStr& TargetLang = LangH.GetKey(TargetLangId);
  // query as a sparse vector
  TIntFltKdV QuerySpV;
  AlignPair->GetSpV(QueryStr, QueryLang, QuerySpV);
  // sparse matrices with aligned columns
  const TMatrix& QueryMatrix = AlignPair->GetMatrix(QueryLang);
  const TMatrix& TargetMatrix = AlignPair->GetMatrix(TargetLang);
  // map the query into the target language
  TIntFltKdV TargetSpV;
  Map(QuerySpV, QueryMatrix, TargetMatrix, TargetSpV);
  // mapped vector back to a string
  return AlignPair->GetSpVStr(TargetSpV, TargetLang, QuerySpV.Len() * TransQueryMtpy, MxWgtPrc);
}
void TStrFeatureSpace::FromStr(const TStr& Serialized, TIntFltKdV& Vec, char Sep) const { TStrV Toks; Serialized.SplitOnAllCh(Sep, Toks, true); Vec.Gen(Toks.Len(),0); for (int i = 0; i < Toks.Len(); i++) { TStr Key, Value; Toks[i].SplitOnCh(Key, ':', Value); TStrFSSize FeatureId; if (GetIfExistsId(Key, FeatureId)) { double FeatureWgt; if (Value.IsFlt(FeatureWgt)) { TIntFltKd& Kv = Vec[Vec.Add()]; Kv.Key = FeatureId; Kv.Dat = FeatureWgt; } else { EFailR((Value + TStr(" is not a valid floating point number.")).CStr()); } } } Vec.Sort(); }
double TBowLinAlg::DotProduct(const TIntFltKdV& x, PBowSpV y) { TBowWIdWgtKd* vec2 = y->BegI(); int len1 = x.Len(), len2 = y->Len(); double res = 0.0; int j1 = 0, j2 = 0; while (j1 < len1 && j2 < len2) { if (x[j1].Key < vec2[j2].Key) { j1++; } else if (x[j1].Key > vec2[j2].Key) { j2++; } else { res += x[j1].Dat * vec2[j2].Dat; j1++; j2++; } } return res; }
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV) const { // aggregate token counts TIntH TermFqH; TStrV NgramStrV; GenerateNgrams(TokenStrV, NgramStrV); for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) { const TStr& TokenStr = NgramStrV[TokenStrN]; // get token ID const int TokenId = IsHashing() ? (TokenStr.GetHashTrick() % HashDim) : // hashing TokenSet.GetKeyId(TokenStr); // vocabulary // add if known token if (TokenId != -1) { TermFqH.AddDat(TokenId)++; } } // make a sparse vector out of it SpV.Gen(TermFqH.Len(), 0); int KeyId = TermFqH.FFirstKeyId(); while (TermFqH.FNextKeyId(KeyId)) { const int TermId = TermFqH.GetKey(KeyId); double TermVal = 1.0; if (IsTf()) { TermVal *= double(TermFqH[KeyId]); } if (IsIdf()) { if (ForgetP) { const double DocFq = double(DocFqV[TermId]) + OldDocFqV[TermId]; if (DocFq > 0.1) { TermVal *= log((double(Docs) + OldDocs) / DocFq); } } else { TermVal *= log(double(Docs) / double(DocFqV[TermId])); } } SpV.Add(TIntFltKd(TermId, TermVal)); } SpV.Sort(); // step (4): normalize the vector if so required if (IsNormalize()) { TLinAlg::Normalize(SpV); } }
// Explains Vec in terms of its nearest stored column.
// Returns a JSON null when the model is not initialized or when no column
// could be selected. Otherwise returns an object with:
//   "nearestID" - IDVec entry of the nearest column
//   "distance"  - Norm2(Vec) - 2*DotProduct(Vec, Col) + Norm2(Col) of that column
//   "features"  - array of {id, val, nearVal, contribution}, one per feature id
//                 present in Vec or in the nearest column; contribution is the
//                 squared difference divided by the distance
PJsonVal TNearestNeighbor::Explain(const TIntFltKdV& Vec) const {
  // if not initialized, return null (JSON)
  if (!IsInit()) { return TJsonVal::NewNull(); }

  // find nearest neighbor; the Vec term is the same for all columns,
  // so it is evaluated once instead of once per column
  const double VecNorm2 = TLinAlg::Norm2(Vec);
  double NearDist = TFlt::Mx;
  int NearColN = -1;
  for (int ColN = 0; ColN < Mat.Len(); ColN++) {
    const double Dist = VecNorm2 - 2 * TLinAlg::DotProduct(Vec, Mat[ColN]) + TLinAlg::Norm2(Mat[ColN]);
    if (Dist < NearDist) { NearDist = Dist; NearColN = ColN; }
  }
  // no column was selected (e.g. Mat is empty): indexing Mat[-1] below would
  // be invalid, so report "no explanation" the same way as when uninitialized
  if (NearColN == -1) { return TJsonVal::NewNull(); }
  const TIntFltKdV& NearVec = Mat[NearColN];

  // generate JSON explanation
  PJsonVal ResVal = TJsonVal::NewObj();
  // id of the nearest element
  ResVal->AddToObj("nearestID", IDVec[NearColN]);
  ResVal->AddToObj("distance", NearDist);

  // element-wise difference: parallel walk over both sparse vectors by feature
  // id. The walk continues until BOTH vectors are exhausted; stopping when the
  // first one ends would silently drop the remaining features of the other.
  PJsonVal DiffVal = TJsonVal::NewArr();
  int NearEltN = 0, EltN = 0;
  while (NearEltN < NearVec.Len() || EltN < Vec.Len()) {
    const bool VecLeftP = EltN < Vec.Len();
    const bool NearLeftP = NearEltN < NearVec.Len();
    // decide which side(s) hold the smallest pending feature id;
    // an exhausted side never takes part
    bool TakeVecP = VecLeftP;
    bool TakeNearP = NearLeftP;
    if (VecLeftP && NearLeftP) {
      const int VecKey = Vec[EltN].Key;
      const int NearKey = NearVec[NearEltN].Key;
      TakeVecP = (VecKey <= NearKey);
      TakeNearP = (NearKey <= VecKey);
    }
    // get values; a side that does not hold the feature contributes 0.0
    const int FtrId = TakeVecP ? Vec[EltN].Key : NearVec[NearEltN].Key;
    const double Val = TakeVecP ? Vec[EltN].Dat.Val : 0.0;
    const double NearVal = TakeNearP ? NearVec[NearEltN].Dat.Val : 0.0;
    const double Diff = TMath::Sqr(NearVal - Val) / NearDist;
    // add to json result
    PJsonVal FtrVal = TJsonVal::NewObj();
    FtrVal->AddToObj("id", FtrId);
    FtrVal->AddToObj("val", Val);
    FtrVal->AddToObj("nearVal", NearVal);
    FtrVal->AddToObj("contribution", Diff);
    DiffVal->AddToArr(FtrVal);
    // move past the feature on every side that held it
    if (TakeVecP) { EltN++; }
    if (TakeNearP) { NearEltN++; }
  }
  ResVal->AddToObj("features", DiffVal);
  return ResVal;
}
void TFtrGenMultiNom::AddFtr(const TStrV& StrV, TIntFltKdV& SpV, int& Offset) const { // generate feature vector just for this feature generate TIntFltKdV MultiNomSpV(StrV.Len(), 0); for (int StrN = 0; StrN < StrV.Len(); StrN++) { const int FtrId = FtrGen.GetFtr(StrV[StrN]); // only use features we've seen during updates if (FtrId != -1) { MultiNomSpV.Add(TIntFltKd(Offset + FtrId, 1.0)); } } MultiNomSpV.Sort(); // merge elements with same id double NormSq = 0.0; int GoodSpN = 0; for (int SpN = 1; SpN < MultiNomSpV.Len(); SpN++) { if (MultiNomSpV[GoodSpN].Key == MultiNomSpV[SpN].Key) { // repeatition of previous id MultiNomSpV[GoodSpN].Dat += MultiNomSpV[SpN].Dat; } else { // new id // keep track of norm NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat); // increase the pointer to the next good position GoodSpN++; // and move the new value down to the good position MultiNomSpV[GoodSpN] = MultiNomSpV[SpN]; } } // only bother if there is something to add if (MultiNomSpV.Len() > 0) { // update the norm with the last element NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat); // truncate the vector MultiNomSpV.Trunc(GoodSpN+1); // normalize double Norm = TMath::Sqrt(NormSq); TLinAlg::MultiplyScalar(1.0 / Norm, MultiNomSpV, MultiNomSpV); // add the the full feature vector and increase offset count SpV.AddV(MultiNomSpV); } // increase the offset by the dimension Offset += GetVals(); }
// Converts a cumulative distribution into per-entry differences.
// PdfV becomes a copy of CdfV in which every Dat except the first is replaced
// by its difference from the preceding Dat.
void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
  PdfV = CdfV;
  // walk from the back so each step still reads an unmodified predecessor
  int ValN = PdfV.Len();
  while (--ValN > 0) {
    PdfV[ValN].Dat = PdfV[ValN].Dat - PdfV[ValN-1].Dat;
  }
}