void TStrFeatureSpace::ToStr(const TIntFltKdV& FeatureIds, TChA& ChA, int k, char Sep) const { TIntSet TakenIndexes(k); int Len = TMath::Mn(FeatureIds.Len(), k); for (int i = 0; i < Len; i++) { double MxVal = TFlt::Mn; int MxIndex = 0; for (int j = 0; j < FeatureIds.Len(); j++) { const TIntFltKd& Feature = FeatureIds[j]; if (Feature.Dat > MxVal) { if (!TakenIndexes.IsKey(Feature.Key)) { MxVal = Feature.Dat; MxIndex = Feature.Key; } } } TakenIndexes.AddKey(MxIndex); ChA += ISpace.KeyFromOfs(Space[MxIndex]); ChA += ':'; ChA += TFlt::GetStr(MxVal, "%2.6f"); if (i < Len) { ChA += Sep; } } }
int TGnuPlot::AddPlot(const TIntFltKdV& XYValV, const TGpSeriesTy& SeriesTy, const TStr& Label, const TStr& Style) { TFltKdV XYFltValV(XYValV.Len(), 0); for (int i = 0; i < XYValV.Len(); i++) { XYFltValV.Add(TFltKd(TFlt(XYValV[i].Key), TFlt(XYValV[i].Dat))); } return AddPlot(XYFltValV, SeriesTy, Label, Style); }
void TGUtil::Normalize(TIntFltKdV& PdfV) {
	// Scale the values so they sum to 1 (turn counts into a PDF).
	double Total = 0.0;
	for (int ValN = 0; ValN < PdfV.Len(); ValN++) {
		Total += PdfV[ValN].Dat;
	}
	// nothing to normalize when the total mass is zero (or negative)
	if (Total <= 0.0) { return; }
	for (int ValN = 0; ValN < PdfV.Len(); ValN++) {
		PdfV[ValN].Dat /= Total;
	}
}
// Builds a JSON explanation of the anomaly score for Vec: finds the nearest
// column of Mat (squared euclidean distance) and reports the per-feature
// contribution of each element-wise difference to that distance.
PJsonVal TNearestNeighbor::Explain(const TIntFltKdV& Vec) const {
	// if not initialized, return null (JSON)
	if (!IsInit()) { return TJsonVal::NewNull(); }
	// find nearest neighbor
	double NearDist = TFlt::Mx; int NearColN = -1;
	for (int ColN = 0; ColN < Mat.Len(); ColN++) {
		// squared distance via the expansion ||x||^2 - 2<x,y> + ||y||^2
		const double Dist = TLinAlg::Norm2(Vec) - 2 * TLinAlg::DotProduct(Vec, Mat[ColN]) + TLinAlg::Norm2(Mat[ColN]);
		if (Dist < NearDist) { NearDist = Dist; NearColN = ColN; }
	}
	const TIntFltKdV& NearVec = Mat[NearColN];
	// generate JSon explanations
	PJsonVal ResVal = TJsonVal::NewObj();
	// id of the nearest element
	ResVal->AddToObj("nearestDat", DatV[NearColN]);
	ResVal->AddToObj("distance", NearDist);
	// element-wise difference: two-pointer merge over both sorted sparse
	// vectors, using TInt::Mx as sentinel id once a vector is exhausted
	PJsonVal DiffVal = TJsonVal::NewArr();
	int NearEltN = 0, EltN = 0;
	while (NearEltN < NearVec.Len() || EltN < Vec.Len()) {
		// get the feature ID
		const int VecFtrId = EltN < Vec.Len() ? Vec[EltN].Key.Val : TInt::Mx;
		const int NearFtrId = NearEltN < NearVec.Len() ? NearVec[NearEltN].Key.Val : TInt::Mx;
		// the smaller of the two ids is processed in this iteration
		const int FtrId = NearFtrId < VecFtrId ? NearFtrId : VecFtrId;
		// get values (0.0 when the feature is absent from one side)
		const double VecVal = FtrId < VecFtrId ? 0.0 : Vec[EltN].Dat.Val;
		const double NearVal = FtrId < NearFtrId ? 0.0 : NearVec[NearEltN].Dat.Val;
		// get diff: this feature's share of the total squared distance
		const double Diff = TMath::Sqr(NearVal - VecVal) / NearDist;
		// add to json result
		PJsonVal FtrVal = TJsonVal::NewObj();
		//avoid unnecessary fields in the explanation
		if (Diff > 1e-8) {
			FtrVal->AddToObj("id", FtrId);
			FtrVal->AddToObj("val", VecVal);
			FtrVal->AddToObj("nearVal", NearVal);
			FtrVal->AddToObj("contribution", Diff);
			DiffVal->AddToArr(FtrVal);
		}
		// move to the next feature (both advance on an id match)
		if (VecFtrId <= NearFtrId) { EltN++; }
		if (NearFtrId <= VecFtrId) { NearEltN++; }
	}
	ResVal->AddToObj("features", DiffVal);
	// first and last record in the buffer; NextCol is the slot about to be
	// overwritten, so it holds the oldest entry of the circular window
	ResVal->AddToObj("oldestDat", DatV[NextCol]);
	int CurCol = NextCol > 0 ? NextCol - 1 : WindowSize - 1;
	ResVal->AddToObj("newestDat", DatV[CurCol]);
	return ResVal;
}
// interpolate effective diameter double CalcEffDiam(const TIntFltKdV& DistNbrsCdfV, const double& Percentile) { const double EffPairs = Percentile * DistNbrsCdfV.Last().Dat; int ValN; for (ValN = 0; ValN < DistNbrsCdfV.Len(); ValN++) { if (DistNbrsCdfV[ValN].Dat() > EffPairs) { break; } } if (ValN >= DistNbrsCdfV.Len()) return DistNbrsCdfV.Last().Key; if (ValN == 0) return 1; // interpolate const double DeltaNbrs = DistNbrsCdfV[ValN].Dat - DistNbrsCdfV[ValN-1].Dat; if (DeltaNbrs == 0) return DistNbrsCdfV[ValN].Key; return DistNbrsCdfV[ValN-1].Key + (EffPairs - DistNbrsCdfV[ValN-1].Dat)/DeltaNbrs; }
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const { // make sure we either do not have explicit values, or their dimension matches with string keys EAssertR(FltV.Empty() || (StrV.Len() == FltV.Len()), "TMultinomial::AddFtr:: String and double values not aligned"); // generate internal feature vector SpV.Gen(StrV.Len(), 0); for (int StrN = 0; StrN < StrV.Len(); StrN++) { const int FtrId = FtrGen.GetFtr(StrV[StrN]); // only use features we've seen during updates if (FtrId != -1) { const double Val = FltV.Empty() ? 1.0 : FltV[StrN].Val; if (Val > 1e-16) { SpV.Add(TIntFltKd(FtrId, Val)); } } } SpV.Sort(); // merge elements with the same id int GoodSpN = 0; for (int SpN = 1; SpN < SpV.Len(); SpN++) { if (SpV[GoodSpN].Key == SpV[SpN].Key) { // repetition of previous id, sum counts SpV[GoodSpN].Dat += SpV[SpN].Dat; } else { // increase the pointer to the next good position GoodSpN++; // and move the new value down to the good position SpV[GoodSpN] = SpV[SpN]; } } // truncate the vector SpV.Trunc(GoodSpN + 1); // replace values with 1 if needed if (IsBinary()) { for (TIntFltKd& Sp : SpV) { Sp.Dat = 1.0; } } // final normalization, if needed if (IsNormalize()) { TLinAlg::Normalize(SpV); } }
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const { for (int SpN = 0; SpN < InSpV.Len(); SpN++) { const int Id = InSpV[SpN].Key; double Val = FtrGen.GetFtr(InSpV[SpN].Dat); SpV.Add(TIntFltKd(Offset + Id, Val)); } Offset += GetVals(); }
double CalcAvgDiamPdf(const TIntFltKdV& DistNbrsPdfV) {
	// Average path length from a (distance, path-count) PDF:
	// sum(distance * count) / sum(count).
	double Paths = 0, SumLen = 0;
	for (int i = 0; i < DistNbrsPdfV.Len(); i++) {
		SumLen += DistNbrsPdfV[i].Key * DistNbrsPdfV[i].Dat;
		Paths += DistNbrsPdfV[i].Dat;
	}
	// robustness: empty input or an all-zero PDF would otherwise
	// divide by zero and return NaN
	if (Paths <= 0.0) { return 0.0; }
	return SumLen / Paths;
}
void TGnuPlot::SaveTs(const TIntFltKdV& KdV, const TStr& FNm, const TStr& HeadLn) {
	// Dump the (key, value) pairs to FNm as tab-separated lines; an
	// optional header line is written first, prefixed with '#'.
	FILE* F = fopen(FNm.CStr(), "wt");
	EAssert(F);
	if (!HeadLn.Empty()) {
		fprintf(F, "# %s\n", HeadLn.CStr());
	}
	for (int KdN = 0; KdN < KdV.Len(); KdN++) {
		fprintf(F, "%d\t%g\n", KdV[KdN].Key(), KdV[KdN].Dat());
	}
	fclose(F);
}
/////////////////////////////////////////////////////////////////////// // Logistic-Regression-Model double TLogRegMd::GetCfy(const TIntFltKdV& AttrV) { int len = AttrV.Len(); double res = bb.Last(); for (int i = 0; i < len; i++) { if (AttrV[i].Key < bb.Len()) res += AttrV[i].Dat * bb[AttrV[i].Key]; } double mu = 1/(1 + exp(-res)); return mu; }
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TFltV& FullV, int& Offset) const { // create sparse vector TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; FullV[Offset + ValSp.Key] = ValSp.Dat; } // increase the offset by the dimension Offset += GetDim(); }
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TFltV& FullV, int& Offset) const { // generate feature TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; FullV[Offset + ValSp.Key] = ValSp.Dat; } // increase the offset by the dimension Offset += GetDim(); }
TStr TStrUtil::GetStr(const TIntFltKdV& IntFltKdV, const TStr& FieldDelimiterStr, const TStr& DelimiterStr, const TStr& FmtStr) {
	// Render the sparse vector as "key<FieldDelim>value" entries joined by
	// DelimiterStr; each value is formatted with FmtStr.
	TChA OutChA;
	for (int EltN = 0; EltN < IntFltKdV.Len(); EltN++) {
		// delimiter only between entries
		if (!OutChA.Empty()) { OutChA += DelimiterStr; }
		OutChA += IntFltKdV[EltN].Key.GetStr();
		OutChA += FieldDelimiterStr;
		OutChA += TFlt::GetStr(IntFltKdV[EltN].Dat, FmtStr);
	}
	return OutChA;
}
void TFtrGenBs::AddBowDoc(const PBowDocBs& BowDocBs, const TStr& DocNm, const TStrV& FtrValV) const { TIntFltKdV FtrSpV; GenFtrV(FtrValV, FtrSpV); // make KdV to PrV const int WIds = FtrSpV.Len(); TIntFltPrV WIdWgtPrV(WIds, 0); for (int WIdN = 0; WIdN < WIds; WIdN++) { WIdWgtPrV.Add(TIntFltPr(FtrSpV[WIdN].Key, FtrSpV[WIdN].Dat)); } // add the feature vector to trainsets BowDocBs->AddDoc(DocNm, TStrV(), WIdWgtPrV); }
double TBowLinAlg::DotProduct(const TIntFltKdV& x, PBowSpV y) { TBowWIdWgtKd* vec2 = y->BegI(); int len1 = x.Len(), len2 = y->Len(); double res = 0.0; int j1 = 0, j2 = 0; while (j1 < len1 && j2 < len2) { if (x[j1].Key < vec2[j2].Key) { j1++; } else if (x[j1].Key > vec2[j2].Key) { j2++; } else { res += x[j1].Dat * vec2[j2].Dat; j1++; j2++; } } return res; }
void TBagOfWords::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // tokenize TStrV TokenStrV(Val.Len() / 5, 0); GetFtr(Val, TokenStrV); // create sparse vector TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat)); } // increase the offset by the dimension Offset += GetDim(); }
PJsonVal TNearestNeighbor::Explain(const TIntFltKdV& Vec) const {
	// Builds a JSON explanation for Vec: finds its nearest column in Mat
	// (squared euclidean distance) and reports each feature's contribution
	// to that distance.
	// if not initialized, return null (JSON)
	if (!IsInit()) { return TJsonVal::NewNull(); }
	// find nearest neighbor via the expansion ||x||^2 - 2<x,y> + ||y||^2
	double NearDist = TFlt::Mx; int NearColN = -1;
	for (int ColN = 0; ColN < Mat.Len(); ColN++) {
		const double Dist = TLinAlg::Norm2(Vec) - 2 * TLinAlg::DotProduct(Vec, Mat[ColN]) + TLinAlg::Norm2(Mat[ColN]);
		if (Dist < NearDist) { NearDist = Dist; NearColN = ColN; }
	}
	const TIntFltKdV& NearVec = Mat[NearColN];
	// generate JSon explanations
	PJsonVal ResVal = TJsonVal::NewObj();
	// id of the nearest element
	ResVal->AddToObj("nearestID", IDVec[NearColN]);
	ResVal->AddToObj("distance", NearDist);
	// element-wise difference: sentinel-based merge over both sorted sparse
	// vectors. The previous `&&`-bounded loop stopped at the shorter vector's
	// end and silently dropped trailing features present in only one vector.
	PJsonVal DiffVal = TJsonVal::NewArr();
	int NearEltN = 0, EltN = 0;
	while (NearEltN < NearVec.Len() || EltN < Vec.Len()) {
		// feature ids; TInt::Mx serves as sentinel once a vector is exhausted
		const int VecFtrId = EltN < Vec.Len() ? Vec[EltN].Key.Val : TInt::Mx;
		const int NearFtrId = NearEltN < NearVec.Len() ? NearVec[NearEltN].Key.Val : TInt::Mx;
		const int FtrId = NearFtrId < VecFtrId ? NearFtrId : VecFtrId;
		// values (0.0 when the feature is absent from one side)
		const double Val = FtrId < VecFtrId ? 0.0 : Vec[EltN].Dat.Val;
		const double NearVal = FtrId < NearFtrId ? 0.0 : NearVec[NearEltN].Dat.Val;
		// this feature's share of the total squared distance
		const double Diff = TMath::Sqr(NearVal - Val) / NearDist;
		// add to json result
		PJsonVal FtrVal = TJsonVal::NewObj();
		FtrVal->AddToObj("id", FtrId);
		FtrVal->AddToObj("val", Val);
		FtrVal->AddToObj("nearVal", NearVal);
		FtrVal->AddToObj("contribution", Diff);
		DiffVal->AddToArr(FtrVal);
		// advance whichever side(s) carried the current feature id
		if (VecFtrId <= NearFtrId) { EltN++; }
		if (NearFtrId <= VecFtrId) { NearEltN++; }
	}
	ResVal->AddToObj("features", DiffVal);
	return ResVal;
}
TStr TAlignPairBs::MapQuery(const TAlignPairMap& Map, const TStr& QueryStr, const int& QueryLangId, const int& TargetLangId, const int& TransQueryMtpy, const double& MxWgtPrc) {
	// Translates a query string from the source to the target language
	// using the aligned corpus and the supplied mapping function.
	// get the alignment corpus and make sure it is defined
	PAlignPair Corpus = GetAlignPair(QueryLangId, TargetLangId);
	Corpus->Def();
	// resolve language names from their ids
	const TStr& SrcLang = LangH.GetKey(QueryLangId);
	const TStr& DstLang = LangH.GetKey(TargetLangId);
	// turn the query string into a sparse vector in the source space
	TIntFltKdV QuerySpV;
	Corpus->GetSpV(QueryStr, SrcLang, QuerySpV);
	// sparse matrices with aligned columns for both languages
	const TMatrix& SrcMatrix = Corpus->GetMatrix(SrcLang);
	const TMatrix& DstMatrix = Corpus->GetMatrix(DstLang);
	// map the query into the target language space
	TIntFltKdV MappedSpV;
	Map(QuerySpV, SrcMatrix, DstMatrix, MappedSpV);
	// render the mapped vector back into a query string
	return Corpus->GetSpVStr(MappedSpV, DstLang, QuerySpV.Len() * TransQueryMtpy, MxWgtPrc);
}
void TSparseNumeric::Update(const TIntFltKdV& SpV) {
	// Track the largest feature id seen so far and feed every value to the
	// underlying numeric feature generator.
	for (int EltN = 0; EltN < SpV.Len(); EltN++) {
		MxId = TInt::GetMx(SpV[EltN].Key, MxId);
		FtrGen.Update(SpV[EltN].Dat);
	}
}
void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
	// PDF from CDF: adjacent differences, computed in place from the back
	// so earlier entries still hold the original CDF values when read.
	PdfV = CdfV;
	for (int ValN = PdfV.Len() - 1; ValN > 0; ValN--) {
		PdfV[ValN].Dat -= PdfV[ValN - 1].Dat;
	}
}
void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) {
	// Complementary CDF from PDF: suffix sums, accumulated right-to-left.
	CCdfV = PdfV;
	for (int ValN = CCdfV.Len() - 2; ValN >= 0; ValN--) {
		CCdfV[ValN].Dat += CCdfV[ValN + 1].Dat;
	}
}
void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) {
	// CDF from PDF: prefix sums, accumulated left-to-right.
	CdfV = PdfV;
	for (int ValN = 1; ValN < CdfV.Len(); ValN++) {
		CdfV[ValN].Dat += CdfV[ValN - 1].Dat;
	}
}