void TFtrGenToken::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // step (1): tokenize
    TStrV TokenStrV; GetTokenV(Val, TokenStrV);
    // step (2): aggregate token counts
    TIntH TokenFqH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        if (TokenH.IsKey(TokenStr)) {
            const int TokenId = TokenH.GetKeyId(TokenStr);
            TokenFqH.AddDat(TokenId)++;
        }
    }
    // step (3): make a sparse vector out of it
    TIntFltKdV ValSpV(TokenFqH.Len(), 0);
    int KeyId = TokenFqH.FFirstKeyId();
    while (TokenFqH.FNextKeyId(KeyId)) {
        const int TokenId = TokenFqH.GetKey(KeyId);
        const int TokenFq = TokenFqH[KeyId];
        const int TokenDocFq = TokenH[TokenId];
        const double IDF = log(double(Docs) / double(TokenDocFq));
        ValSpV.Add(TIntFltKd(TokenId, double(TokenFq) * IDF));
    }
    ValSpV.Sort(); TLinAlg::NormalizeL1(ValSpV);
    // step (4): add the sparse vector to the final feature vector
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const int Key = ValSpV[ValSpN].Key + Offset;
        const double Dat = ValSpV[ValSpN].Dat;
        SpV.Add(TIntFltKd(Key, Dat));
    }
    Offset += TokenH.Len();
}
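// For illustration only: a minimal std-C++ sketch of the TF-IDF weighting and
// L1 normalization performed in steps (3) above, detached from the GLib types.
// The names (TfIdfL1, TokenFqH, DocFqH) are hypothetical, not library API.
#include <cmath>
#include <map>
#include <utility>
#include <vector>

std::vector<std::pair<int, double>> TfIdfL1(
        const std::map<int, int>& TokenFqH,  // token id -> frequency in this document
        const std::map<int, int>& DocFqH,    // token id -> number of documents containing the token
        const int Docs) {                    // total number of documents seen
    std::vector<std::pair<int, double>> SpV;
    double Norm = 0.0;
    for (const auto& [TokenId, TokenFq] : TokenFqH) {
        // TF-IDF: term frequency times log of inverse document frequency
        const double Wgt = double(TokenFq) * std::log(double(Docs) / double(DocFqH.at(TokenId)));
        SpV.emplace_back(TokenId, Wgt);
        Norm += std::abs(Wgt);
    }
    // L1 normalization: scale so the absolute values sum to one
    if (Norm > 0.0) { for (auto& Kv : SpV) { Kv.second /= Norm; } }
    return SpV;  // sorted by token id, since std::map iterates keys in order
}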
void TFtrGenNumeric::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    const double Flt = GetFlt(Val);
    SpV.Add(TIntFltKd(Offset, Trans(Flt)));
    Offset++;
}
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const {
    // make sure we either do not have explicit values, or their dimension matches with string keys
    EAssertR(FltV.Empty() || (StrV.Len() == FltV.Len()),
        "TMultinomial::AddFtr:: String and double values not aligned");
    // generate internal feature vector
    SpV.Gen(StrV.Len(), 0);
    for (int StrN = 0; StrN < StrV.Len(); StrN++) {
        const int FtrId = FtrGen.GetFtr(StrV[StrN]);
        // only use features we've seen during updates
        if (FtrId != -1) {
            const double Val = FltV.Empty() ? 1.0 : FltV[StrN].Val;
            if (Val > 1e-16) { SpV.Add(TIntFltKd(FtrId, Val)); }
        }
    }
    SpV.Sort();
    // merge elements with the same id
    int GoodSpN = 0;
    for (int SpN = 1; SpN < SpV.Len(); SpN++) {
        if (SpV[GoodSpN].Key == SpV[SpN].Key) {
            // repetition of previous id, sum counts
            SpV[GoodSpN].Dat += SpV[SpN].Dat;
        } else {
            // increase the pointer to the next good position
            GoodSpN++;
            // and move the new value down to the good position
            SpV[GoodSpN] = SpV[SpN];
        }
    }
    // truncate the vector
    SpV.Trunc(GoodSpN + 1);
    // replace values with 1 if needed
    if (IsBinary()) {
        for (TIntFltKd& Sp : SpV) { Sp.Dat = 1.0; }
    }
    // final normalization, if needed
    if (IsNormalize()) { TLinAlg::Normalize(SpV); }
}
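// For illustration only: the duplicate-merge step above, restated as a
// standalone two-pointer pass over a sorted std::vector of (id, value) pairs.
// MergeSameKeys is a hypothetical name, not a library function.
#include <cstddef>
#include <utility>
#include <vector>

void MergeSameKeys(std::vector<std::pair<int, double>>& SpV) {
    if (SpV.size() < 2) { return; }
    std::size_t GoodSpN = 0;
    for (std::size_t SpN = 1; SpN < SpV.size(); SpN++) {
        if (SpV[GoodSpN].first == SpV[SpN].first) {
            // repetition of the previous id: sum the values
            SpV[GoodSpN].second += SpV[SpN].second;
        } else {
            // move the next distinct entry down to the good position
            SpV[++GoodSpN] = SpV[SpN];
        }
    }
    SpV.resize(GoodSpN + 1);  // truncate the merged-away tail
}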
void TDateWnd::AddFtr(const TTm& Val, TIntFltKdV& SpV, int& Offset) const {
    // map the timestamp to its base dimension
    const int Ftr = GetFtr(Val);
    // spread the weight over a window of WndSize consecutive dimensions,
    // so nearby dates share dimensions and have nonzero dot products
    for (int FtrN = 0; FtrN < WndSize; FtrN++) {
        SpV.Add(TIntFltKd(Offset + Ftr + FtrN, Wgt));
    }
    Offset += GetDim();
}
void TMultinomial::AddFtr(const TStr& Str, TIntFltKdV& SpV, int& Offset) const {
    const int FtrId = FtrGen.GetFtr(Str);
    // only add features seen during updates; unknown strings contribute nothing
    if (FtrId != -1) { SpV.Add(TIntFltKd(Offset + FtrId, 1.0)); }
    Offset += GetDim();
}
void TFtrGenNominal::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // one-hot encoding: set the dimension of a known value to 1.0
    if (ValH.IsKey(Val)) { SpV.Add(TIntFltKd(Offset + ValH.GetKeyId(Val), 1.0)); }
    Offset += ValH.Len();
}
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const {
    // pass each input value through the numeric feature generator, keeping its id
    for (int SpN = 0; SpN < InSpV.Len(); SpN++) {
        const int Id = InSpV[SpN].Key;
        const double Val = FtrGen.GetFtr(InSpV[SpN].Dat);
        SpV.Add(TIntFltKd(Offset + Id, Val));
    }
    Offset += GetVals();
}
void TCategorical::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // get dimension to set to 1.0
    const int Dim = GetFtr(Val);
    // set to 1.0 if we get a dimension
    if (Dim != -1) { SpV.Add(TIntFltKd(Offset + Dim, 1.0)); }
    // update offset
    Offset += GetDim();
}
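// For illustration only: the Offset bookkeeping shared by the Add/AddFtr
// methods above. Each generator writes its nonzero entries relative to the
// running offset and then advances the offset by its own dimensionality, so
// concatenated generators occupy disjoint index ranges. A minimal sketch
// with hypothetical names:
#include <utility>
#include <vector>

void AddOneHot(const int Dim,     // index within this generator, -1 if unknown
               const int GenDim,  // dimensionality of this generator
               std::vector<std::pair<int, double>>& SpV, int& Offset) {
    // set the matching dimension to 1.0, if the value is known
    if (Dim != -1) { SpV.emplace_back(Offset + Dim, 1.0); }
    // advance the offset even when nothing was added
    Offset += GenDim;
}
// Example: a generator over {"red", "green", "blue"} at Offset == 4 encodes
// "green" as the single pair (5, 1.0) and leaves Offset at 7.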
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV, int& Offset) const {
    // create sparse vector
    TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV);
    // add to the full feature vector, shifted by the current offset
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }
    // increase the offset by the dimension
    Offset += GetDim();
}
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV, int& Offset) const {
    // generate feature vector for the given strings and values
    TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV);
    // add to the full feature vector, shifted by the current offset
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }
    // increase the offset by the dimension
    Offset += GetDim();
}
void TJsonVal::GetArrNumSpV(TIntFltKdV& NumSpV) const {
    EAssert(IsArr());
    // each element must be a two-element [index, value] array
    for (int ElN = 0; ElN < GetArrVals(); ElN++) {
        PJsonVal ArrVal = GetArrVal(ElN);
        EAssert(ArrVal->IsArr());
        EAssert(ArrVal->GetArrVals() == 2);
        const int Idx = ArrVal->GetArrVal(0)->GetInt();
        const double Val = ArrVal->GetArrVal(1)->GetNum();
        NumSpV.Add(TIntFltKd(Idx, Val));
    }
    // keep the sparse vector sorted by index
    NumSpV.Sort();
}
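// For illustration only: GetArrNumSpV expects a JSON array of [index, value]
// pairs. Assuming TJsonVal::GetValFromStr as the parsing entry point (check
// the exact API in your GLib version), a hypothetical call would look like:
//
//   PJsonVal JsonVal = TJsonVal::GetValFromStr("[[4, 0.25], [0, 1.5], [2, 3.0]]");
//   TIntFltKdV NumSpV; JsonVal->GetArrNumSpV(NumSpV);
//   // NumSpV is now sorted by index: (0, 1.5), (2, 3.0), (4, 0.25)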
void TStrFeatureSpace::FromStr(const TStr& Serialized, TIntFltKdV& Vec, char Sep) const {
    TStrV Toks; Serialized.SplitOnAllCh(Sep, Toks, true);
    Vec.Gen(Toks.Len(), 0);
    for (int i = 0; i < Toks.Len(); i++) {
        // each token has the form <feature-name>:<weight>
        TStr Key, Value; Toks[i].SplitOnCh(Key, ':', Value);
        TStrFSSize FeatureId;
        if (GetIfExistsId(Key, FeatureId)) {
            double FeatureWgt;
            if (Value.IsFlt(FeatureWgt)) {
                TIntFltKd& Kv = Vec[Vec.Add()];
                Kv.Key = FeatureId;
                Kv.Dat = FeatureWgt;
            } else {
                EFailR((Value + TStr(" is not a valid floating point number.")).CStr());
            }
        }
    }
    Vec.Sort();
}
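// For illustration only: FromStr parses strings of <feature-name>:<weight>
// tokens. With Sep == ' ' and a feature space that knows "cat" and "dog", a
// hypothetical instance FtrSpace would turn
//
//   TIntFltKdV Vec; FtrSpace.FromStr("cat:0.5 dog:1.25 emu:3.0", Vec, ' ');
//
// into a two-entry vector sorted by feature id; the unknown key "emu" is
// silently skipped, while a malformed weight fails hard via EFailR.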
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV) const {
    // aggregate token counts
    TIntH TermFqH;
    TStrV NgramStrV; GenerateNgrams(TokenStrV, NgramStrV);
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = NgramStrV[TokenStrN];
        // get token ID
        const int TokenId = IsHashing() ?
            (TokenStr.GetHashTrick() % HashDim) : // hashing
            TokenSet.GetKeyId(TokenStr);          // vocabulary
        // add if known token
        if (TokenId != -1) { TermFqH.AddDat(TokenId)++; }
    }
    // make a sparse vector out of it
    SpV.Gen(TermFqH.Len(), 0);
    int KeyId = TermFqH.FFirstKeyId();
    while (TermFqH.FNextKeyId(KeyId)) {
        const int TermId = TermFqH.GetKey(KeyId);
        double TermVal = 1.0;
        if (IsTf()) { TermVal *= double(TermFqH[KeyId]); }
        if (IsIdf()) {
            if (ForgetP) {
                const double DocFq = double(DocFqV[TermId]) + OldDocFqV[TermId];
                if (DocFq > 0.1) { TermVal *= log((double(Docs) + OldDocs) / DocFq); }
            } else {
                TermVal *= log(double(Docs) / double(DocFqV[TermId]));
            }
        }
        SpV.Add(TIntFltKd(TermId, TermVal));
    }
    SpV.Sort();
    // normalize the vector if so required
    if (IsNormalize()) { TLinAlg::Normalize(SpV); }
}
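// For illustration only: with hashing enabled, the vocabulary lookup above is
// replaced by hash-modulo-dimension, so every token (seen or not) maps to a
// valid id in [0, HashDim). A std-C++ analogue, with std::hash standing in
// for TStr::GetHashTrick (HashedTokenId is a hypothetical helper):
#include <cstddef>
#include <functional>
#include <string>

int HashedTokenId(const std::string& TokenStr, const std::size_t HashDim) {
    return int(std::hash<std::string>{}(TokenStr) % HashDim);
}
// The trade-off: no vocabulary to store or update, but distinct tokens can
// collide into the same dimension, in which case their counts are summed.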
void TNumeric::AddFtr(const double& Val, TIntFltKdV& SpV, int& Offset) const {
    SpV.Add(TIntFltKd(Offset, GetFtr(Val)));
    Offset++;
}
void TEmaSpVec::Update(const TIntFltKdV& Val, const uint64& NewTmMSecs) {
    // EMA(first_point) = first_point (no smoothing is possible)
    if (InitMinMSecs == 0) {
        if (LastVal.Empty()) {
            LastVal = Val; Ema = Val; TmMSecs = NewTmMSecs; InitP = true;
            return;
        }
    }
    // time since the last update, in milliseconds
    double TmInterval1;
    if (NewTmMSecs == TmMSecs) {
        TmInterval1 = 1.0;
    } else {
        TmInterval1 = (double)(NewTmMSecs - TmMSecs);
    }
    if (InitP) {
        // compute parameters for EMA
        const double Alpha = TmInterval1 / TmInterval;
        const double Mi = exp(-Alpha);
        const double Ni = GetNi(Alpha, Mi);
        // compute new ema
        //Ema = Mi*Ema + (Ni - Mi)*LastVal + (1.0 - Ni)*Val;
        TIntFltKdV Tmp;
        TLinAlg::LinComb(Mi, Ema, Ni - Mi, LastVal, Tmp);
        TLinAlg::LinComb(1, Tmp, 1.0 - Ni, Val, Ema);
    } else {
        // update buffers
        InitValV.Add(Val);
        InitMSecsV.Add(NewTmMSecs);
        // initialize when enough data
        const uint64 StartInitMSecs = InitMSecsV[0] + InitMinMSecs;
        if (StartInitMSecs < NewTmMSecs) {
            // initialize using "buildup time interval"
            //TODO: check how interpolation type influences this code
            const uint64 LastMSecs = InitMSecsV.Last();
            const int Vals = InitMSecsV.Len();
            // compute weights for each value in buffer: older values get
            // exponentially smaller weight according to their age
            TFltV WeightV(Vals, 0);
            for (int ValN = 0; ValN < Vals; ValN++) {
                const double Alpha = (double)(LastMSecs - InitMSecsV[ValN]) / TmInterval;
                WeightV.Add(exp(-Alpha));
            }
            // normalize weights so they sum to 1.0
            TLinAlg::NormalizeL1(WeightV);
            // compute initial value of EMA as weighted sum
            //Ema = TLinAlg::DotProduct(WeightV, InitValV);
            TIntFltKdV Tmp;
            for (int i = 0; i < WeightV.Len(); i++) {
                TIntFltKdV Tmp2;
                TLinAlg::LinComb(1, Tmp, WeightV[i], InitValV[i], Tmp2);
                Tmp = Tmp2;
            }
            Ema = Tmp;
            // mark that we are done and clean up after us
            InitP = true; InitValV.Clr(); InitMSecsV.Clr();
        }
    }
    // remove dimensions below cutoff
    TIntFltKdV TmpEma;
    for (int i = 0; i < Ema.Len(); i++) {
        if (TFlt::Abs(Ema[i].Dat.Val) >= Cutoff) { TmpEma.Add(Ema[i]); }
    }
    Ema = TmpEma;
    // update last value
    LastVal = Val;
    // update current time
    TmMSecs = NewTmMSecs;
}
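// For reference, a worked instance of the update above (values rounded;
// assumes GetNi returns the linear-interpolation weight (1 - Mi) / Alpha).
// With TmInterval = 10,000 ms and a gap TmInterval1 = 5,000 ms:
//
//   Alpha = TmInterval1 / TmInterval = 0.5
//   Mi    = exp(-Alpha)             ~ 0.6065
//   Ni    = (1 - Mi) / Alpha        ~ 0.7869
//   Ema   = Mi*Ema + (Ni - Mi)*LastVal + (1 - Ni)*Val
//         ~ 0.6065*Ema + 0.1804*LastVal + 0.2131*Val
//
// The three coefficients sum to 1, so the update is a convex combination of
// the old EMA, the previous sample, and the new sample.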