void TFtrGenToken::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // step (1): tokenize TStrV TokenStrV; GetTokenV(Val, TokenStrV); // step (2): aggregate token counts TIntH TokenFqH; for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) { const TStr& TokenStr = TokenStrV[TokenStrN]; if (TokenH.IsKey(TokenStr)) { const int TokenId = TokenH.GetKeyId(TokenStr); TokenFqH.AddDat(TokenId)++; } } // step (3): make a sparse vector out of it TIntFltKdV ValSpV(TokenFqH.Len(), 0); int KeyId = TokenFqH.FFirstKeyId(); while (TokenFqH.FNextKeyId(KeyId)) { const int TokenId = TokenFqH.GetKey(KeyId); const int TokenFq = TokenFqH[KeyId]; const int TokenDocFq = TokenH[TokenId]; const double IDF = log(double(Docs) / double(TokenDocFq)); ValSpV.Add(TIntFltKd(TokenId, double(TokenFq) * IDF)); } ValSpV.Sort(); TLinAlg::NormalizeL1(ValSpV); // step (4): add the sparse vector to the final feature vector for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const int Key = ValSpV[ValSpN].Key + Offset; const double Dat = ValSpV[ValSpN].Dat; SpV.Add(TIntFltKd(Key, Dat)); } Offset += TokenH.Len(); }
void TFtrGenNumeric::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // Parse the string into a number, transform it, and store it at the
    // current offset position.
    const double ParsedVal = GetFlt(Val);
    SpV.Add(TIntFltKd(Offset, Trans(ParsedVal)));
    // A numeric feature occupies exactly one dimension.
    Offset++;
}
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const { // make sure we either do not have explicit values, or their dimension matches with string keys EAssertR(FltV.Empty() || (StrV.Len() == FltV.Len()), "TMultinomial::AddFtr:: String and double values not aligned"); // generate internal feature vector SpV.Gen(StrV.Len(), 0); for (int StrN = 0; StrN < StrV.Len(); StrN++) { const int FtrId = FtrGen.GetFtr(StrV[StrN]); // only use features we've seen during updates if (FtrId != -1) { const double Val = FltV.Empty() ? 1.0 : FltV[StrN].Val; if (Val > 1e-16) { SpV.Add(TIntFltKd(FtrId, Val)); } } } SpV.Sort(); // merge elements with the same id int GoodSpN = 0; for (int SpN = 1; SpN < SpV.Len(); SpN++) { if (SpV[GoodSpN].Key == SpV[SpN].Key) { // repetition of previous id, sum counts SpV[GoodSpN].Dat += SpV[SpN].Dat; } else { // increase the pointer to the next good position GoodSpN++; // and move the new value down to the good position SpV[GoodSpN] = SpV[SpN]; } } // truncate the vector SpV.Trunc(GoodSpN + 1); // replace values with 1 if needed if (IsBinary()) { for (TIntFltKd& Sp : SpV) { Sp.Dat = 1.0; } } // final normalization, if needed if (IsNormalize()) { TLinAlg::Normalize(SpV); } }
void TMultinomial::AddFtr(const TStr& Str, TIntFltKdV& SpV, int& Offset) const {
    // Look up the feature id of the string; -1 marks a string never seen
    // during updates.
    const int FtrId = FtrGen.GetFtr(Str);
    const bool KnownP = (FtrId != -1);
    if (KnownP) {
        // Known string => indicator feature 1.0 at its shifted position.
        const int Key = Offset + FtrId;
        SpV.Add(TIntFltKd(Key, 1.0));
    }
    // The offset always advances by the full dimension of this generator,
    // whether or not the string was known.
    Offset += GetDim();
}
void TDateWnd::AddFtr(const TTm& Val, TIntFltKdV& SpV, int& Offset) const {
    // Map the timestamp to the first dimension of its window.
    const int StartFtr = GetFtr(Val);
    // Fill WndSize consecutive dimensions with the window weight.
    int FtrN = 0;
    while (FtrN < WndSize) {
        SpV.Add(TIntFltKd(Offset + StartFtr + FtrN, Wgt));
        FtrN++;
    }
    // Advance past this generator's full dimensionality.
    Offset += GetDim();
}
double CalcEffDiam(const TFltPrV& DistNbrsCdfV, const double& Percentile) {
    // Convert the (distance, cdf) pair vector into key-value form and
    // delegate to the TIntFltKdV overload.
    const int Vals = DistNbrsCdfV.Len();
    TIntFltKdV KdV(Vals, 0);
    for (int ValN = 0; ValN < Vals; ValN++) {
        KdV.Add(TIntFltKd(int(DistNbrsCdfV[ValN].Val1()), DistNbrsCdfV[ValN].Val2));
    }
    return CalcEffDiam(KdV, Percentile);
}
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const { for (int SpN = 0; SpN < InSpV.Len(); SpN++) { const int Id = InSpV[SpN].Key; double Val = FtrGen.GetFtr(InSpV[SpN].Dat); SpV.Add(TIntFltKd(Offset + Id, Val)); } Offset += GetVals(); }
void TCategorical::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const { // get dimension to set to 1.0 const int Dim = GetFtr(Val); // set to 1.0 if we get a dimension if (Dim != -1) { SpV.Add(TIntFltKd(Offset + Dim, 1.0)); } // update offset Offset += GetDim(); }
void TFtrGenNominal::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // Known nominal values get a 1.0 indicator at their key-id position;
    // unknown values contribute nothing.
    const bool KnownP = ValH.IsKey(Val);
    if (KnownP) {
        const int ValId = ValH.GetKeyId(Val);
        SpV.Add(TIntFltKd(Offset + ValId, 1.0));
    }
    // One dimension per known nominal value.
    Offset += ValH.Len();
}
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV, int& Offset) const { // generate feature TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat)); } // increase the offset by the dimension Offset += GetDim(); }
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV, int& Offset) const { // create sparse vector TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV); // add to the full feature vector and increase offset count for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) { const TIntFltKd& ValSp = ValSpV[ValSpN]; SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat)); } // increase the offset by the dimension Offset += GetDim(); }
void TJsonVal::GetArrNumSpV(TIntFltKdV& NumSpV) const {
    // Parses a JSON array of [index, value] pairs (e.g. [[0, 1.0], [3, 0.5]])
    // into a sparse vector sorted by index.
    EAssert(IsArr());
    const int Vals = GetArrVals();
    for (int ValN = 0; ValN < Vals; ValN++) {
        PJsonVal PairVal = GetArrVal(ValN);
        // Each element must itself be a two-element array.
        EAssert(PairVal->IsArr());
        EAssert(PairVal->GetArrVals() == 2);
        const int Idx = PairVal->GetArrVal(0)->GetInt();
        const double Val = PairVal->GetArrVal(1)->GetNum();
        NumSpV.Add(TIntFltKd(Idx, Val));
    }
    // Keep the result ordered by index.
    NumSpV.Sort();
}
void TFtrGenMultiNom::AddFtr(const TStrV& StrV, TIntFltKdV& SpV, int& Offset) const {
    // Encodes a set of nominal string values as one L2-normalized sparse
    // vector appended to SpV; advances Offset by this generator's value count.
    // generate feature vector just for this feature generator; note that keys
    // are stored already shifted by Offset
    TIntFltKdV MultiNomSpV(StrV.Len(), 0);
    for (int StrN = 0; StrN < StrV.Len(); StrN++) {
        const int FtrId = FtrGen.GetFtr(StrV[StrN]);
        // only use features we've seen during updates (-1 = unknown)
        if (FtrId != -1) {
            MultiNomSpV.Add(TIntFltKd(Offset + FtrId, 1.0));
        }
    }
    // sort by key so duplicate ids become adjacent
    MultiNomSpV.Sort();
    // merge elements with same id, accumulating the squared L2 norm as we go
    double NormSq = 0.0;
    int GoodSpN = 0;
    for (int SpN = 1; SpN < MultiNomSpV.Len(); SpN++) {
        if (MultiNomSpV[GoodSpN].Key == MultiNomSpV[SpN].Key) {
            // repetition of previous id: sum the counts
            MultiNomSpV[GoodSpN].Dat += MultiNomSpV[SpN].Dat;
        } else {
            // new id
            // keep track of norm (the element at GoodSpN is now final)
            NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat);
            // increase the pointer to the next good position
            GoodSpN++;
            // and move the new value down to the good position
            MultiNomSpV[GoodSpN] = MultiNomSpV[SpN];
        }
    }
    // only bother if there is something to add (also avoids Trunc/normalize
    // on an empty vector)
    if (MultiNomSpV.Len() > 0) {
        // update the norm with the last element
        NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat);
        // truncate the vector to the merged length
        MultiNomSpV.Trunc(GoodSpN+1);
        // normalize to unit L2 norm (non-zero here: every entry is >= 1.0)
        double Norm = TMath::Sqrt(NormSq);
        TLinAlg::MultiplyScalar(1.0 / Norm, MultiNomSpV, MultiNomSpV);
        // add to the full feature vector
        SpV.AddV(MultiNomSpV);
    }
    // increase the offset by the dimension
    Offset += GetVals();
}
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV) const {
    // Builds the bag-of-words sparse vector for a tokenized document:
    // n-gram generation, term-frequency counting, optional TF and IDF
    // weighting, optional normalization. Keys are term ids.
    // aggregate token counts
    TIntH TermFqH;
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = NgramStrV[TokenStrN];
        // get token ID
        const int TokenId = IsHashing() ?
            (TokenStr.GetHashTrick() % HashDim) : // hashing: always a valid id
            TokenSet.GetKeyId(TokenStr); // vocabulary: -1 for unseen tokens
        // add if known token
        if (TokenId != -1) {
            TermFqH.AddDat(TokenId)++;
        }
    }
    // make a sparse vector out of it
    SpV.Gen(TermFqH.Len(), 0);
    int KeyId = TermFqH.FFirstKeyId();
    while (TermFqH.FNextKeyId(KeyId)) {
        const int TermId = TermFqH.GetKey(KeyId);
        double TermVal = 1.0;
        // term-frequency factor
        if (IsTf()) { TermVal *= double(TermFqH[KeyId]); }
        // inverse-document-frequency factor
        if (IsIdf()) {
            if (ForgetP) {
                // with forgetting enabled, combine current and old document counts
                const double DocFq = double(DocFqV[TermId]) + OldDocFqV[TermId];
                // guard: skip IDF when the (decayed) count is effectively zero,
                // which would make the ratio blow up
                if (DocFq > 0.1) { TermVal *= log((double(Docs) + OldDocs) / DocFq); }
            } else {
                TermVal *= log(double(Docs) / double(DocFqV[TermId]));
            }
        }
        SpV.Add(TIntFltKd(TermId, TermVal));
    }
    SpV.Sort();
    // step (4): normalize the vector if so required
    if (IsNormalize()) {
        TLinAlg::Normalize(SpV);
    }
}
/////////////////////////////////////////////////
// Yahoo-Feature-Selection
// Constructor: for every document in the base, scores candidate words
// (either by an odds-ratio shortcut or via the supplied attribute estimator)
// and stores the top-scoring (word-id, estimate) pairs, sorted by word id,
// into DocIdToWordIdEstVV[DocId].
TYFSelBs::TYFSelBs( const TYFSelType& FSelType, const double& FSels,
 const bool& FSelPosWords, const PAttrEst& AttrEst,
 const TYNegDsType& _YNegDsType, const TYPriorType& YPriorType,
 const PYBs& YBs, const PYDsBs& YDsBs, const PNotify& Notify):
  YNegDsType(_YNegDsType), DocIdToWordIdEstVV(YBs->GetDocs()){
  TNotify::OnNotify(Notify, ntInfo, "Start Feature Selection");
  PDmHd DmHd=new TYDmHd(YBs, YDsBs);
  // negative (background) word distribution, shared across all documents
  PYWordDs NegWordDs=TYDmDs::GetNegWordDs(YNegDsType, YBs, YDsBs);
  PTbValSplit BoolValSplit=TTbValSplit::GetBoolValSplit();
  int DocId=YBs->FFirstDocId();
  int DocIds=0;
  while (YBs->FNextDocId(DocId)){
    // positive word distribution of the current document
    PYWordDs PosWordDs=YDsBs->GetWordDs(DocId);
    DocIds++;
    // number of words to select, per the requested criterion
    int SelWordIds;
    switch (FSelType){
      // fixed number of words
      case yfstFix: SelWordIds=int(FSels); break;
      // percentage of the document's positive words
      case yfstPosPrc: SelWordIds=int(FSels*double(PosWordDs->GetWordIds())); break;
      // percentage of the positive/negative union
      case yfstUnionPrc:{
        PYWordDs UnionWordDs=TYWordDs::GetMerged(PosWordDs, NegWordDs, 1, 1);
        SelWordIds=int(FSels*double(UnionWordDs->GetWordIds())); break;}
      default: Fail; SelWordIds=0;
    }
    // always select at least one word
    if (SelWordIds<=0){SelWordIds=1;}
    // per-document data-set views used by the attribute estimator
    PDmDs DmDs=PDmDs(new TYDmDs(
     false, DocId, YNegDsType, YPriorType, YBs, YDsBs, DmHd));
    PDmDs PriorDmDs=PDmDs(new TYDmDs(
     true, DocId, yndtAll, yptDocs, YBs, YDsBs, DmHd));
    PYWordDs WordDs; PYWordDs TrvWordDs;
    // words scored so far (avoid scoring a word twice)
    TIntH SelWordIdH(SelWordIds);
    // top-SelWordIds estimates, kept sorted descending by estimate
    TFltIntKdV WordEstIdKdV(SelWordIds, 0);
    // traverse both class distributions (0 = negative, 1 = positive)
    for (int CDsc=0; CDsc<TTbVal::BoolVals; CDsc++){
      switch (CDsc){
        case 0: WordDs=NegWordDs; break;
        case 1: WordDs=PosWordDs; break;
        default: Fail;
      }
      // optionally restrict candidates to the document's positive words
      if (FSelPosWords){TrvWordDs=PosWordDs;} else {TrvWordDs=WordDs;}
      int WordIdN=TrvWordDs->FFirstWordId(); int WordId;
      while (TrvWordDs->FNextWordId(WordIdN, WordId)){
        // skip words already scored in an earlier pass
        if (SelWordIdH.IsKey(WordId)){continue;}
        double WordEst;
        if (AttrEst.Empty()){
          // Shortcut: Odds-Ratio
          double PriorSumW=YBs->GetDocs();
          // double PriorSumW=PosWordDs->GetDocs()+NegWordDs->GetDocs();
          IAssert(PriorSumW>0);
          double S1C0Prb=NegWordDs->GetWordPrb(WordId);
          double S1C1Prb=PosWordDs->GetWordPrb(WordId);
          // clamp probabilities away from 0 and 1 so the odds are finite
          if (S1C0Prb==0){S1C0Prb=1/sqr(PriorSumW);}
          if (S1C0Prb==1){S1C0Prb=1-(1/sqr(PriorSumW));}
          double OddsS1C0=S1C0Prb/(1-S1C0Prb);
          if (S1C1Prb==0){S1C1Prb=1/sqr(PriorSumW);}
          if (S1C1Prb==1){S1C1Prb=1-(1/sqr(PriorSumW));}
          double OddsS1C1=S1C1Prb/(1-S1C1Prb);
          // log odds-ratio of positive vs. negative class
          WordEst=log(OddsS1C1/OddsS1C0);
        } else {
          // delegate scoring to the supplied attribute estimator
          WordEst=AttrEst->GetAttrQ(WordId, BoolValSplit, DmDs, PriorDmDs);
        }
        // keep only the top SelWordIds estimates
        WordEstIdKdV.AddSorted(TFltIntKd(WordEst, WordId), false, SelWordIds);
        SelWordIdH.AddKey(WordId);
      }
    }
    // store the selected words as (word-id, estimate) pairs, sorted by word id
    TIntFltKdV& WordIdEstKdV=DocIdToWordIdEstVV[DocId];
    WordIdEstKdV.Gen(WordEstIdKdV.Len(), 0);
    for (int WordIdN=0; WordIdN<WordEstIdKdV.Len(); WordIdN++){
      double WordEst=WordEstIdKdV[WordIdN].Key;
      int WordId=WordEstIdKdV[WordIdN].Dat;
      WordIdEstKdV.Add(TIntFltKd(WordId, WordEst));
    }
    WordIdEstKdV.Sort();
    // progress report every 100 documents
    if (DocIds%100==0){
      TNotify::OnNotify(Notify, ntInfo,
       TStr("...")+TInt::GetStr(DocIds)+" Selections.");}
  }
  TNotify::OnNotify(Notify, ntInfo,
   TStr("Feature Selection Finished (")+ TInt::GetStr(DocIds)+").");
}
TEST(TEmaSpVec, Simple1) {
    try {
        // EMA over sparse vectors; each dense TEma below tracks a single
        // coordinate (keys 2, 5 and 6 respectively) so the sparse result
        // can be compared element-by-element against the dense references.
        TSignalProc::TEmaSpVec sum(100, TSignalProc::TEmaType::etLinear, 0, 10000, 0.001);
        TSignalProc::TEma ema2(100, TSignalProc::TEmaType::etLinear, 0, 10000);
        TSignalProc::TEma ema5(100, TSignalProc::TEmaType::etLinear, 0, 10000);
        TSignalProc::TEma ema6(100, TSignalProc::TEmaType::etLinear, 0, 10000);

        // first update: single element at key 2
        uint64 timestamp1 = 10;
        TIntFltKdV in1;
        in1.Add(TIntFltKd(2, 1.0));
        sum.Update(in1, timestamp1);
        ema2.Update(1.0, timestamp1);
        ema5.Update(0.0, timestamp1);
        ema6.Update(0.0, timestamp1);
        EXPECT_EQ(sum.GetTmMSecs(), timestamp1);
        const TIntFltKdV& res1 = sum.GetValue();
        EXPECT_EQ(res1.Len(), 1);
        EXPECT_EQ(res1[0].Key, 2);
        EXPECT_EQ(res1[0].Dat, 1.0);

        // add another sparse vector, don't remove anything
        uint64 timestamp2 = timestamp1 + 1000;
        TIntFltKdV in2;
        in2.Add(TIntFltKd(5, 2.0));
        sum.Update(in2, timestamp2);
        ema2.Update(0.0, timestamp2);
        ema5.Update(2.0, timestamp2);
        ema6.Update(0.0, timestamp2);
        printf("ema2: %f\n", ema2.GetValue());
        printf("ema5: %f\n", ema5.GetValue());
        printf("ema6: %f\n", ema6.GetValue());
        EXPECT_EQ(sum.GetTmMSecs(), timestamp2);
        const TIntFltKdV& res2 = sum.GetValue();
        EXPECT_EQ(res2.Len(), 2);
        EXPECT_EQ(res2[0].Key, 2);
        EXPECT_EQ(res2[0].Dat, ema2.GetValue());
        EXPECT_EQ(res2[1].Key, 5);
        EXPECT_EQ(res2[1].Dat, ema5.GetValue());

        // third update touches keys 5 and 6
        uint64 timestamp3 = timestamp2 + 1000;
        TIntFltKdV in3;
        in3.Add(TIntFltKd(5, 3.0));
        in3.Add(TIntFltKd(6, 6.0));
        sum.Update(in3, timestamp3);
        ema2.Update(0.0, timestamp3);
        ema5.Update(3.0, timestamp3);
        ema6.Update(6.0, timestamp3);
        printf("ema2: %f\n", ema2.GetValue());
        printf("ema5: %f\n", ema5.GetValue());
        printf("ema6: %f\n", ema6.GetValue());
        EXPECT_EQ(sum.GetTmMSecs(), timestamp3);
        TIntFltKdV res3(sum.GetValue());
        EXPECT_EQ(res3.Len(), 3);
        EXPECT_EQ(res3[0].Key, 2);
        EXPECT_EQ(res3[0].Dat, ema2.GetValue());
        EXPECT_EQ(res3[1].Key, 5);
        EXPECT_EQ(res3[1].Dat, ema5.GetValue());
        EXPECT_EQ(res3[2].Key, 6);
        EXPECT_EQ(res3[2].Dat, ema6.GetValue());
        printf("ema2: %f\n", ema2.GetValue());
        printf("ema5: %f\n", ema5.GetValue());
        printf("ema6: %f\n", ema6.GetValue());
    } catch (PExcept& Except) {
        // FIX: the original passed the TStr object itself through printf's
        // varargs, which is undefined behavior for a non-trivial class type;
        // pass the underlying C string instead.
        printf("Error: %s", Except->GetStr().CStr());
        throw Except;
    }
}
void TNumeric::AddFtr(const double& Val, TIntFltKdV& SpV, int& Offset) const {
    // Numeric feature: a single (possibly transformed) value at the
    // current offset position.
    const double FtrVal = GetFtr(Val);
    SpV.Add(TIntFltKd(Offset, FtrVal));
    // Numeric features take up exactly one dimension.
    Offset += 1;
}