예제 #1
0
void TFtrGenToken::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // step (1): tokenize
    TStrV TokenStrV; GetTokenV(Val, TokenStrV);
    // step (2): aggregate token counts
    TIntH TokenFqH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        if (TokenH.IsKey(TokenStr)) { 
            const int TokenId = TokenH.GetKeyId(TokenStr);
            TokenFqH.AddDat(TokenId)++;
        }
    }
    // step (3): make a sparse vector out of it
    TIntFltKdV ValSpV(TokenFqH.Len(), 0);
    int KeyId = TokenFqH.FFirstKeyId();
    while (TokenFqH.FNextKeyId(KeyId)) {
        const int TokenId = TokenFqH.GetKey(KeyId);
        const int TokenFq = TokenFqH[KeyId];
        const int TokenDocFq = TokenH[TokenId];
        const double IDF = log(double(Docs) / double(TokenDocFq));
        ValSpV.Add(TIntFltKd(TokenId, double(TokenFq) * IDF));
    }
    ValSpV.Sort(); TLinAlg::NormalizeL1(ValSpV);
    // step (4): add the sparse vector to the final feature vector  
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const int Key = ValSpV[ValSpN].Key + Offset;
        const double Dat = ValSpV[ValSpN].Dat;
        SpV.Add(TIntFltKd(Key, Dat));
    }
    Offset += TokenH.Len(); 
}
예제 #2
0
void TFtrGenNumeric::Add(
        const TStr& Val, TIntFltKdV& SpV, int& Offset) const {

    double Flt = GetFlt(Val);
    SpV.Add(TIntFltKd(Offset, Trans(Flt))); 
    Offset++; 
}
예제 #3
0
// Fills SpV with one entry per distinct known feature id found in StrV.
// When FltV is empty every occurrence contributes 1.0, otherwise FltV[i] is
// the contribution of StrV[i]; contributions not above 1e-16 are dropped.
// Entries sharing an id are summed. Values are then replaced by 1.0 when
// IsBinary() holds and the vector is normalized when IsNormalize() holds.
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const {
    const bool ExplicitValsP = !FltV.Empty();
    // explicit values, when given, must line up one-to-one with the strings
    EAssertR(!ExplicitValsP || (StrV.Len() == FltV.Len()), "TMultinomial::AddFtr:: String and double values not aligned");

    // collect (feature id, value) pairs for strings with a known feature id
    SpV.Gen(StrV.Len(), 0);
    for (int StrN = 0; StrN < StrV.Len(); StrN++) {
        const int FtrId = FtrGen.GetFtr(StrV[StrN]);
        // -1 marks a string without a feature id
        if (FtrId == -1) { continue; }
        const double Val = ExplicitValsP ? FltV[StrN].Val : 1.0;
        if (Val > 1e-16) {
            SpV.Add(TIntFltKd(FtrId, Val));
        }
    }
    SpV.Sort();

    // compact in place: LastN is the last slot of the merged prefix
    int LastN = 0;
    for (int ScanN = 1; ScanN < SpV.Len(); ScanN++) {
        const bool SameIdP = (SpV[LastN].Key == SpV[ScanN].Key);
        if (!SameIdP) {
            // new id: append it right after the merged prefix
            LastN++;
            SpV[LastN] = SpV[ScanN];
        } else {
            // same id as the previous slot: accumulate its value
            SpV[LastN].Dat += SpV[ScanN].Dat;
        }
    }
    // drop the leftovers behind the merged prefix
    SpV.Trunc(LastN + 1);

    // optionally flatten all values to 1.0
    if (IsBinary()) {
        for (int SpN = 0; SpN < SpV.Len(); SpN++) {
            SpV[SpN].Dat = 1.0;
        }
    }
    // optionally normalize the result
    if (IsNormalize()) {
        TLinAlg::Normalize(SpV);
    }
}
예제 #4
0
void TMultinomial::AddFtr(const TStr& Str, TIntFltKdV& SpV, int& Offset) const {
    const int FtrId = FtrGen.GetFtr(Str);
    if (FtrId != -1) {
        SpV.Add(TIntFltKd(Offset + FtrId, 1.0));
    }
    Offset += GetDim();
}
예제 #5
0
// Appends WndSize consecutive entries, each with value Wgt, starting at key
// Offset + GetFtr(Val). Offset is then advanced by GetDim().
void TDateWnd::AddFtr(const TTm& Val, TIntFltKdV& SpV, int& Offset) const {
    const int Ftr = GetFtr(Val);
    // first key of the window in the output vector
    const int FirstKey = Offset + Ftr;
    int FtrN = 0;
    while (FtrN < WndSize) {
        SpV.Add(TIntFltKd(FirstKey + FtrN, Wgt));
        FtrN++;
    }
    Offset += GetDim();
}
예제 #6
0
파일: anf.cpp 프로젝트: Aleyasen/Alaki
double CalcEffDiam(const TFltPrV& DistNbrsCdfV, const double& Percentile) {
  TIntFltKdV KdV(DistNbrsCdfV.Len(), 0);
  for (int i = 0; i < DistNbrsCdfV.Len(); i++) {
    KdV.Add(TIntFltKd(int(DistNbrsCdfV[i].Val1()), DistNbrsCdfV[i].Val2));
  }
  return CalcEffDiam(KdV, Percentile);
}
예제 #7
0
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const {
    for (int SpN = 0; SpN < InSpV.Len(); SpN++) {
        const int Id = InSpV[SpN].Key;
        double Val = FtrGen.GetFtr(InSpV[SpN].Dat);
        SpV.Add(TIntFltKd(Offset + Id, Val));
    }
    Offset += GetVals();
}
예제 #8
0
void TCategorical::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // get dimension to set to 1.0
    const int Dim = GetFtr(Val);
    // set to 1.0 if we get a dimension
    if (Dim != -1) { SpV.Add(TIntFltKd(Offset + Dim, 1.0)); }
    // update offset
    Offset += GetDim();
}
예제 #9
0
// Appends a 1.0 entry at key Offset + key id of Val when Val is a key of ValH.
// Offset is always advanced by the number of entries in ValH.
void TFtrGenNominal::Add(
        const TStr& Val, TIntFltKdV& SpV, int& Offset) const {

    const bool KnownP = ValH.IsKey(Val);
    if (KnownP) {
        const int ValId = ValH.GetKeyId(Val);
        SpV.Add(TIntFltKd(Offset + ValId, 1.0));
    }
    // reserve the key range used by this generator
    Offset += ValH.Len();
}
예제 #10
0
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV, int& Offset) const {
    // generate feature 
    TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV);
    // add to the full feature vector and increase offset count
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }
    // increase the offset by the dimension
    Offset += GetDim();
}
예제 #11
0
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV, int& Offset) const {
	// create sparse vector
    TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV);
    // add to the full feature vector and increase offset count
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }    
    // increase the offset by the dimension
    Offset += GetDim();
}
예제 #12
0
파일: json.cpp 프로젝트: lstopar/qminer
// Reads this JSON array of two-element arrays [index, value] and appends one
// (index, value) entry per element to NumSpV, then sorts NumSpV.
// Asserts that this value is an array and that every element is an array of
// exactly two values. NumSpV is not cleared beforehand.
void TJsonVal::GetArrNumSpV(TIntFltKdV& NumSpV) const {
    EAssert(IsArr());
    int ElN = 0;
    while (ElN < GetArrVals()) {
        PJsonVal PairVal = GetArrVal(ElN);
        // each element must itself be a pair
        EAssert(PairVal->IsArr());
        EAssert(PairVal->GetArrVals() ==  2);
        const int Idx = PairVal->GetArrVal(0)->GetInt();
        const double Val = PairVal->GetArrVal(1)->GetNum();
        NumSpV.Add(TIntFltKd(Idx, Val));
        ElN++;
    }
    NumSpV.Sort();
}
예제 #13
0
void TFtrGenMultiNom::AddFtr(const TStrV& StrV, TIntFltKdV& SpV, int& Offset) const {
    // generate feature vector just for this feature generate
    TIntFltKdV MultiNomSpV(StrV.Len(), 0);
    for (int StrN = 0; StrN < StrV.Len(); StrN++) {
        const int FtrId = FtrGen.GetFtr(StrV[StrN]);
        // only use features we've seen during updates
        if (FtrId != -1) {
            MultiNomSpV.Add(TIntFltKd(Offset + FtrId, 1.0));
        }
    }
    MultiNomSpV.Sort();
    // merge elements with same id
    double NormSq = 0.0;
    int GoodSpN = 0;
    for (int SpN = 1; SpN < MultiNomSpV.Len(); SpN++) {
        if (MultiNomSpV[GoodSpN].Key == MultiNomSpV[SpN].Key) {
            // repeatition of previous id
            MultiNomSpV[GoodSpN].Dat += MultiNomSpV[SpN].Dat;
        } else { // new id
            // keep track of norm
            NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat);
            // increase the pointer to the next good position
            GoodSpN++;
            // and move the new value down to the good position
            MultiNomSpV[GoodSpN] = MultiNomSpV[SpN];
        }
    }
    // only bother if there is something to add
    if (MultiNomSpV.Len() > 0) {
        // update the norm with the last element
        NormSq += TMath::Sqr(MultiNomSpV[GoodSpN].Dat);
        // truncate the vector
        MultiNomSpV.Trunc(GoodSpN+1);
        // normalize
        double Norm = TMath::Sqrt(NormSq);
        TLinAlg::MultiplyScalar(1.0 / Norm, MultiNomSpV, MultiNomSpV);
        // add the the full feature vector and increase offset count
        SpV.AddV(MultiNomSpV);
    }
    // increase the offset by the dimension
    Offset += GetVals();
}
예제 #14
0
// Builds the bag-of-words sparse vector for TokenStrV into SpV.
//
// The tokens are expanded with GenerateNgrams; every n-gram is mapped to an id
// either by hashing modulo HashDim (when IsHashing()) or by lookup in TokenSet,
// and n-grams whose id is -1 are ignored. Each distinct id gets the weight
//   1.0 * (term count, if IsTf()) * (log of documents / document frequency,
//   if IsIdf()),
// where the forgetting variant (ForgetP) adds OldDocs / OldDocFqV to the
// counts. The IDF factor is skipped for ids whose document frequency is zero,
// so the weight stays finite. SpV is sorted and, when IsNormalize() holds,
// normalized.
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV) const {
    // aggregate n-gram counts
    TIntH TermFqH;
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = NgramStrV[TokenStrN];
        // get token ID
        const int TokenId = IsHashing() ?
            (TokenStr.GetHashTrick() % HashDim) : // hashing
            TokenSet.GetKeyId(TokenStr); // vocabulary
        // add if known token
        if (TokenId != -1) {
            TermFqH.AddDat(TokenId)++;
        }
    }
    // make a sparse vector out of it
    SpV.Gen(TermFqH.Len(), 0);
    int KeyId = TermFqH.FFirstKeyId();
    while (TermFqH.FNextKeyId(KeyId)) {
        const int TermId = TermFqH.GetKey(KeyId);
        double TermVal = 1.0;
        if (IsTf()) { TermVal *= double(TermFqH[KeyId]); }
        if (IsIdf()) {
            if (ForgetP) {
                const double DocFq = double(DocFqV[TermId]) + OldDocFqV[TermId];
                if (DocFq > 0.1) { TermVal *= log((double(Docs) + OldDocs) / DocFq); }
            } else {
                // Same guard as the forgetting branch: an id whose document
                // frequency is still zero would otherwise yield log(x / 0),
                // i.e. inf or NaN, and poison the normalization below.
                const double DocFq = double(DocFqV[TermId]);
                if (DocFq > 0.0) { TermVal *= log(double(Docs) / DocFq); }
            }
        }
        SpV.Add(TIntFltKd(TermId, TermVal));
    }
    SpV.Sort();
    // normalize the vector if so required
    if (IsNormalize()) { TLinAlg::Normalize(SpV); }
}
예제 #15
0
/////////////////////////////////////////////////
// Yahoo-Feature-Selection
// Runs feature selection for every document id enumerated from YBs.
// For each document a number of words (SelWordIds, derived from FSelType and
// FSels) is selected by a per-word estimate, and the resulting
// (word id, estimate) pairs are stored, sorted, in DocIdToWordIdEstVV[DocId].
// The estimate is either AttrEst->GetAttrQ(...) or, when AttrEst is empty,
// a log odds-ratio computed from the positive and negative word
// distributions. Progress is reported through Notify.
TYFSelBs::TYFSelBs(
 const TYFSelType& FSelType, const double& FSels,
 const bool& FSelPosWords, const PAttrEst& AttrEst,
 const TYNegDsType& _YNegDsType, const TYPriorType& YPriorType,
 const PYBs& YBs, const PYDsBs& YDsBs, const PNotify& Notify):
  YNegDsType(_YNegDsType), DocIdToWordIdEstVV(YBs->GetDocs()){
  TNotify::OnNotify(Notify, ntInfo, "Start Feature Selection");

  // objects shared by all documents
  PDmHd DmHd=new TYDmHd(YBs, YDsBs);
  PYWordDs NegWordDs=TYDmDs::GetNegWordDs(YNegDsType, YBs, YDsBs);
  PTbValSplit BoolValSplit=TTbValSplit::GetBoolValSplit();

  // DocIds counts the documents processed so far
  int DocId=YBs->FFirstDocId(); int DocIds=0;
  while (YBs->FNextDocId(DocId)){
    PYWordDs PosWordDs=YDsBs->GetWordDs(DocId); DocIds++;

    // number of words to select for this document:
    //  yfstFix      - FSels taken as an absolute count
    //  yfstPosPrc   - FSels times the word count of the positive distribution
    //  yfstUnionPrc - FSels times the word count of the merged pos/neg distribution
    int SelWordIds;
    switch (FSelType){
      case yfstFix: SelWordIds=int(FSels); break;
      case yfstPosPrc:
        SelWordIds=int(FSels*double(PosWordDs->GetWordIds())); break;
      case yfstUnionPrc:{
        PYWordDs UnionWordDs=TYWordDs::GetMerged(PosWordDs, NegWordDs, 1, 1);
        SelWordIds=int(FSels*double(UnionWordDs->GetWordIds())); break;}
      default: Fail; SelWordIds=0;
    }
    // always select at least one word
    if (SelWordIds<=0){SelWordIds=1;}

    PDmDs DmDs=PDmDs(new TYDmDs(
     false, DocId, YNegDsType, YPriorType, YBs, YDsBs, DmHd));
    PDmDs PriorDmDs=PDmDs(new TYDmDs(
     true, DocId, yndtAll, yptDocs, YBs, YDsBs, DmHd));
    PYWordDs WordDs; PYWordDs TrvWordDs;
    // SelWordIdH remembers the word ids that were already scored;
    // WordEstIdKdV collects (estimate, word id) pairs
    TIntH SelWordIdH(SelWordIds);
    TFltIntKdV WordEstIdKdV(SelWordIds, 0);
    // pass 0 uses the negative distribution, pass 1 the positive one
    for (int CDsc=0; CDsc<TTbVal::BoolVals; CDsc++){
      switch (CDsc){
        case 0: WordDs=NegWordDs; break;
        case 1: WordDs=PosWordDs; break;
        default: Fail;
      }
      // with FSelPosWords only words of the positive distribution are traversed
      if (FSelPosWords){TrvWordDs=PosWordDs;} else {TrvWordDs=WordDs;}
      int WordIdN=TrvWordDs->FFirstWordId(); int WordId;
      while (TrvWordDs->FNextWordId(WordIdN, WordId)){
        // each word is scored at most once per document
        if (SelWordIdH.IsKey(WordId)){continue;}
        double WordEst;
        if (AttrEst.Empty()){
          // Shortcut: Odds-Ratio
          double PriorSumW=YBs->GetDocs();
//          double PriorSumW=PosWordDs->GetDocs()+NegWordDs->GetDocs();
          IAssert(PriorSumW>0);
          double S1C0Prb=NegWordDs->GetWordPrb(WordId);
          double S1C1Prb=PosWordDs->GetWordPrb(WordId);

          // probabilities of exactly 0 or 1 are moved inside (0,1) by
          // 1/PriorSumW^2 so the odds below stay finite and non-zero
          if (S1C0Prb==0){S1C0Prb=1/sqr(PriorSumW);}
          if (S1C0Prb==1){S1C0Prb=1-(1/sqr(PriorSumW));}
          double OddsS1C0=S1C0Prb/(1-S1C0Prb);

          if (S1C1Prb==0){S1C1Prb=1/sqr(PriorSumW);}
          if (S1C1Prb==1){S1C1Prb=1-(1/sqr(PriorSumW));}
          double OddsS1C1=S1C1Prb/(1-S1C1Prb);

          // log of (positive odds / negative odds)
          WordEst=log(OddsS1C1/OddsS1C0);
        } else {
          WordEst=AttrEst->GetAttrQ(WordId, BoolValSplit, DmDs, PriorDmDs);
        }
        // NOTE(review): presumably keeps the vector in descending order of
        // estimate and capped at SelWordIds entries - confirm against
        // TVec::AddSorted
        WordEstIdKdV.AddSorted(TFltIntKd(WordEst, WordId), false, SelWordIds);
        SelWordIdH.AddKey(WordId);
      }
    }
    // store the selection for this document as (word id, estimate) pairs
    TIntFltKdV& WordIdEstKdV=DocIdToWordIdEstVV[DocId];
    WordIdEstKdV.Gen(WordEstIdKdV.Len(), 0);
    for (int WordIdN=0; WordIdN<WordEstIdKdV.Len(); WordIdN++){
      double WordEst=WordEstIdKdV[WordIdN].Key;
      int WordId=WordEstIdKdV[WordIdN].Dat;
      WordIdEstKdV.Add(TIntFltKd(WordId, WordEst));
    }
    WordIdEstKdV.Sort();
    // report progress every 100 documents
    if (DocIds%100==0){
      TNotify::OnNotify(Notify, ntInfo,
       TStr("...")+TInt::GetStr(DocIds)+" Selections.");}
  }
  TNotify::OnNotify(Notify, ntInfo,
   TStr("Feature Selection Finished (")+ TInt::GetStr(DocIds)+").");
}
예제 #16
0
// Feeds three sparse vectors into a TEmaSpVec and checks after every update
// that its timestamp and its (key, value) entries match three scalar TEma
// instances (for keys 2, 5 and 6) that receive the same values, with 0.0 for
// keys that are absent from the update.
TEST(TEmaSpVec, Simple1) {
	try {
		TSignalProc::TEmaSpVec sum(100, TSignalProc::TEmaType::etLinear, 0, 10000, 0.001);

		// scalar references, one per key used below
		TSignalProc::TEma ema2(100, TSignalProc::TEmaType::etLinear, 0, 10000);
		TSignalProc::TEma ema5(100, TSignalProc::TEmaType::etLinear, 0, 10000);
		TSignalProc::TEma ema6(100, TSignalProc::TEmaType::etLinear, 0, 10000);

		// first update: only key 2 is present
		uint64 timestamp1 = 10;
		TIntFltKdV in1;
		in1.Add(TIntFltKd(2, 1.0));
		
		sum.Update(in1, timestamp1);
		ema2.Update(1.0, timestamp1);
		ema5.Update(0.0, timestamp1);
		ema6.Update(0.0, timestamp1);

		EXPECT_EQ(sum.GetTmMSecs(), timestamp1);
		const TIntFltKdV& res1 = sum.GetValue();
		EXPECT_EQ(res1.Len(), 1);
		EXPECT_EQ(res1[0].Key, 2);
		EXPECT_EQ(res1[0].Dat, 1.0);

		// add another sparse vector, don't remove anything
		uint64 timestamp2 = timestamp1 + 1000;
		TIntFltKdV in2;
		in2.Add(TIntFltKd(5, 2.0));

		sum.Update(in2, timestamp2);
		ema2.Update(0.0, timestamp2);
		ema5.Update(2.0, timestamp2);
		ema6.Update(0.0, timestamp2);
		printf("ema2: %f\n", ema2.GetValue());
		printf("ema5: %f\n", ema5.GetValue());
		printf("ema6: %f\n", ema6.GetValue());

		EXPECT_EQ(sum.GetTmMSecs(), timestamp2);
		const TIntFltKdV& res2 = sum.GetValue();
		EXPECT_EQ(res2.Len(), 2);
		EXPECT_EQ(res2[0].Key, 2);
		EXPECT_EQ(res2[0].Dat, ema2.GetValue());
		EXPECT_EQ(res2[1].Key, 5);
		EXPECT_EQ(res2[1].Dat, ema5.GetValue());

		// third update: keys 5 and 6 are present, key 2 is absent
		uint64 timestamp3 = timestamp2 + 1000;
		TIntFltKdV in3;
		in3.Add(TIntFltKd(5, 3.0));
		in3.Add(TIntFltKd(6, 6.0));

		sum.Update(in3, timestamp3);
		ema2.Update(0.0, timestamp3);
		ema5.Update(3.0, timestamp3);
		ema6.Update(6.0, timestamp3);
		printf("ema2: %f\n", ema2.GetValue());
		printf("ema5: %f\n", ema5.GetValue());
		printf("ema6: %f\n", ema6.GetValue());

		EXPECT_EQ(sum.GetTmMSecs(), timestamp3);
		TIntFltKdV res3(sum.GetValue());
		EXPECT_EQ(res3.Len(), 3);
		EXPECT_EQ(res3[0].Key, 2);
		EXPECT_EQ(res3[0].Dat, ema2.GetValue());
		EXPECT_EQ(res3[1].Key, 5);
		EXPECT_EQ(res3[1].Dat, ema5.GetValue());
		EXPECT_EQ(res3[2].Key, 6);
		EXPECT_EQ(res3[2].Dat, ema6.GetValue());
		printf("ema2: %f\n", ema2.GetValue());
		printf("ema5: %f\n", ema5.GetValue());
		printf("ema6: %f\n", ema6.GetValue());

	} catch (PExcept& Except) {
		// NOTE(review): if GetStr() returns a string object rather than a
		// C string, it must not be passed to "%s" directly - confirm its
		// return type and convert to a C string if needed.
		printf("Error: %s", Except->GetStr());
		// re-throw the exception that is currently being handled instead of
		// throwing a copy of it
		throw;
	}
}
예제 #17
0
void TNumeric::AddFtr(const double& Val, TIntFltKdV& SpV, int& Offset) const {
    SpV.Add(TIntFltKd(Offset, GetFtr(Val))); Offset++;
}