Example #1
void TFtrGenToken::Add(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // step (1): tokenize
    TStrV TokenStrV; GetTokenV(Val, TokenStrV);
    // step (2): aggregate token counts
    TIntH TokenFqH;
    for (int TokenStrN = 0; TokenStrN < TokenStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = TokenStrV[TokenStrN];
        if (TokenH.IsKey(TokenStr)) { 
            const int TokenId = TokenH.GetKeyId(TokenStr);
            TokenFqH.AddDat(TokenId)++;
        }
    }
    // step (3): make a sparse vector out of it
    TIntFltKdV ValSpV(TokenFqH.Len(), 0);
    int KeyId = TokenFqH.FFirstKeyId();
    while (TokenFqH.FNextKeyId(KeyId)) {
        const int TokenId = TokenFqH.GetKey(KeyId);
        const int TokenFq = TokenFqH[KeyId];
        const int TokenDocFq = TokenH[TokenId];
        const double IDF = log(double(Docs) / double(TokenDocFq));
        ValSpV.Add(TIntFltKd(TokenId, double(TokenFq) * IDF));
    }
    ValSpV.Sort(); TLinAlg::NormalizeL1(ValSpV);
    // step (4): add the sparse vector to the final feature vector  
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const int Key = ValSpV[ValSpN].Key + Offset;
        const double Dat = ValSpV[ValSpN].Dat;
        SpV.Add(TIntFltKd(Key, Dat));
    }
    Offset += TokenH.Len(); 
}
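
The TF-IDF weighting in step (3) can be checked in isolation: each token's count is multiplied by log(Docs / TokenDocFq) and the resulting vector is L1-normalized. Below is a minimal standalone sketch of the same arithmetic on std:: containers; the names docFreq, numDocs, and tokenCounts are illustrative, not part of the API used above.

#include <cmath>
#include <cstdio>
#include <map>
#include <vector>

int main() {
    // document frequency per token id and corpus size (illustrative values)
    std::map<int, int> docFreq = { {0, 2}, {1, 5}, {2, 1} };
    const double numDocs = 10.0;
    // token counts for the current value, as built in step (2) above
    std::map<int, int> tokenCounts = { {0, 3}, {2, 1} };
    // step (3): weight each token by TF * IDF
    std::vector<std::pair<int, double>> spV;
    double l1 = 0.0;
    for (const auto& kv : tokenCounts) {
        const double idf = std::log(numDocs / docFreq[kv.first]);
        const double wgt = kv.second * idf;
        spV.push_back({kv.first, wgt});
        l1 += std::fabs(wgt);
    }
    // L1 normalization, mirroring TLinAlg::NormalizeL1
    for (auto& kv : spV) { kv.second /= l1; }
    for (const auto& kv : spV) { printf("%d:%f\n", kv.first, kv.second); }
    return 0;
}
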
Example #2
void TFtrGenNumeric::Add(
        const TStr& Val, TIntFltKdV& SpV, int& Offset) const {

    double Flt = GetFlt(Val);
    SpV.Add(TIntFltKd(Offset, Trans(Flt))); 
    Offset++; 
}
Example #3
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV) const {
    // make sure we either do not have explicit values, or their dimension matches with string keys
    EAssertR(FltV.Empty() || (StrV.Len() == FltV.Len()), "TMultinomial::AddFtr: String and double values not aligned");
    // generate internal feature vector
    SpV.Gen(StrV.Len(), 0);
    for (int StrN = 0; StrN < StrV.Len(); StrN++) {
        const int FtrId = FtrGen.GetFtr(StrV[StrN]);
        // only use features we've seen during updates
        if (FtrId != -1) {
            const double Val = FltV.Empty() ? 1.0 : FltV[StrN].Val;
            if (Val > 1e-16) { SpV.Add(TIntFltKd(FtrId, Val)); }
        }
    }
    SpV.Sort();
    // merge elements with the same id
    int GoodSpN = 0;
    for (int SpN = 1; SpN < SpV.Len(); SpN++) {
        if (SpV[GoodSpN].Key == SpV[SpN].Key) {
            // repetition of previous id, sum counts
            SpV[GoodSpN].Dat += SpV[SpN].Dat;
        } else {
            // increase the pointer to the next good position
            GoodSpN++;
            // and move the new value down to the good position
            SpV[GoodSpN] = SpV[SpN];
        }
    }
    // truncate the vector
    SpV.Trunc(GoodSpN + 1);
    // replace values with 1 if needed
    if (IsBinary()) { for (TIntFltKd& Sp : SpV) { Sp.Dat = 1.0; } }
    // final normalization, if needed
    if (IsNormalize()) { TLinAlg::Normalize(SpV); }    
}
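
The merge loop above is the standard in-place compaction idiom for a sorted key-value array: a read index scans forward while a write index (GoodSpN) points at the last unique key. A minimal sketch of the same idiom on a plain vector of pairs:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// sum values of entries sharing a key; input must be sorted by key
void MergeSameKeys(std::vector<std::pair<int, double>>& v) {
    if (v.empty()) { return; }
    size_t good = 0;
    for (size_t i = 1; i < v.size(); i++) {
        if (v[good].first == v[i].first) {
            v[good].second += v[i].second; // repeated key: sum the values
        } else {
            v[++good] = v[i]; // new key: move it down to the next good slot
        }
    }
    v.resize(good + 1); // truncate, as SpV.Trunc(GoodSpN + 1) does above
}

int main() {
    std::vector<std::pair<int, double>> v = { {1, 1.0}, {3, 0.5}, {1, 2.0} };
    std::sort(v.begin(), v.end());
    MergeSameKeys(v);
    for (const auto& kv : v) { printf("%d:%g\n", kv.first, kv.second); }
    return 0; // prints 1:3 and 3:0.5
}
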
Example #4
void TDateWnd::AddFtr(const TTm& Val, TIntFltKdV& SpV, int& Offset) const {
    const int Ftr = GetFtr(Val);
    for (int FtrN = 0; FtrN < WndSize; FtrN++) {
        SpV.Add(TIntFltKd(Offset + Ftr + FtrN, Wgt));
    }
    Offset += GetDim();
}
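
GetFtr(Val) maps the timestamp to a base bucket index, and the loop then sets WndSize consecutive dimensions to Wgt, i.e. a one-hot encoding smeared over neighbouring time buckets; note that Offset still advances by the full GetDim(), so subsequent generators stay aligned.
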
Example #5
void TMultinomial::AddFtr(const TStr& Str, TIntFltKdV& SpV, int& Offset) const {
    const int FtrId = FtrGen.GetFtr(Str);
    if (FtrId != -1) {
        SpV.Add(TIntFltKd(Offset + FtrId, 1.0));
    }
    Offset += GetDim();
}
Example #6
void TFtrGenNominal::Add(
        const TStr& Val, TIntFltKdV& SpV, int& Offset) const {

    if (ValH.IsKey(Val)) { 
        SpV.Add(TIntFltKd(Offset + ValH.GetKeyId(Val), 1.0)); 
    } 
    Offset += ValH.Len(); 
}
Example #7
void TSparseNumeric::AddFtr(const TIntFltKdV& InSpV, TIntFltKdV& SpV, int& Offset) const {
    for (int SpN = 0; SpN < InSpV.Len(); SpN++) {
        const int Id = InSpV[SpN].Key;
        double Val = FtrGen.GetFtr(InSpV[SpN].Dat);
        SpV.Add(TIntFltKd(Offset + Id, Val));
    }
    Offset += GetVals();
}
Example #8
void TCategorical::AddFtr(const TStr& Val, TIntFltKdV& SpV, int& Offset) const {
    // get dimension to set to 1.0
    const int Dim = GetFtr(Val);
    // set to 1.0 if we get a dimension
    if (Dim != -1) { SpV.Add(TIntFltKd(Offset + Dim, 1.0)); }
    // update offset
    Offset += GetDim();
}
Example #9
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV, int& Offset) const {
    // create sparse vector
    TIntFltKdV ValSpV; AddFtr(TokenStrV, ValSpV);
    // add to the full feature vector and increase offset count
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }    
    // increase the offset by the dimension
    Offset += GetDim();
}
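
This example and Example #10 below share the same concatenation idiom: each generator appends its sub-vector with keys shifted by Offset, then advances Offset by its own dimension, so several generators can be stacked into one joint feature space. A minimal sketch of the idiom (AddAtOffset is a hypothetical helper, not part of the API above):

#include <cstdio>
#include <utility>
#include <vector>

typedef std::vector<std::pair<int, double>> TSpVec;

// append Sub into Full with keys shifted by Offset; advance Offset by Dim
void AddAtOffset(const TSpVec& Sub, int Dim, TSpVec& Full, int& Offset) {
    for (const auto& kv : Sub) {
        Full.push_back({Offset + kv.first, kv.second});
    }
    Offset += Dim; // always advance by the full dimension, however sparse Sub is
}

int main() {
    TSpVec full; int offset = 0;
    AddAtOffset({{0, 1.0}}, 3, full, offset);           // generator 1, dim 3
    AddAtOffset({{1, 0.5}, {2, 0.5}}, 4, full, offset); // generator 2, dim 4
    for (const auto& kv : full) { printf("%d:%g ", kv.first, kv.second); }
    printf("\n"); // 0:1 4:0.5 5:0.5 -- a joint space of dimension 7
    return 0;
}
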
Example #10
void TMultinomial::AddFtr(const TStrV& StrV, const TFltV& FltV, TIntFltKdV& SpV, int& Offset) const {
    // generate feature 
    TIntFltKdV ValSpV; AddFtr(StrV, FltV, ValSpV);
    // add to the full feature vector and increase offset count
    for (int ValSpN = 0; ValSpN < ValSpV.Len(); ValSpN++) {
        const TIntFltKd& ValSp = ValSpV[ValSpN];
        SpV.Add(TIntFltKd(Offset + ValSp.Key, ValSp.Dat));
    }
    // increase the offset by the dimension
    Offset += GetDim();
}
Example #11
void TJsonVal::GetArrNumSpV(TIntFltKdV& NumSpV) const {
    EAssert(IsArr());
    for (int ElN = 0; ElN < GetArrVals(); ElN++) {
        PJsonVal ArrVal = GetArrVal(ElN);
        EAssert(ArrVal->IsArr());
        EAssert(ArrVal->GetArrVals() == 2);
        int Idx = ArrVal->GetArrVal(0)->GetInt();
        double Val = ArrVal->GetArrVal(1)->GetNum();
        NumSpV.Add(TIntFltKd(Idx, Val));
    }
    NumSpV.Sort();
}
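
GetArrNumSpV expects a JSON array of two-element [index, value] pairs, e.g. [[3, 0.2], [0, 1.5]]; each pair is appended to NumSpV and the result is sorted by index, here yielding 0:1.5, 3:0.2.
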
Example #12
void TStrFeatureSpace::FromStr(const TStr& Serialized, TIntFltKdV& Vec, char Sep) const {
	TStrV Toks;
	Serialized.SplitOnAllCh(Sep, Toks, true);
	Vec.Gen(Toks.Len(), 0);
	for (int i = 0; i < Toks.Len(); i++) {
		TStr Key, Value;
		Toks[i].SplitOnCh(Key, ':', Value);
		TStrFSSize FeatureId;
		if (GetIfExistsId(Key, FeatureId)) {
			double FeatureWgt;
			if (Value.IsFlt(FeatureWgt)) {
				TIntFltKd& Kv = Vec[Vec.Add()];
				Kv.Key = FeatureId;
				Kv.Dat = FeatureWgt;
			} else {
				EFailR((Value + TStr(" is not a valid floating point number.")).CStr());
			}
		}
	}

	Vec.Sort();
}
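
FromStr is the matching deserializer for a key:value format: with Sep = ' ', an input like "cat:0.5 dog:1.5" is split into tokens, each token is split on ':', keys unknown to the feature space are silently skipped, and a non-numeric value aborts via EFailR.
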
Example #13
void TBagOfWords::AddFtr(const TStrV& TokenStrV, TIntFltKdV& SpV) const {
    // aggregate token counts
    TIntH TermFqH;
    TStrV NgramStrV;
    GenerateNgrams(TokenStrV, NgramStrV);
    for (int TokenStrN = 0; TokenStrN < NgramStrV.Len(); TokenStrN++) {
        const TStr& TokenStr = NgramStrV[TokenStrN];
        // get token ID
        const int TokenId = IsHashing() ?
            (TokenStr.GetHashTrick() % HashDim) : // hashing
            TokenSet.GetKeyId(TokenStr); // vocabulary
        // add if known token
        if (TokenId != -1) {
            TermFqH.AddDat(TokenId)++;
        }
    }
    // make a sparse vector out of it
    SpV.Gen(TermFqH.Len(), 0);
    int KeyId = TermFqH.FFirstKeyId();
    while (TermFqH.FNextKeyId(KeyId)) {
        const int TermId = TermFqH.GetKey(KeyId);
        double TermVal = 1.0;
        if (IsTf()) { TermVal *= double(TermFqH[KeyId]); }
        if (IsIdf()) {
            if (ForgetP) {
                const double DocFq = double(DocFqV[TermId]) + OldDocFqV[TermId];
                if (DocFq > 0.1) { TermVal *= log((double(Docs) + OldDocs) / DocFq); }
            } else {
                TermVal *= log(double(Docs) / double(DocFqV[TermId]));
            }
        }
        SpV.Add(TIntFltKd(TermId, TermVal));
    }
    SpV.Sort();
    // normalize the vector if required
    if (IsNormalize()) { TLinAlg::Normalize(SpV); }
}
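
The hashing branch above (TokenStr.GetHashTrick() % HashDim) is the feature-hashing trick: it maps an unbounded vocabulary into a fixed number of buckets without storing a dictionary, at the cost of occasional collisions. A minimal sketch with std::hash standing in for GetHashTrick (an assumption; GetHashTrick is the library's own string hash):

#include <cstdio>
#include <functional>
#include <map>
#include <string>
#include <vector>

int main() {
    const size_t hashDim = 16; // fixed dimension of the hashed feature space
    std::vector<std::string> tokens = {"cat", "sat", "mat", "cat"};
    std::map<int, int> termFq;
    for (const auto& tok : tokens) {
        // the bucket id replaces a vocabulary lookup; collisions are possible
        const int tokenId = (int)(std::hash<std::string>{}(tok) % hashDim);
        termFq[tokenId]++; // aggregate term counts per bucket
    }
    for (const auto& kv : termFq) { printf("%d:%d\n", kv.first, kv.second); }
    return 0;
}
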
Example #14
void TNumeric::AddFtr(const double& Val, TIntFltKdV& SpV, int& Offset) const {
    SpV.Add(TIntFltKd(Offset, GetFtr(Val))); Offset++;
}
Example #15
void TEmaSpVec::Update(const TIntFltKdV& Val, const uint64& NewTmMSecs) {
	double TmInterval1;
	// EMA(first_point) = first_point (no smoothing is possible)
	if (InitMinMSecs == 0) {
		if (LastVal.Empty()) { 
			LastVal = Val; 
			Ema = Val; 
			TmMSecs = NewTmMSecs; 
			InitP = true;  
			return; 
		}
	}
	if (NewTmMSecs == TmMSecs) {
		TmInterval1 = 1.0;
	} else {
		TmInterval1 = (double)(NewTmMSecs - TmMSecs);
	}
	if (InitP) {
		// compute parameters for EMA
		double Alpha = TmInterval1 / TmInterval;
		const double Mi = exp(-Alpha);
		const double Ni = GetNi(Alpha, Mi);
		// compute new ema
		//Ema = Mi*Ema + (Ni - Mi)*LastVal + (1.0 - Ni)*Val;
		TIntFltKdV Tmp;
		TLinAlg::LinComb(Mi, Ema, Ni - Mi, LastVal, Tmp);		
		TLinAlg::LinComb(1, Tmp, 1.0 - Ni, Val, Ema);
	} else {
		// update buffers
		InitValV.Add(Val);
		InitMSecsV.Add(NewTmMSecs);
		// initialize when enough data
		const uint64 StartInitMSecs = InitMSecsV[0] + InitMinMSecs;
		if (StartInitMSecs < NewTmMSecs) {
			// Initialize using "buildup time interval",
			//TODO: check how interpolation type influences this code
			const int Vals = InitMSecsV.Len();
			// compute weights for each value in buffer
			TFltV WeightV(Vals, 0);
			for (int ValN = 0; ValN < Vals; ValN++) {
				const double Alpha = (double)(TmInterval1);
				WeightV.Add(exp(-Alpha));
			}
			// normalize weights so they sum to 1.0
			TLinAlg::NormalizeL1(WeightV);
			// compute initial value of EMA as weighted sum
			//Ema = TLinAlg::DotProduct(WeightV, InitValV);
			TIntFltKdV Tmp;
			for (int i = 0; i < WeightV.Len(); i++) {
				TIntFltKdV Tmp2;
				TLinAlg::LinComb(1, Tmp, WeightV[i], InitValV[i], Tmp2);
				Tmp = Tmp2;
			}
			Ema = Tmp;

			// mark that we are done and clean up after us
			InitP = true; 
			InitValV.Clr();
			InitMSecsV.Clr();
		}
	}
	// remove dimensions below the cutoff
	TIntFltKdV TmpEma;
	//printf("cutoff %f\n", Cutoff.Val);
	for (int i = 0; i < Ema.Len(); i++) {
		if (TFlt::Abs(Ema[i].Dat.Val) >= Cutoff) {
			TmpEma.Add(Ema[i]);
		}
	}
	Ema = TmpEma;

	// update last value
	LastVal = Val;
	// update current time
	TmMSecs = NewTmMSecs;
}
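
The Mi/Ni coefficients implement exponential smoothing for irregularly spaced samples: with alpha = dt / TmInterval, Mi = exp(-alpha) is the weight of the previous EMA, and Ni corrects for how the signal is interpolated across the gap (GetNi is not shown above). A scalar sketch of the same update, assuming linear interpolation, for which Ni = (1 - Mi) / alpha:

#include <cmath>
#include <cstdio>

// one EMA step on an irregular time series (linear interpolation assumed)
double EmaStep(double ema, double lastVal, double val,
               double dtMSecs, double tmInterval) {
    const double alpha = dtMSecs / tmInterval;
    const double mi = std::exp(-alpha);
    const double ni = (1.0 - mi) / alpha; // GetNi for linear interpolation (assumed)
    // same form as the commented formula above:
    // Ema = Mi*Ema + (Ni - Mi)*LastVal + (1.0 - Ni)*Val
    return mi * ema + (ni - mi) * lastVal + (1.0 - ni) * val;
}

int main() {
    double ema = 1.0, lastVal = 1.0;
    const double vals[] = {2.0, 0.5, 3.0};         // samples ...
    const double gaps[] = {1000.0, 250.0, 4000.0}; // ... at uneven gaps (ms)
    for (int i = 0; i < 3; i++) {
        ema = EmaStep(ema, lastVal, vals[i], gaps[i], 1000.0);
        lastVal = vals[i];
        printf("ema = %f\n", ema);
    }
    return 0;
}
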