/** Adds the (optionally absolute-valued) sparse vector num, scaled by alpha,
 * onto the dense buffer vec of dimension dim (dim must cover all features).
 */
template<class ST> void CSparseFeatures<ST>::add_to_dense_vec(float64_t alpha,
		int32_t num, float64_t* vec, int32_t dim, bool abs_val)
{
	REQUIRE(vec, "add_to_dense_vec(num=%d,dim=%d): vec must not be NULL\n",
			num, dim);
	REQUIRE(dim>=get_num_features(),
			"add_to_dense_vec(num=%d,dim=%d): dim should contain number of features %d\n",
			num, dim, get_num_features());

	SGSparseVector<ST> sv=get_sparse_feature_vector(num);

	if (sv.features)
	{
		SGSparseVectorEntry<ST>* entries=sv.features;
		const int32_t n=sv.num_feat_entries;

		if (abs_val)
		{
			// accumulate alpha*|value| for every stored non-zero
			for (int32_t k=0; k<n; k++)
				vec[entries[k].feat_index]+=alpha*CMath::abs(entries[k].entry);
		}
		else
		{
			// accumulate alpha*value for every stored non-zero
			for (int32_t k=0; k<n; k++)
				vec[entries[k].feat_index]+=alpha*entries[k].entry;
		}
	}

	free_sparse_feature_vector(num);
}
/** Copies column col of feature matrix num into the pre-allocated vector out.
 * out must hold at least get_num_features() elements.
 */
template< class ST > void CMatrixFeatures< ST >::get_feature_vector_col(
		SGVector< ST > out, int32_t num, int32_t col) const
{
	// validate the feature-vector index
	if ( num < 0 || num >= get_num_vectors() )
	{
		SG_ERROR("The index of the feature vector to get must be between "
			 "0 and %d (get_num_vectors()-1)\n", get_num_vectors()-1);
	}

	// dimensions of the selected feature matrix
	int32_t mat_cols = m_features[num].num_cols;
	int32_t mat_rows = m_features[num].num_rows;

	// validate the column index against this particular matrix
	if ( col < 0 || col >= mat_cols )
	{
		SG_ERROR("The index of the column to get must be between "
			 "0 and %d (#columns of the feature vector)\n", mat_cols);
	}

	// the output vector must provide enough room
	if ( out.vlen < get_num_features() )
	{
		SG_ERROR("The vector out must have space to hold at least "
			 "%d (get_num_features()) elements\n", get_num_features());
	}

	// column-major layout: column col begins at offset col*mat_rows
	int32_t offset = col*mat_rows;
	for ( int32_t idx = 0 ; idx < get_num_features(); ++idx )
		out[idx] = m_features[num][offset + idx];
}
/** Returns the value at dimension index of (possibly computed/cached)
 * sparse feature vector num; zero when the entry is not stored.
 */
template<class ST> ST CSparseFeatures<ST>::get_feature(int32_t num, int32_t index)
{
	REQUIRE(index>=0 && index<get_num_features(),
		"get_feature(num=%d,index=%d): index exceeds [0;%d]\n",
		num, index, get_num_features()-1);

	// acquire the vector, read a single entry, then release it again
	SGSparseVector<ST> sv=get_sparse_feature_vector(num);
	const ST value=sv.get_feature(index);
	free_sparse_feature_vector(num);

	return value;
}
/** Installs sm as the backing sparse feature matrix. Not permitted while a
 * subset is active. Every vector is checked to fit within the matrix's
 * feature dimensionality.
 */
template<class ST> void CSparseFeatures<ST>::set_sparse_feature_matrix(SGSparseMatrix<ST> sm)
{
	if (m_subset_stack->has_subsets())
		SG_ERROR("Not allowed with subset\n");

	sparse_feature_matrix=sm;

	// TODO: check should be implemented in sparse matrix class
	for (int32_t idx=0; idx<get_num_vectors(); idx++)
	{
		SGSparseVector<ST> vec=get_sparse_feature_vector(idx);
		REQUIRE(get_num_features() >= vec.get_num_dimensions(),
			"sparse_matrix[%d] check failed (matrix features %d >= vector dimension %d)\n",
			idx, get_num_features(), vec.get_num_dimensions());
	}
}
/** Materializes sparse feature vector num as a dense SGVector with
 * get_num_features() entries (unset dimensions become zero).
 */
template<class ST> SGVector<ST> CSparseFeatures<ST>::get_full_feature_vector(int32_t num)
{
	SGSparseVector<ST> sparse=get_sparse_feature_vector(num);
	SGVector<ST> result=sparse.get_dense(get_num_features());
	free_sparse_feature_vector(num);
	return result;
}
/** Stores vec as feature matrix num. The matrix must agree with the
 * established feature count (rows), unless none has been fixed yet.
 */
template< class ST > void CMatrixFeatures< ST >::set_feature_vector(
		SGMatrix< ST > const vec, int32_t num)
{
	// validate the target slot
	if ( num < 0 || num >= get_num_vectors() )
	{
		SG_ERROR("The index of the feature vector to set must be between "
			 "0 and %d (get_num_vectors()-1)\n", get_num_vectors()-1);
	}

	// all stored matrices must share the same number of rows (features);
	// a count of zero means no dimensionality has been fixed yet
	const int32_t n_feats = get_num_features();
	if ( n_feats != 0 && vec.num_rows != n_feats )
	{
		SG_ERROR("The feature vector to set must have the same features "
			 "as the rest of the MatrixFeatures, %d "
			 "(get_num_features())\n", get_num_features());
	}

	m_features.set_matrix(num, vec);
}
/** Converts the sparse representation into a dense num_features x num_vectors
 * matrix (respecting any active subset); unset entries are zero.
 *
 * Fixes: the progress message previously reported
 * sparse_feature_matrix.num_vectors (which ignores active subsets) and in
 * vectors-by-features order, while the allocated matrix is
 * features-by-vectors with get_num_vectors() columns. The column offset is
 * now computed in 64-bit to avoid int32 overflow for large matrices.
 */
template<class ST> SGMatrix<ST> CSparseFeatures<ST>::get_full_feature_matrix()
{
	int32_t num_feat=get_num_features();
	int32_t num_vec=get_num_vectors();

	SGMatrix<ST> full(num_feat, num_vec);
	full.zero();

	// report the dimensions of the matrix actually being built
	SG_INFO("converting sparse features to full feature matrix of %d x %d"
			" entries\n", num_feat, num_vec)

	for (int32_t v=0; v<full.num_cols; v++)
	{
		// translate subset index to the underlying matrix index
		int32_t idx=m_subset_stack->subset_idx_conversion(v);
		SGSparseVector<ST> current=sparse_feature_matrix[idx];

		for (int32_t f=0; f<current.num_feat_entries; f++)
		{
			// widen before multiplying: num_feat*num_vec may exceed int32
			int64_t offs=((int64_t) v)*num_feat
					+current.features[f].feat_index;
			full.matrix[offs]=current.features[f].entry;
		}
	}

	return full;
}
/** Entry point for feature selection: validates the input, deep-copies the
 * features, and dispatches to the configured algorithm.
 *
 * Fix: both switch branches previously returned directly, making the final
 * SG_DEBUG("Leaving!\n") trace unreachable; the result is now captured and
 * the function exits through a single path.
 */
CFeatures* CFeatureSelection<ST>::apply(CFeatures* features)
{
	SG_DEBUG("Entering!\n");

	// remove previously computed feature subsets
	m_subset->remove_all_subsets();

	// sanity checks
	REQUIRE(features, "Features cannot be NULL!\n");
	REQUIRE(features->get_num_vectors()>0,
			"Number of feature vectors has to be positive!\n");
	REQUIRE(m_target_dim>0, "Target dimension (%d) has to be positive! Set "
			"a higher number via set_target_dim().\n", m_target_dim);

	index_t num_features=get_num_features(features);
	REQUIRE(num_features>0, "Invalid number of features (%d)! Most likely "
			"feature selection cannot be performed for %s!\n",
			num_features, features->get_name());
	REQUIRE(num_features>m_target_dim,
			"Number of original features (dimensions of the feature vectors) "
			"(%d) has to be greater that the target dimension (%d)!\n",
			num_features, m_target_dim);

	// this method makes a deep copy of the feature object and performs
	// feature selection on it. This is already SG_REF'ed because of the
	// implementation of clone()
	CFeatures* feats_copy=(CFeatures*)features->clone();

	CFeatures* result=NULL;
	switch (m_algorithm)
	{
		case BACKWARD_ELIMINATION:
			result=apply_backward_elimination(feats_copy);
			break;
		default:
			// SG_ERROR raises; the fallback below mirrors the original
			// behavior of handing back the unmodified input features
			SG_ERROR("Specified algorithm not yet supported!\n");
			result=features;
			break;
	}

	SG_DEBUG("Leaving!\n");
	return result;
}
/** Returns sparse feature vector num (after subset index translation).
 *
 * If a sparse matrix is attached in memory, the corresponding column is
 * returned directly. Otherwise the vector is computed on demand via
 * compute_sparse_feature_vector(), optionally served from / written to the
 * feature cache.
 *
 * NOTE(review): the preprocessor loop below is effectively disabled -- the
 * call that would assign tmp_feat_after is commented out, so tmp_feat_after
 * remains NULL and the memcpy branch never executes; confirm before relying
 * on sparse preprocessing here.
 */
template<class ST> SGSparseVector<ST> CSparseFeatures<ST>::get_sparse_feature_vector(int32_t num)
{
	REQUIRE(num>=0 && num<get_num_vectors(),
		"get_sparse_feature_vector(num=%d): num exceeds [0;%d]\n",
		num, get_num_vectors()-1);
	// translate the (possibly subsetted) index to the underlying storage index
	index_t real_num=m_subset_stack->subset_idx_conversion(num);

	if (sparse_feature_matrix.sparse_matrix)
	{
		// matrix is held in memory: hand out the stored vector directly
		return sparse_feature_matrix[real_num];
	}
	else
	{
		SGSparseVector<ST> result;

		if (feature_cache)
		{
			// try the cache first; a hit is returned immediately
			result.features=feature_cache->lock_entry(num);

			if (result.features)
				return result;
			else
			{
				// cache miss: reserve a cache slot to be filled below
				result.features=feature_cache->set_entry(num);
			}
		}

		//if (!result.features)
		//	result.do_free=true;

		// compute the vector on the fly (into the cache slot, if any)
		result.features=compute_sparse_feature_vector(num,
			result.num_feat_entries, result.features);

		if (get_num_preprocessors())
		{
			int32_t tmp_len=result.num_feat_entries;
			SGSparseVectorEntry<ST>* tmp_feat_before=result.features;
			SGSparseVectorEntry<ST>* tmp_feat_after = NULL;

			// chain preprocessors; each step consumes the previous output
			for (int32_t i=0; i<get_num_preprocessors(); i++)
			{
				//tmp_feat_after=((CSparsePreprocessor<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

				// delete intermediate vectors, but never the first one
				// (that is the original result.features buffer)
				if (i!=0)
					SG_FREE(tmp_feat_before);
				tmp_feat_before=tmp_feat_after;
			}

			// copy the preprocessed entries back into the result buffer
			// (currently a no-op: tmp_feat_after is always NULL, see NOTE)
			if (tmp_feat_after)
			{
				memcpy(result.features, tmp_feat_after,
						sizeof(SGSparseVectorEntry<ST>)*tmp_len);
				SG_FREE(tmp_feat_after);
				result.num_feat_entries=tmp_len;
			}
			SG_DEBUG("len: %d len2: %d\n", result.num_feat_entries, get_num_features())
		}
		return result ;
	}
}
/** Performs backward elimination: repeatedly scores every remaining feature
 * dimension, ranks the scores, and removes a batch of features until only
 * m_target_dim dimensions remain. Returns the reduced feature object.
 *
 * The removal policy (m_policy) and count (m_num_remove) are member state;
 * when the requested batch would overshoot the target dimension they are
 * temporarily overridden for one iteration and restored afterwards.
 */
CFeatures* CFeatureSelection<ST>::apply_backward_elimination(CFeatures* features)
{
	SG_DEBUG("Entering!\n");

	// precompute whenever appropriate for performing the rest of the tasks
	precompute();

	// NULL check for features is handled in get_num_features
	index_t num_features=get_num_features(features);
	SG_DEBUG("Initial number of features %d!\n", num_features);

	// the main loop: one batch of removals per iteration
	while (num_features>m_target_dim)
	{
		// tune the measurement parameters whenever necessary based on
		// current features
		adapt_params(features);

		// compute the measures for each of the current dimensions
		SGVector<float64_t> measures(num_features);
		for (index_t i=0; i<num_features; ++i)
			measures[i]=compute_measures(features, i);

		if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
			measures.display_vector("measures");

		// rank the measures
		SGVector<index_t> argsorted=CMath::argsort(measures);

		if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
			argsorted.display_vector("argsorted");

		// make sure that we don't end up with lesser feats than target dim
		index_t to_remove;
		if (m_policy==N_SMALLEST || m_policy==N_LARGEST)
			to_remove=m_num_remove;
		else
			to_remove=num_features*m_num_remove*0.01;	// percentile policy

		index_t can_remove=num_features-m_target_dim;

		// if policy is to remove N feats corresponding to smallest/largest
		// measures, we just replace N with can_remove. if policy is to remove
		// N% feats, then we change the policy temporarily and remove a fixed
		// can_remove number of feats instead
		index_t orig_remove=m_num_remove;
		EFeatureRemovalPolicy orig_policy=m_policy;

		if (to_remove>can_remove)
		{
			m_num_remove=can_remove;
			SG_DEBUG("Can only remove %d features in this iteration!\n",
					can_remove);

			if (m_policy==PERCENTILE_SMALLEST)
				m_policy=N_SMALLEST;
			else if (m_policy==PERCENTILE_LARGEST)
				m_policy=N_LARGEST;
		}

		// remove the appropriate number of features based on the measures
		// and the removal policy; this internally updates the subset for
		// the selected features as well
		features=remove_feats(features, argsorted);

		// restore original removal policy and numbers if necessary for the
		// sake of consistency
		if (to_remove>can_remove)
		{
			m_policy=orig_policy;
			m_num_remove=orig_remove;
		}

		// update the number of features
		num_features=get_num_features(features);
		SG_DEBUG("Current number of features %d!\n", num_features);
	}

	// sanity check
	ASSERT(m_subset->get_size()==m_target_dim);

	SG_DEBUG("Leaving!\n");
	return features;
}