/** Construct a weighted-degree string kernel from an explicit weight vector.
 *
 * The kernel degree is taken from the length of @p w, the kernel type is
 * marked as externally weighted, and the supplied weights are copied into
 * the internal weight table before the source vector is released.
 *
 * @param w per-degree weight vector (consumed: freed after copying)
 */
CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(SGVector<float64_t> w)
: CStringKernel<char>(10)
{
	init();

	type=E_EXTERNAL;
	degree=w.vlen;

	// internal weight table layout: degree rows x (1+max_mismatch) columns
	weights_degree=degree;
	weights_length=(1+max_mismatch);

	// NOTE(review): the copy below reads degree*(1+max_mismatch) entries from
	// w, which only holds degree values — assumes max_mismatch==0 after
	// init(); verify against init()'s defaults.
	const int32_t num_weights=degree*(1+max_mismatch);
	weights=SG_MALLOC(float64_t, num_weights);

	for (int32_t idx=0; idx<num_weights; idx++)
		weights[idx]=w.vector[idx];

	w.free_vector();
}
/** Train the Gaussian naive Bayes classifier.
 *
 * Estimates, per class: the per-dimension feature means, the per-dimension
 * feature variances (unbiased, divided by count-1), and the a-priori class
 * probability. Labels are shifted so the minimal label maps to class 0.
 *
 * @param data optional training features; must have property FP_DOT.
 *             When NULL, the features previously set on this object are used.
 * @return true on success
 */
bool CGaussianNaiveBayes::train(CFeatures* data)
{
	// init features with data if necessary and assure type is correct
	if (data)
	{
		if (!data->has_property(FP_DOT))
			SG_ERROR("Specified features are not of type CDotFeatures\n");
		set_features((CDotFeatures*) data);
	}

	// get int labels to train_labels and check length equality
	ASSERT(m_labels);
	SGVector<int32_t> train_labels = m_labels->get_int_labels();
	ASSERT(m_features->get_num_vectors()==train_labels.vlen);

	// init min_label, max_label and loop variables
	int32_t min_label = train_labels.vector[0];
	int32_t max_label = train_labels.vector[0];
	int i,j;

	// find minimal and maximal label
	for (i=1; i<train_labels.vlen; i++)
	{
		min_label = CMath::min(min_label, train_labels.vector[i]);
		max_label = CMath::max(max_label, train_labels.vector[i]);
	}

	// subtract minimal label from all labels so classes are 0..num_classes-1
	for (i=0; i<train_labels.vlen; i++)
		train_labels.vector[i]-= min_label;

	// get number of classes, minimal label and dimensionality
	m_num_classes = max_label-min_label+1;
	m_min_label = min_label;
	m_dim = m_features->get_dim_feature_space();

	// allocate memory for distributions' parameters and a priori probability
	m_means.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
	m_means.num_rows = m_dim;
	m_means.num_cols = m_num_classes;

	m_variances.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
	m_variances.num_rows = m_dim;
	m_variances.num_cols = m_num_classes;

	m_label_prob.vector = SG_MALLOC(float64_t, m_num_classes);
	m_label_prob.vlen = m_num_classes;

	// allocate memory for label rates
	m_rates.vector = SG_MALLOC(float64_t, m_num_classes);
	m_rates.vlen = m_num_classes;

	// assure that memory is allocated
	ASSERT(m_means.matrix);
	ASSERT(m_variances.matrix);
	ASSERT(m_rates.vector);
	ASSERT(m_label_prob.vector);

	// make arrays filled by zeros before using
	m_means.zero();
	m_variances.zero();
	m_label_prob.zero();
	m_rates.zero();

	// number of iterations in all cycles
	int32_t max_progress = 2 * train_labels.vlen + 2 * m_num_classes;

	// current progress
	int32_t progress = 0;
	SG_PROGRESS(progress, 0, max_progress);

	// accumulate per-class feature sums and per-class example counts
	for (i=0; i<train_labels.vlen; i++)
	{
		SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
		for (j=0; j<m_dim; j++)
			m_means(j, train_labels.vector[i]) += fea.vector[j];
		fea.free_vector();

		m_label_prob.vector[train_labels.vector[i]]+=1.0;

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// get means of features of labels
	for (i=0; i<m_num_classes; i++)
	{
		// guard against labels absent from the training set (count==0):
		// dividing by zero would fill the means with NaN
		if (m_label_prob.vector[i]>0)
		{
			for (j=0; j<m_dim; j++)
				m_means(j, i) /= m_label_prob.vector[i];
		}

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// compute squared residuals with means available
	for (i=0; i<train_labels.vlen; i++)
	{
		SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
		for (j=0; j<m_dim; j++)
		{
			m_variances(j, train_labels.vector[i]) +=
				CMath::sq(fea[j]-m_means(j, train_labels.vector[i]));
		}
		fea.free_vector();

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// get variance of features of labels
	for (i=0; i<m_num_classes; i++)
	{
		// unbiased estimate: divide by count-1, clamped to 1 for tiny classes
		for (j=0; j<m_dim; j++)
			m_variances(j, i) /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;

		// get a priori probabilities of labels
		// BUGFIX: a prior is the class count divided by the number of
		// training examples, not by the number of classes — the old
		// "/= m_num_classes" produced priors that do not sum to one.
		m_label_prob.vector[i]/= train_labels.vlen;

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	SG_DONE();

	train_labels.free_vector();

	return true;
}