CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(SGVector<float64_t> w)
: CStringKernel<char>(10)
{
	init();

	type=E_EXTERNAL;
	degree=w.vlen;

	// max_mismatch is 0 right after init(), so the allocation and the
	// copy below both cover exactly w.vlen entries
	weights=SG_MALLOC(float64_t, degree*(1+max_mismatch));
	weights_degree=degree;
	weights_length=(1+max_mismatch);

	for (int32_t i=0; i<degree*(1+max_mismatch); i++)
		weights[i]=w.vector[i];

	w.free_vector();
}
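Usage sketch for the constructor above, assuming the legacy Shogun API this code targets (the `SGVector(T*, index_t)` constructor, `CKernel::init` and `SG_UNREF` are taken from that era; `lhs`/`rhs` are placeholder `CStringFeatures<char>*` objects prepared elsewhere):

	// One external weight per degree; the constructor copies the weights
	// and then calls w.free_vector(), so w must not be reused afterwards.
	float64_t* wv=SG_MALLOC(float64_t, 3);
	wv[0]=0.5; wv[1]=0.3; wv[2]=0.2;
	SGVector<float64_t> w(wv, 3);

	CWeightedDegreeStringKernel* kernel=new CWeightedDegreeStringKernel(w);
	kernel->init(lhs, rhs); // attach string features before evaluating entries
	SG_UNREF(kernel);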
Example 2
bool CGaussianNaiveBayes::train(CFeatures* data)
{
	// init features with data if given and ensure their type is correct
	if (data)
	{
		if (!data->has_property(FP_DOT))
			SG_ERROR("Specified features are not of type CDotFeatures\n");
		set_features((CDotFeatures*) data);
	}
	// get integer labels into train_labels and check that the count matches the number of feature vectors
	ASSERT(m_labels);
	SGVector<int32_t> train_labels = m_labels->get_int_labels();
	ASSERT(m_features->get_num_vectors()==train_labels.vlen);

	// init min_label, max_label and loop variables
	int32_t min_label = train_labels.vector[0];
	int32_t max_label = train_labels.vector[0];
	int32_t i, j;

	// find minimal and maximal label
	for (i=1; i<train_labels.vlen; i++)
	{
		min_label = CMath::min(min_label, train_labels.vector[i]);
		max_label = CMath::max(max_label, train_labels.vector[i]);
	}

	// subtract minimal label from all labels
	for (i=0; i<train_labels.vlen; i++)
		train_labels.vector[i]-= min_label;

	// get number of classes, minimal label and dimensionality
	m_num_classes = max_label-min_label+1;
	m_min_label = min_label;
	m_dim = m_features->get_dim_feature_space();

	// allocate memory for distributions' parameters and a priori probability
	m_means.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
	m_means.num_rows = m_dim;
	m_means.num_cols = m_num_classes;

	m_variances.matrix = SG_MALLOC(float64_t, m_num_classes*m_dim);
	m_variances.num_rows = m_dim;
	m_variances.num_cols = m_num_classes;

	m_label_prob.vector = SG_MALLOC(float64_t, m_num_classes);
	m_label_prob.vlen = m_num_classes;

	// allocate memory for label rates
	m_rates.vector = SG_MALLOC(float64_t, m_num_classes);
	m_rates.vlen = m_num_classes;

	// assure that memory is allocated
	ASSERT(m_means.matrix);
	ASSERT(m_variances.matrix);
	ASSERT(m_rates.vector);
	ASSERT(m_label_prob.vector);

	// make arrays filled by zeros before using
	m_means.zero();
	m_variances.zero();
	m_label_prob.zero();
	m_rates.zero();

	// number of iterations in all cycles
	int32_t max_progress = 2 * train_labels.vlen + 2 * m_num_classes;

	// current progress
	int32_t progress = 0;
	SG_PROGRESS(progress, 0, max_progress);

	// accumulate per-class feature sums and per-class counts
	for (i=0; i<train_labels.vlen; i++)
	{
		SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
		for (j=0; j<m_dim; j++)
			m_means(j, train_labels.vector[i]) += fea.vector[j];
		fea.free_vector();

		m_label_prob.vector[train_labels.vector[i]]+=1.0;

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// turn per-class sums into per-class means
	// (assumes every label in [min_label, max_label] occurs at least once)
	for (i=0; i<m_num_classes; i++)
	{
		for (j=0; j<m_dim; j++)
			m_means(j, i) /= m_label_prob.vector[i];

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// compute squared residuals with means available
	for (i=0; i<train_labels.vlen; i++)
	{
		SGVector<float64_t> fea = m_features->get_computed_dot_feature_vector(i);
		for (j=0; j<m_dim; j++)
		{
			m_variances(j, train_labels.vector[i]) += 
				CMath::sq(fea[j]-m_means(j, train_labels.vector[i]));
		}
		fea.free_vector();

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}

	// get per-class feature variances (unbiased estimate when a class has more than one sample)
	for (i=0; i<m_num_classes; i++)
	{
		for (j=0; j<m_dim; j++)
			m_variances(j, i) /= m_label_prob.vector[i] > 1 ? m_label_prob.vector[i]-1 : 1;
		
		// get a priori probability of the label: class count over total sample count
		m_label_prob.vector[i]/= train_labels.vlen;

		progress++;
		SG_PROGRESS(progress, 0, max_progress);
	}
	SG_DONE();

	train_labels.free_vector();

	return true;
}
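A minimal training sketch under the same legacy API (the default `CGaussianNaiveBayes` constructor plus the `set_labels`/`train`/`apply` calls follow the old `CMachine` interface; `feats` and `labels` are placeholders assumed to be built elsewhere):

	// feats: CDotFeatures* with N vectors; labels: CLabels* holding one
	// integer class label per vector, as required by the ASSERT in train().
	CGaussianNaiveBayes* nb=new CGaussianNaiveBayes();
	SG_REF(nb);
	nb->set_labels(labels);
	nb->train(feats);            // estimates per-class means, variances and priors
	CLabels* out=nb->apply();    // classify; here we simply reuse the training features
	SG_UNREF(out);
	SG_UNREF(nb);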