/*
 * Compute summary statistics of a list of real values.
 *
 * features: output array; the following slots are written:
 *             [0] mean
 *             [1] maximum
 *             [2] minimum
 *             [3] range (maximum - minimum)
 *             [4] variance (population form, E[x^2] - E[x]^2)
 *             [5] median            (element count/2   of the sorted list)
 *             [6] first quartile    (element count/4   of the sorted list)
 *             [7] third quartile    (element 3*count/4 of the sorted list)
 *             [8] interquartile range
 * list:     input values; it is not modified (a sorted copy is used).
 * count:    number of values in list.
 *
 * For count <= 0 the first STATS entries of features are set to 0.
 * Returns STATS in every case.
 */
int getStatistics(mx_real_t* features, mx_real_t* list, int count)
{
    int i;
    mx_real_t *sorted;
    mx_real_t mean = 0, max = -MX_REAL_MAX, min = MX_REAL_MAX, variance = 0;
    mx_real_t median, quartile_1, quartile_3;

    /* Nothing to measure; also rejects negative counts, which would
     * otherwise be turned into a huge allocation size below. */
    if (count <= 0) {
        for (i = 0; i < STATS; i++)
            features[i] = 0.0;
        return STATS;
    }

    /* A single value is its own mean/max/min/median/quartiles and has
     * no spread. */
    if (count == 1) {
        features[0] = features[1] = features[2] = features[5] = features[6] = features[7] = list[0];
        features[3] = features[4] = features[8] = 0.0;
        return STATS;
    }

    sorted = (mx_real_t *) rs_malloc(count * sizeof(mx_real_t), "statistics list");

    /* One pass: copy the data and accumulate sum, sum of squares and
     * the extremes. */
    for (i = 0; i < count; i++) {
        sorted[i] = list[i];
        mean += sorted[i];
        variance += mx_sqr(sorted[i]);
        if (sorted[i] < min)
            min = sorted[i];
        if (sorted[i] > max)
            max = sorted[i];
    }

    mean /= count;
    variance = variance / count - mx_sqr(mean);
    /* E[x^2] - E[x]^2 can round to a tiny negative number for (nearly)
     * constant input; a variance is never negative. */
    if (variance < 0)
        variance = 0;

    /* Order statistics are read from the sorted copy. */
    qsort(sorted, count, sizeof(mx_real_t), _cmp_mx_real);

    median = sorted[count / 2];
    quartile_1 = sorted[count / 4];
    quartile_3 = sorted[3 * count / 4];

    features[0] = mean;
    features[1] = max;
    features[2] = min;
    features[3] = max - min;
    features[4] = variance;
    features[5] = median;
    features[6] = quartile_1;
    features[7] = quartile_3;
    features[8] = fabs(quartile_3 - quartile_1);

    rs_free(sorted);

    return STATS;
}
/*
 * Add one training instance of class class_ind to the classifier.
 *
 * nB:        classifier to update (modified in place and returned).
 * class_ind: index of the class the instance belongs to.
 * features:  feature vector with nB->feature_dim entries.
 *
 * While nB->finished is not set, means[][] and std_dev[][] are used as
 * plain accumulators (sum and sum of squares of the features).
 *
 * Once nB->finished is set, means[][] and std_dev[][] are treated as the
 * per-class mean and standard deviation and are updated incrementally;
 * the class probabilities are recomputed as well.
 *
 * n_instances[class_ind] and the total in n_instances[n_classes] are
 * each incremented by one.
 */
naive_bayes_classifier_t *nB_update_classifier(naive_bayes_classifier_t *nB, int class_ind, mx_real_t *features)
{
    int i;
    mx_real_t old_mean, new_mean, second_moment, variance;
    mx_real_t old_count, new_count;

    nB->n_instances[class_ind]++;

    if (!nB->finished) {
        /* Still collecting: accumulate sum and sum of squares. */
        for (i = 0; i < nB->feature_dim; i++) {
            nB->means[class_ind][i] += features[i];
            nB->std_dev[class_ind][i] += features[i] * features[i];
        }
    } else {
        /* The class counter already includes the new instance, so the
         * stored statistics describe new_count - 1 instances. */
        new_count = nB->n_instances[class_ind];
        old_count = new_count - 1.0;

        for (i = 0; i < nB->feature_dim; i++) {
            old_mean = nB->means[class_ind][i];
            new_mean = (old_mean * old_count + features[i]) / new_count;

            /* Recover E[x^2] = sigma^2 + mean^2 of the old data, fold
             * in the new value, then convert back to a deviation. */
            second_moment = ((mx_sqr(nB->std_dev[class_ind][i]) + old_mean * old_mean) * old_count
                             + features[i] * features[i]) / new_count;
            variance = second_moment - new_mean * new_mean;
            /* Guard against a slightly negative value caused by
             * rounding, which would make sqrt() return NaN. */
            if (variance < 0)
                variance = 0;

            nB->means[class_ind][i] = new_mean;
            nB->std_dev[class_ind][i] = sqrt(variance);
        }

        /* The total counter is incremented only below, hence the +1. */
        for (i = 0; i < nB->n_classes; i++) {
            nB->class_probs[i] = 1.0 * nB->n_instances[i] / (nB->n_instances[nB->n_classes] + 1);
        }
    }

    nB->n_instances[nB->n_classes]++;

    return nB;
}
/*
 * Class score as the class prior multiplied by one Gaussian density per
 * attribute, evaluated directly (not in the log domain).
 *
 * nB:        classifier providing class_probs, means and std_dev.
 * instance:  feature vector with nB->feature_dim entries.
 * class_ind: index of the class to score.
 *
 * Because the densities are multiplied directly, the result often
 * collapses to 0 when there are many attributes.
 *
 * NOTE(review): a standard deviation of exactly 0 is replaced by
 * MX_REAL_MIN; squaring that value may underflow - confirm the result
 * is what callers expect for constant attributes.
 */
mx_real_t nB_class_prob_simple(naive_bayes_classifier_t *nB, mx_real_t *instance, int class_ind)
{
    const mx_real_t root_two_pi = sqrt(2 * M_PI);
    mx_real_t likelihood = nB->class_probs[class_ind];
    int attr = 0;

    while (attr < nB->feature_dim) {
        mx_real_t sigma = nB->std_dev[class_ind][attr];
        mx_real_t delta = instance[attr] - nB->means[class_ind][attr];
        mx_real_t density;

        /* Avoid dividing by an exact zero deviation. */
        if (sigma == 0)
            sigma = MX_REAL_MIN;

        density = (1 / (root_two_pi * sigma)) * exp(-(mx_sqr(delta) / (2 * mx_sqr(sigma))));
        likelihood *= density;

        attr++;
    }

    return likelihood;
}
/*
 * Class score computed in the log domain: the log of the class prior
 * plus, per attribute, -log(sigma) - (x - mean)^2 / (2 * sigma^2).
 * The constant 1/sqrt(2*pi) factor of the Gaussian is not included.
 *
 * nB:        classifier providing class_probs, means and std_dev.
 * instance:  feature vector with nB->feature_dim entries.
 * class_ind: index of the class to score.
 *
 * This form copes with many attributes, but the result is difficult to
 * normalise. The accumulated log score is divided by the number of
 * features and exponentiated before it is returned.
 */
mx_real_t nB_class_prob(naive_bayes_classifier_t *nB, mx_real_t *instance, int class_ind)
{
    mx_real_t log_score = mx_log(nB->class_probs[class_ind]);
    int attr;

    for (attr = 0; attr < nB->feature_dim; attr++) {
        mx_real_t sigma = nB->std_dev[class_ind][attr];
        mx_real_t sigma_sq;

        // A zero deviation means the attribute is constant: ignore it.
        if (sigma == 0)
            continue;

        sigma_sq = mx_sqr(sigma);
        if (sigma_sq == 0) {
            // The square underflowed; fall back to the smallest real value.
            log_score += -mx_log(sigma) - mx_sqr(instance[attr] - nB->means[class_ind][attr]) / (2 * MX_REAL_MIN);
        } else {
            log_score += -mx_log(sigma) - mx_sqr(instance[attr] - nB->means[class_ind][attr]) / (2 * sigma_sq);
        }
    }

    // Transform the score into a reasonable value range.
    return exp(log_score / nB->feature_dim);
}