/**
 * Trains a decision tree on the mushroom data set and prints its hit rate
 * on the training samples.
 *
 * @param data      NxM matrix of categorical input variables, one row per sample
 * @param missing   mask of missing measurements (same layout as data)
 * @param responses Nx1 response vector; the value 'p' marks a poisonous sample
 * @param p_weight  prior weight of the poisonous class — the bigger it is, the
 *                  more the tree avoids mis-predicting poisonous mushrooms
 * @return the trained tree; the caller owns it and must delete it
 */
CvDTree* mushroom_create_dtree( const CvMat* data, const CvMat* missing,
                                const CvMat* responses, float p_weight )
{
    CvDTree* dtree;
    CvMat* var_type;
    int i, hr1 = 0, hr2 = 0, p_total = 0;
    float priors[] = { 1, p_weight };

    // all the variables are categorical
    var_type = cvCreateMat( data->cols + 1, 1, CV_8U );
    cvSet( var_type, cvScalarAll(CV_VAR_CATEGORICAL) );

    dtree = new CvDTree;
    dtree->train( data, CV_ROW_SAMPLE, responses, 0, 0, var_type, missing,
                  CvDTreeParams( 8,     // max depth
                                 10,    // min sample count
                                 0,     // regression accuracy: N/A here
                                 true,  // compute surrogate split, as we have missing data
                                 15,    // max number of categories
                                        // (use sub-optimal algorithm for larger numbers)
                                 10,    // the number of cross-validation folds
                                 true,  // use 1SE rule => smaller tree
                                 true,  // throw away the pruned tree branches
                                 priors // the array of priors; the bigger p_weight,
                                        // the more attention to the poisonous mushrooms
                                        // (a mushroom will be judged poisonous with bigger chance)
                                 ));

    // compute hit-rate on the training database; demonstrates predict usage
    for( i = 0; i < data->rows; i++ )
    {
        CvMat sample, mask;
        cvGetRow( data, &sample, i );
        cvGetRow( missing, &mask, i );
        double r = dtree->predict( &sample, &mask )->value;
        int d = fabs(r - responses->data.fl[i]) >= FLT_EPSILON;
        if( d )
        {
            if( r != 'p' )
                hr1++;   // poisonous mushroom mis-predicted as something else
            else
                hr2++;   // false alarm: edible mushroom predicted as poisonous
        }
        p_total += responses->data.fl[i] == 'p';
    }

    // Guard the percentages: the original divided by p_total and by
    // (rows - p_total) unconditionally, which is a division by zero when the
    // set contains no poisonous (or only poisonous) samples.
    int e_total = data->rows - p_total;
    printf( "Results on the training database:\n"
            "\tPoisonous mushrooms mis-predicted: %d (%g%%)\n"
            "\tFalse-alarms: %d (%g%%)\n",
            hr1, p_total > 0 ? (double)hr1*100/p_total : 0.,
            hr2, e_total > 0 ? (double)hr2*100/e_total : 0. );

    cvReleaseMat( &var_type );
    return dtree;
}
int main() { const int train_sample_count = 300; //#define LEPIOTA #ifdef LEPIOTA const char* filename = "../../../OpenCV_SVN/samples/c/agaricus-lepiota.data"; #else const char* filename = "../../../OpenCV_SVN/samples/c/waveform.data"; #endif CvDTree dtree; CvBoost boost; CvRTrees rtrees; CvERTrees ertrees; CvMLData data; CvTrainTestSplit spl( train_sample_count ); data.read_csv( filename ); #ifdef LEPIOTA data.set_response_idx( 0 ); #else data.set_response_idx( 21 ); data.change_var_type( 21, CV_VAR_CATEGORICAL ); #endif data.set_train_test_split( &spl ); printf("======DTREE=====\n"); dtree.train( &data, CvDTreeParams( 10, 2, 0, false, 16, 0, false, false, 0 )); print_result( dtree.calc_error( &data, CV_TRAIN_ERROR), dtree.calc_error( &data ), dtree.get_var_importance() ); #ifdef LEPIOTA printf("======BOOST=====\n"); boost.train( &data, CvBoostParams(CvBoost::DISCRETE, 100, 0.95, 2, false, 0)); print_result( boost.calc_error( &data, CV_TRAIN_ERROR ), boost.calc_error( &data ), 0 ); #endif printf("======RTREES=====\n"); rtrees.train( &data, CvRTParams( 10, 2, 0, false, 16, 0, true, 0, 100, 0, CV_TERMCRIT_ITER )); print_result( rtrees.calc_error( &data, CV_TRAIN_ERROR), rtrees.calc_error( &data ), rtrees.get_var_importance() ); printf("======ERTREES=====\n"); ertrees.train( &data, CvRTParams( 10, 2, 0, false, 16, 0, true, 0, 100, 0, CV_TERMCRIT_ITER )); print_result( ertrees.calc_error( &data, CV_TRAIN_ERROR), ertrees.calc_error( &data ), ertrees.get_var_importance() ); return 0; }
//Decision Tree void decisiontree ( Mat & trainingData , Mat & trainingClasses , Mat & testData , Mat & testClasses ) { CvDTree dtree ; Mat var_type (3 , 1 , CV_8U ) ; // define attributes as numerical var_type.at < unsigned int >(0 ,0) = CV_VAR_NUMERICAL; var_type.at < unsigned int >(0 ,1) = CV_VAR_NUMERICAL ; // define output node as numerical var_type.at < unsigned int >(0 ,2) = CV_VAR_NUMERICAL; dtree.train ( trainingData , CV_ROW_SAMPLE , trainingClasses , Mat () , Mat () , var_type , Mat () , CvDTreeParams () ) ; Mat predicted ( testClasses.rows , 1 , CV_32F ) ; for ( int i = 0; i < testData.rows ; i ++) { const Mat sample = testData.row ( i ) ; CvDTreeNode * prediction = dtree.predict ( sample ) ; predicted.at < float > (i , 0) = prediction->value ; } cout << " Accuracy_ { TREE } = " << evaluate ( predicted , testClasses ) << endl ; plot_binary ( testData , predicted , " Predictions tree " ) ; }
void trainFromTxt() { FILE* fin = fopen("train.txt","r"); int N,i; training = true; fscanf(fin,"%i",&N); CvMat *data = cvCreateMat(N,COLS,CV_32F); CvMat *resp = cvCreateMat(N,1,CV_32F); char c[10]; float *fdata,*fresp; for(i=0; i<N; i++) { fdata = data->data.fl + i*COLS; fresp = resp->data.fl + i; fscanf(fin,"%s",c); for (int j=0; j<COLS; j++) { fscanf(fin,"%f",&fdata[j]); } *fresp = (float)c[0]; } printMatrix(data); printMatrix(resp); CvMat *vartype = cvCreateMat( data->cols + 1, 1, CV_8U ); unsigned char *vtype = vartype->data.ptr; //Tipos de variables de entrada al árbol vtype[0]=CV_VAR_NUMERICAL; vtype[1]=CV_VAR_NUMERICAL; vtype[2]=CV_VAR_NUMERICAL; vtype[3]=CV_VAR_NUMERICAL; vtype[4]=CV_VAR_CATEGORICAL; //Tipo de la salida del árbol ptree = new CvDTree; ptree->train(data,CV_ROW_SAMPLE,resp,0,0,vartype,0,CvDTreeParams()); }
/**
 * Creates a matrix for training but feeds it from images taken of folder
 * ./training; Only JPG images are taken into account and all shapes in those
 * images are classified according to the first letter of the picture.
 */
void train() {
    training = true;                  // NOTE(review): global mode flag — assumed read by helpers; confirm
    mostrar = (flags & SH_T) != 0;    // "mostrar" = show: enable visualization when SH_T is set
    listFiles(DIR_TR,training_image); // enumerate the training images under DIR_TR
    fillMatrix();                     // fills the global matrices t_data / t_resp from the listed images
    //printMatrix(t_data);
    //printMatrix(t_resp);
    CvMat *vartype = cvCreateMat( t_data->cols + 1, 1, CV_8U );
    unsigned char *vtype = vartype->data.ptr;
    // Types of the tree's input variables (comment translated from Spanish)
    vtype[0]=CV_VAR_NUMERICAL;
    vtype[1]=CV_VAR_NUMERICAL;
    vtype[2]=CV_VAR_NUMERICAL;
    vtype[3]=CV_VAR_NUMERICAL;
    vtype[4]=CV_VAR_CATEGORICAL;      // Type of the tree's output
    // Optional hold-out: when F_CHK is set, build a random 0/1 sample mask so
    // only roughly a probTrain fraction of the rows is used for training.
    if ((flags & F_CHK) != 0) {
        trainMask = cvCreateMat(t_data->rows, 1, CV_8U);
        unsigned char *x = trainMask->data.ptr;
        CvRNG seed = cvRNG(time(0));  // time-seeded: the split differs on every run
        for (int i=0; i<t_data->rows; i++,x++) {
            double p = cvRandReal(&seed);
            if (p < probTrain) {
                *x = 1;               // row participates in training
            } else {
                *x=0;                 // row is held out
            }
        }
    }
    // NOTE(review): when F_CHK is not set, trainMask keeps its previous value —
    // presumably a NULL global so that all rows are used; verify its initialization.
    ptree = new CvDTree;
    ptree->train(t_data,CV_ROW_SAMPLE,t_resp,0,trainMask,vartype,0,CvDTreeParams());
}
/** * @author JIA Pei * @version 2009-10-04 * @brief Training * @param data Input - input data * @param categories Input - column vector * @return classification time cost */ void CClassificationAlgs::Training(const Mat_<float>& data, const Mat_<int>& categories) { unsigned int NbOfSamples = data.rows; set<int> ClassSet; for(int i = 0; i < categories.rows; i++) { ClassSet.insert(categories(i, 0)); } this->m_iNbOfCategories = ClassSet.size(); switch(this->m_iClassificationMethod) { case CClassificationAlgs::DecisionTree: this->m_CVDtree.train( data, CV_ROW_SAMPLE, categories, Mat(), Mat(), Mat(), Mat(), CvDTreeParams( INT_MAX, 2, 0, false, this->m_iNbOfCategories, 0, false, false, 0 ) ); break; case CClassificationAlgs::Boost: this->m_CVBoost.train( data, CV_ROW_SAMPLE, categories, Mat(), Mat(), Mat(), Mat(), CvBoostParams(CvBoost::DISCRETE, 50, 0.95, INT_MAX, false, 0), false ); break; case CClassificationAlgs::RandomForest: this->m_CVRTrees.train( data, CV_ROW_SAMPLE, categories, Mat(), Mat(), Mat(), Mat(), CvRTParams( INT_MAX, 2, 0, false, this->m_iNbOfCategories, 0, true, 0, 100, 0, CV_TERMCRIT_ITER ) ); break; case CClassificationAlgs::ExtremeRandomForest: this->m_CVERTrees.train(data, CV_ROW_SAMPLE, categories, Mat(), Mat(), Mat(), Mat(), CvRTParams( INT_MAX, 2, 0, false, this->m_iNbOfCategories, 0, true, 0, 100, 0, CV_TERMCRIT_ITER ) ); break; case CClassificationAlgs::SVM: this->m_CVSVM.train( data, categories, Mat(), Mat(), CvSVMParams(CvSVM::C_SVC, CvSVM::RBF, 0, 1, 0, 1, 0, 0, NULL, cvTermCriteria(CV_TERMCRIT_ITER, 1000, 1E-6) ) ); break; } }
int CV_DTreeTest :: train( int test_case_idx ) { int MAX_DEPTH, MIN_SAMPLE_COUNT, MAX_CATEGORIES, CV_FOLDS; float REG_ACCURACY = 0; bool USE_SURROGATE, IS_PRUNED; const char* data_name = ((CvFileNode*)cvGetSeqElem( data_sets_names, test_case_idx ))->data.str.ptr; // read validation params CvFileStorage* fs = ts->get_file_storage(); CvFileNode* fnode = cvGetFileNodeByName( fs, 0, "validation" ), *fnode1 = 0; fnode = cvGetFileNodeByName( fs, fnode, name ); fnode = cvGetFileNodeByName( fs, fnode, data_name ); fnode = cvGetFileNodeByName( fs, fnode, "model_params" ); fnode1 = cvGetFileNodeByName( fs, fnode, "max_depth" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "MAX_DEPTH can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } MAX_DEPTH = fnode1->data.i; fnode1 = cvGetFileNodeByName( fs, fnode, "min_sample_count" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "MAX_DEPTH can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } MIN_SAMPLE_COUNT = fnode1->data.i; fnode1 = cvGetFileNodeByName( fs, fnode, "use_surrogate" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "USE_SURROGATE can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } USE_SURROGATE = ( fnode1->data.i!= 0); fnode1 = cvGetFileNodeByName( fs, fnode, "max_categories" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "MAX_CATEGORIES can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } MAX_CATEGORIES = fnode1->data.i; fnode1 = cvGetFileNodeByName( fs, fnode, "cv_folds" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "CV_FOLDS can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } CV_FOLDS = fnode1->data.i; fnode1 = cvGetFileNodeByName( fs, fnode, "is_pruned" ); if ( !fnode1 ) { ts->printf( CvTS::LOG, "IS_PRUNED can not be read from config file" ); return CvTS::FAIL_INVALID_TEST_DATA; } IS_PRUNED = (fnode1->data.i != 0); if ( !tree->train( &data, CvDTreeParams(MAX_DEPTH, MIN_SAMPLE_COUNT, REG_ACCURACY, USE_SURROGATE, 
MAX_CATEGORIES, CV_FOLDS, false, IS_PRUNED, 0 )) ) { ts->printf( CvTS::LOG, "in test case %d model training was failed", test_case_idx ); return CvTS::FAIL_INVALID_OUTPUT; } return CvTS::OK; }
int main(int argc, char **argv)
{
    // ---- Part 1: build, save, and reload a decision tree --------------------
    float priors[] = { 1.0, 10.0 }; // Edible vs poisonos weights
    CvMat *var_type;
    CvMat *data;                        // jmh add
    data = cvCreateMat(20, 30, CV_8U);  // jmh add
    var_type = cvCreateMat(data->cols + 1, 1, CV_8U);
    cvSet(var_type, cvScalarAll(CV_VAR_CATEGORICAL)); // all these vars
    // are categorical
    CvDTree *dtree;
    dtree = new CvDTree;
    // NOTE(review): `responses` and `missing` are not declared anywhere in
    // this function — this compiles only if they are globals defined
    // elsewhere in the file; verify.  Also note `data` above is CV_8U filled
    // with uninitialized values, so the training here is a usage demo only.
    dtree->train(data, CV_ROW_SAMPLE, responses, 0, 0, var_type, missing,
                 CvDTreeParams(8,     // max depth
                               10,    // min sample count
                               0,     // regression accuracy: N/A here
                               true,  // compute surrogate split,
                                      // as we have missing data
                               15,    // max number of categories
                                      // (use sub-optimal algorithm for
                                      // larger numbers)
                               10,    // cross-validations
                               true,  // use 1SE rule => smaller tree
                               true,  // throw away the pruned tree branches
                               priors // the array of priors, the bigger
                                      // p_weight, the more attention
                                      // to the poisonous mushrooms
                               )
                 );
    // round-trip the trained tree through XML to demonstrate save/load
    dtree->save("tree.xml", "MyTree");
    dtree->clear();
    dtree->load("tree.xml", "MyTree");

    // ---- Part 2: k-means clustering demo on random 2-D Gaussian blobs ------
#define MAX_CLUSTERS 5
    CvScalar color_tab[MAX_CLUSTERS];
    IplImage *img = cvCreateImage(cvSize(500, 500), 8, 3);
    CvRNG rng = cvRNG(0xffffffff);
    color_tab[0] = CV_RGB(255, 0, 0);
    color_tab[1] = CV_RGB(0, 255, 0);
    color_tab[2] = CV_RGB(100, 100, 255);
    color_tab[3] = CV_RGB(255, 0, 255);
    color_tab[4] = CV_RGB(255, 255, 0);
    cvNamedWindow("clusters", 1);
    for (;;) {
        // random number of clusters and samples for each iteration
        int k, cluster_count = cvRandInt(&rng) % MAX_CLUSTERS + 1;
        int i, sample_count = cvRandInt(&rng) % 1000 + 1;
        CvMat *points = cvCreateMat(sample_count, 1, CV_32FC2);
        CvMat *clusters = cvCreateMat(sample_count, 1, CV_32SC1);
        /* generate random sample from multivariate Gaussian distribution */
        for (k = 0; k < cluster_count; k++) {
            CvPoint center;
            CvMat point_chunk;
            center.x = cvRandInt(&rng) % img->width;
            center.y = cvRandInt(&rng) % img->height;
            // each chunk of rows becomes one Gaussian blob around `center`
            cvGetRows(points, &point_chunk, k * sample_count / cluster_count,
                      k == cluster_count - 1 ? sample_count
                                             : (k + 1) * sample_count / cluster_count);
            cvRandArr(&rng, &point_chunk, CV_RAND_NORMAL,
                      cvScalar(center.x, center.y, 0, 0),
                      cvScalar(img->width / 6, img->height / 6, 0, 0));
        }
        /* shuffle samples */
        for (i = 0; i < sample_count / 2; i++) {
            CvPoint2D32f *pt1 =
                (CvPoint2D32f *) points->data.fl + cvRandInt(&rng) % sample_count;
            CvPoint2D32f *pt2 =
                (CvPoint2D32f *) points->data.fl + cvRandInt(&rng) % sample_count;
            CvPoint2D32f temp;
            CV_SWAP(*pt1, *pt2, temp);
        }
        cvKMeans2(points, cluster_count, clusters,
                  cvTermCriteria(CV_TERMCRIT_EPS + CV_TERMCRIT_ITER, 10, 1.0));
        // draw each sample in its cluster's color
        cvZero(img);
        for (i = 0; i < sample_count; i++) {
            CvPoint2D32f pt = ((CvPoint2D32f *) points->data.fl)[i];
            int cluster_idx = clusters->data.i[i];
            cvCircle(img, cvPointFrom32f(pt), 2, color_tab[cluster_idx], CV_FILLED);
        }
        cvReleaseMat(&points);
        cvReleaseMat(&clusters);
        cvShowImage("clusters", img);
        int key = cvWaitKey(0);
        if (key == 27)          // 'ESC'
            break;
    }
    // NOTE(review): img, var_type, data, and dtree are never released/deleted
    // before exit; harmless in a demo but worth cleaning up.
}