int spr_var_selection() { // load lib gSystem->Load("/raid1/narsky/SPRroot/lib/libSPR.so"); // create main SPR object SprRootAdapter spr; // load training data spr.loadDataFromAscii(1,"cleveland.data","train"); // split data into train/test spr.split(0.7,true,2627277); // choose classes spr.chooseClasses("0:."); // show how much data we have const int nClasses = spr.nClasses(); char classes[nClasses][200]; int events [nClasses]; double weights [nClasses]; spr.showDataInClasses(classes,events,weights,"train"); plotClasses("SPR_Class_1","train",nClasses,classes,events,weights); spr.showDataInClasses(classes,events,weights,"test"); plotClasses("SPR_Class_2","test",nClasses,classes,events,weights); // compute correlations between variables const unsigned dim = spr.dim(); double corr [dim*dim]; char vars[dim][200]; spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_1B","background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_1S","signal",dim,vars,corr); // compute correlation with the class label double corr [dim]; spr.correlationClassLabel("normal",vars,corr,"test"); plotImportance("SPR_0","Correlation with class label", dim,vars,corr,0,true); // select classifiers spr.addRandomForest("rf",2,200,0,10); // train int verbose = 0;// use >0 to increase verbosity level spr.train(verbose); // save classifier spr.saveClassifier("rf","rf.spr"); // test spr.test(); // estimate importance of all variables const unsigned nVars = spr.nClassifierVars("rf"); double importance [nVars]; double impError [nVars]; spr.variableImportance("rf",10,vars,importance,impError); plotImportance("SPR_1","Variable Importance for RF from permutations", nVars,vars,importance,impError,true); // estimate interactions // Note: the errors are zero because all points are used for // integration! const char* subset = ""; unsigned nPoints = 0;// use all points double interaction [nVars]; double intError [nVars]; spr.variableInteraction("rf",subset,nPoints, vars,interaction,intError); plotImportance("SPR_3","Variable Interaction for RF", nVars,vars,interaction,intError,true); // choose a subset of variables char useVars[5][200]; strcpy(useVars[0],"sex"); strcpy(useVars[1],"cp"); strcpy(useVars[2],"oldpeak"); strcpy(useVars[3],"ca"); strcpy(useVars[4],"thal"); spr.chooseVars(5,useVars); spr.loadDataFromAscii(1,"cleveland.data","train"); // use identical splitting spr.split(0.7,true,2627277); spr.chooseClasses("0:."); // train RF on the reduced subset spr.addRandomForest("rf_5vars",2,200,0,10); spr.train(verbose); // save spr.saveClassifier("rf_5vars","rf_5vars.spr"); // compute classifier responses spr.test(); // recompute variable importance const unsigned nVarsReduced = spr.nClassifierVars("rf_5vars"); char varsReduced [nVarsReduced][200]; double importanceReduced [nVarsReduced]; double impErrorReduced [nVarsReduced]; spr.variableImportance("rf_5vars",10,varsReduced, importanceReduced,impErrorReduced); plotImportance("SPR_2","Variable Importance for RF Reduced", nVarsReduced,varsReduced, importanceReduced,impErrorReduced,true); // choose a subset of variables char useVars2[6][200]; strcpy(useVars2[0],"age"); strcpy(useVars2[1],"chol"); strcpy(useVars2[2],"trestbps"); strcpy(useVars2[3],"thalach"); strcpy(useVars2[4],"cp"); strcpy(useVars2[5],"oldpeak"); spr.chooseVars(6,useVars2); spr.loadDataFromAscii(1,"cleveland.data","train"); // use identical splitting spr.split(0.7,true,2627277); spr.chooseClasses("0:."); // train RF on the reduced subset spr.addRandomForest("rf_6vars",2,200,0,10); spr.train(verbose); // save spr.saveClassifier("rf_6vars","rf_6vars.spr"); // choose a subset of variables char useVars3[6][200]; strcpy(useVars3[0],"ca"); strcpy(useVars3[1],"exang"); strcpy(useVars3[2],"fbs"); strcpy(useVars3[3],"sex"); strcpy(useVars3[4],"slope"); strcpy(useVars3[5],"thal"); spr.chooseVars(6,useVars3); spr.loadDataFromAscii(1,"cleveland.data","train"); // use identical splitting spr.split(0.7,true,2627277); spr.chooseClasses("0:."); // train RF on the reduced subset spr.addRandomForest("rf_add2rem1",2,200,0,10); spr.train(verbose); // save spr.saveClassifier("rf_add2rem1","rf_add2rem1.spr"); // reload data spr.chooseAllVars(); spr.loadDataFromAscii(1,"cleveland.data","train"); spr.split(0.7,true,2627277); spr.chooseClasses("0:."); // reload save classifiers spr.loadClassifier("rf","rf.spr"); spr.loadClassifier("rf_5vars","rf_5vars.spr"); spr.loadClassifier("rf_6vars","rf_6vars.spr"); spr.loadClassifier("rf_add2rem1","rf_add2rem1.spr"); // rerun test spr.test(); // get the signal-vs-bgrnd curve const int ntrained = spr.nTrained(); const int npts = 9; double signalEff [npts] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 }; double bgrndEff [ntrained*npts]; double bgrndErr [ntrained*npts]; double fom [ntrained*npts]; char classifiers[ntrained][200]; spr.allEffCurves(npts,signalEff,classifiers,bgrndEff,bgrndErr,fom); plotEffCurveMulti("SPR_RF",ntrained,npts,signalEff, classifiers,bgrndEff,bgrndErr,0); // exit return 0; }
/* The following shows how to apply a PCA transformation to a subset of input variables. You need to prepare tmva_subset_root.pat. Copy tmva_root.pat and remove var2 on the Leaves: line. This will force the reader to read in only 3 variables: var1, var3, and var4. After you plot the results, you can see that only variables var1, var3 and var4 have been decorrelated. Variable var2 shows substantial correlation with others. */ int spr_transform_2() { // load lib gSystem->Load("/afs/cern.ch/user/n/narsky/w0/CMSSW_1_8_X_2007-11-09-0200/lib/slc4_ia32_gcc345/libPhysicsToolsStatPatternRecognition.so"); // create main SPR object SprRootAdapter spr; // load training data spr.loadDataFromRoot("tmva_subset_root.pat","train"); // split data into train/test spr.split(0.5,true); // choose classes spr.chooseClasses("0:1"); // show how much data we have const int nClasses = spr.nClasses(); char classes[nClasses][200]; int events [nClasses]; double weights [nClasses]; spr.showDataInClasses(classes,events,weights,"train"); plotClasses("SPR_Class_1","train",nClasses,classes,events,weights); spr.showDataInClasses(classes,events,weights,"test"); plotClasses("SPR_Class_2","test",nClasses,classes,events,weights); // perform PCA int verbose = 0; spr.trainVarTransformer("PCA",verbose); spr.saveVarTransformer("pca.spr"); // reload training and test data, now with all variables included spr.loadDataFromRoot("tmva_root.pat","train"); spr.split(0.5,true); spr.chooseClasses("0:1"); // compute correlations between variables before PCA transform const unsigned dim = spr.dim(); double corr [dim*dim]; char vars[dim][200]; spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_1B","background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_1S","signal",dim,vars,corr); // apply PCA transform spr.transform(); // Plot correlations after PCA transform. spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_2B","decorrelated background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_2S","decorrelated signal",dim,vars,corr); // exit return 0; }
int spr_transform_1() { // load lib gSystem->Load("/afs/cern.ch/user/n/narsky/w0/CMSSW_1_8_X_2007-11-29-1600/lib/slc4_ia32_gcc345/libPhysicsToolsStatPatternRecognition.so"); // create main SPR object SprRootAdapter spr; // load training data spr.loadDataFromRoot("tmva_root.pat","train"); // split data into train/test spr.split(0.5,true); // choose classes spr.chooseClasses("0:1"); // show how much data we have const int nClasses = spr.nClasses(); char classes[nClasses][200]; int events [nClasses]; double weights [nClasses]; spr.showDataInClasses(classes,events,weights,"train"); plotClasses("SPR_Class_1","train",nClasses,classes,events,weights); spr.showDataInClasses(classes,events,weights,"test"); plotClasses("SPR_Class_2","test",nClasses,classes,events,weights); // compute correlations between variables const unsigned dim = spr.dim(); double corr [dim*dim]; char vars[dim][200]; spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_1B","background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_1S","signal",dim,vars,corr); // save original test data spr.saveTestData("test_orig.root"); // perform PCA int verbose = 0; spr.trainVarTransformer("PCA",verbose); spr.transform(); // Plot correlations after PCA transform. // Note that PCA transform does not change dimensionality. // But it changes variable names! spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_2B","decorrelated background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_2S","decorrelated signal",dim,vars,corr); // save transformer to a file for future reference spr.saveVarTransformer("pca.spr"); // select classifiers spr.addBoostedBinarySplits("splits",100,20); // train spr.train(verbose); // test spr.test(); // get signal-vs-background curve for classifiers const int npts = 9; double signalEff [npts] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 }; const int ntrained = spr.nTrained(); double bgrndEff [npts*ntrained]; double bgrndErr [npts*ntrained]; double fom [npts*ntrained]; char classifiers[ntrained][200]; spr.allEffCurves(npts,signalEff,classifiers,bgrndEff,bgrndErr,fom); plotEffCurveMulti("SPR_3",ntrained,npts,signalEff, classifiers,bgrndEff,bgrndErr,0); // save transformed test data with classifier responses spr.saveTestData("test_transformed.root"); // exit return 0; }
int spr_mlp() { // load lib gSystem->Load("/afs/cern.ch/user/n/narsky/w0/CMSSW_1_8_X_2007-11-29-1600/lib/slc4_ia32_gcc345/libPhysicsToolsStatPatternRecognition.so"); // create main SPR object SprRootAdapter spr; // load training data spr.loadDataFromRoot("mlp_root.pat","train"); // split data into train/test spr.split(0.5,true); // choose classes spr.chooseClasses("0:1"); // show how much data we have const int nClasses = spr.nClasses(); char classes[nClasses][200]; int events [nClasses]; double weights [nClasses]; spr.showDataInClasses(classes,events,weights,"train"); plotClasses("SPR_Class_1","train",nClasses,classes,events,weights); spr.showDataInClasses(classes,events,weights,"test"); plotClasses("SPR_Class_2","test",nClasses,classes,events,weights); // compute correlations between variables const unsigned dim = spr.dim(); double corr [dim*dim]; char vars[dim][200]; spr.vars(vars); spr.correlation(0,corr,"train");// background plotCorrelation("SPR_1B","background",dim,vars,corr); spr.correlation(1,corr,"train");// signal plotCorrelation("SPR_1S","signal",dim,vars,corr); // select classifiers spr.addFisher("fisher",1); spr.addStdBackprop("mlp","2:5:3:1",100,0.1,0.5,100,5); spr.addRandomForest("rf",100,400,1,20); spr.addRandomForest("arcx4",100,400,1,20,true); // train int verbose = 0;// use >0 to increase verbosity level spr.train(verbose); // test spr.test(); // get the signal-vs-bgrnd curve const int npts = 9; double signalEff [npts] = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 }; const int ntrained = spr.nTrained(); double bgrndEff [npts*ntrained]; double bgrndErr [npts*ntrained]; double fom [npts*ntrained]; char classifiers[ntrained][200]; spr.allEffCurves(npts,signalEff,classifiers,bgrndEff,bgrndErr,fom); plotEffCurveMulti("SPR_3",ntrained,npts,signalEff, classifiers,bgrndEff,bgrndErr,0); // histogram output of ArcX4 for signal and background const int nbin = 40; double xlo = 0.; double xhi = 0.7; double sig[nbin], sigerr[nbin], bgr[nbin], bgrerr[nbin]; spr.histogram("arcx4",xlo,xhi,nbin,sig,sigerr,bgr,bgrerr); plotHistogram("SPR_6","log","ArcX4 output", xlo,xhi,nbin,sig,sigerr,bgr,bgrerr); // save NN into a file spr.saveClassifier("mlp","mlp.spr"); // save test data with classifier response into a root file spr.saveTestData("mytest.root"); // estimate importance of all variables // use 3 permutations per variable const unsigned nVars = spr.nClassifierVars("arcx4"); char vars[nVars][200]; double importance [nVars]; double impError [nVars]; spr.variableImportance("arcx4",3,vars,importance,impError); plotImportance("SPR_7","Variable Importance for ArcX4", nVars,vars,importance,impError,true); // exit return 0; }