// Loads a new feature space and appends it to the multiset void loadFeatureSpace( string filename, CDataSet< vSparseSample >& ds ) { // Load the data cout<<"Loading "<<filename<<"... "; cout.flush(); CDataSet<CSparseSample> d; parseSparseFile( filename, d, ',', ',', '=' ); cout<<"Loaded "<<d.size()<<" samples; "<<nFeats( d )<<" features"<<endl; // Append the Zhang dataset to the multiset expand( ds, d ); cout<<" The combined dataset is now at "<<ds.size()<<" samples"<<endl; }
int main( int argc, char* argv[] ) { // Display usage and/or parse the parameters if( argc < 2 ) { cout<<"Usage: "<<argv[0]<<" <options file>"<<endl; return -1; } CGOStrutParams params; params.load( argv[1] ); params.display(); typedef CKernelLoss<CSparseSample> CSparseKLoss; CDataSet<vSparseSample>::binop_t fcker = CCompositeSparseKernel( false ); CDataSet<CSparseSample>::binop_t fker = CSparseKernel( true ); CDataSet<CSparseSample>::binop_t floss = CSparseKLoss( fker ); function< double( double, double ) > fioker = CProdJointKernel(); /////////////////////////////// CDataSet< vSparseSample > dsi; // Load the datasets loadFeatureSpace( "mousefunc/ge_zhang.sdat.gz", dsi ); loadFeatureSpace( "mousefunc/ge_su.sdat.gz", dsi ); loadFeatureSpace( "mousefunc/i_adj.sdat", dsi ); loadFeatureSpace( "mousefunc/dd_pfam.sdat", dsi ); loadFeatureSpace( "mousefunc/dd_inter.sdat", dsi ); loadFeatureSpace( "mousefunc/phylo.sdat", dsi ); shared_ptr< CFeatMap > pfm( new CFeatMap ); CDataSet<CSparseSample> dsoTr; CDataSet<CSparseSample> dsoTe; // Load the appropriate set of labels if( GO::hasMF( params.ontology() ) ) loadOntologySpace( "molecular function", "mf", dsoTr, dsoTe, pfm ); if( GO::hasBP( params.ontology() ) ) loadOntologySpace( "biological process", "bp", dsoTr, dsoTe, pfm ); if( GO::hasCC( params.ontology() ) ) loadOntologySpace( "cellular compoennt", "cc", dsoTr, dsoTe, pfm ); if( dsoTr.size() < 1 || dsoTe.size() < 1 ) throw std::runtime_error( "No labels loaded" ); // Append hierarchy to the samples cout<<"Training data output space has "<<dsoTr.size()<<" samples; "<<nFeats( dsoTr )<<" features"<<endl; cout<<"Test data output space has "<<dsoTe.size()<<" samples; "<<nFeats( dsoTe )<<" features"<<endl; // DEBUG /* vector< unsigned int > v; for( unsigned int i = 0; i < 400; i++ ) v.push_back( i ); dsoTr.subsample( v );*/ // END OF DEBUG // Combine the data to form a joint output space typedef CIODataSet<vSparseSample, CSparseSample> CIOSet; shared_ptr< CIOSet > pdsio( new CIOSet(fcker, fker, floss, fioker) ); pdsio->addSets( dsi, dsoTr ); pdsio->addSets( dsi, dsoTe ); cout<<"Input space has "<<pdsio->sizeI()<<" samples"<<endl; cout<<"Output space has "<<pdsio->sizeO()<<" samples, "<<nFeats( pdsio->getO() )<<" features"<<endl; string ont; if( GO::hasMF( params.ontology() ) ) ont += "mf"; if( GO::hasBP( params.ontology() ) ) ont += "bp"; if( GO::hasCC( params.ontology() ) ) ont += "cc"; string fnScores = "mfunc-score-" + params.alg_choice() + "-" + ont + ".sdat"; cout<<"Saving prediction scores to "<<fnScores<<endl; // Re-split the data into training and test sets, both using the same output space now // Retrieve the fold sizes vector< unsigned int > foldSizes; foldSizes.push_back( pdsio->sizeI() - dsoTe.size() ); foldSizes.push_back( dsoTe.size() ); // Create the ranges virange_t vTrain, vTest; splitCV( foldSizes, 1, vTrain, vTest ); display( vTrain, vTest ); // Perform the split shared_ptr< CIOSet > pTrain; shared_ptr< CIOSet > pTest; cout<<"About to split"<<endl; pdsio->splitTrainTest( vTrain, vTest, pTrain, pTest ); cout<<"Finished the split"<<endl; pTrain->cache(); pTest->cache(); // Create and add the appropriate test classifier shared_ptr< CClassifier<vSparseSample, CSparseSample> > pclsf = createClassifier<vSparseSample, CSparseSample>( params ); // Train and test the classifier pclsf->train( pTrain ); vector< double > loss = pclsf->test( pTest ); // Report the results double m = std::accumulate( loss.begin(), loss.end(), 0.0 ) / static_cast<double>( loss.size() ); cout<<"Mean loss per test sample: "<<m<<endl; pTrain->cacheIExternal( pTest->getI() ); predScores( *pclsf, pTest->getI(), fnScores ); return 0; }