// entry main function int main (int argc, char ** argv) { // exception control: illustrate the usage if get input of wrong format if (argc < 5) { cerr << "Usage: cvx_clustering [dataFile] [fw_max_iter] [max_iter] [lambda] " << endl; cerr << "Note: dataFile must be scaled to [0,1] in advance." << endl; exit(-1); } // parse arguments char * dataFile = argv[1]; int fw_max_iter = atoi(argv[2]); int max_iter = atoi(argv[3]); double lambda_base = atof(argv[4]); char * dmatFile = argv[5]; // vector<Instance*> data; // readFixDim (dataFile, data, FIX_DIM); // read in data int FIX_DIM; Parser parser; vector<Instance*>* pdata; vector<Instance*> data; pdata = parser.parseSVM(dataFile, FIX_DIM); data = *pdata; // vector<Instance*> data; // readFixDim (dataFile, data, FIX_DIM); // explore the data int dimensions = -1; int N = data.size(); // data size for (int i = 0; i < N; i++) { vector< pair<int,double> > * f = &(data[i]->fea); int last_index = f->size()-1; if (f->at(last_index).first > dimensions) { dimensions = f->at(last_index).first; } } assert (dimensions == FIX_DIM); int D = dimensions; cerr << "D = " << D << endl; // # features cerr << "N = " << N << endl; // # instances cerr << "lambda = " << lambda_base << endl; cerr << "r = " << r << endl; int seed = time(NULL); srand (seed); cerr << "seed = " << seed << endl; //create lambda with noise double* lambda = new double[N]; for(int i=0; i<N; i++) { lambda[i] = lambda_base + noise(); } // pre-compute distance matrix dist_func df = L2norm; double ** dist_mat = mat_init (N, N); // double ** dist_mat = mat_read (dmatFile, N, N); mat_zeros (dist_mat, N, N); compute_dist_mat (data, dist_mat, N, D, df, true); ofstream dist_mat_out ("dist_mat"); dist_mat_out << mat_toString(dist_mat, N, N); dist_mat_out.close(); // Run sparse convex clustering double ** W = mat_init (N, N); mat_zeros (W, N, N); cvx_clustering (dist_mat, fw_max_iter, max_iter, D, N, lambda, W); ofstream W_OUT("w_out"); W_OUT<< mat_toString(W, N, N); W_OUT.close(); // Output cluster output_objective(clustering_objective (dist_mat, W, N)); /* Output cluster centroids */ output_model (W, N); /* Output assignment */ output_assignment (W, data, N); /* reallocation */ mat_free (dist_mat, N, N); mat_free (W, N, N); }
//////////////////////////////////////////////////////////// // learn_errors // // Correct reads using a much stricter filter in order // to count the nt->nt errors and learn the errors // probabilities //////////////////////////////////////////////////////////// //static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double (&ntnt_prob)[4][4], double prior_prob[4]) { static void learn_errors(string fqf, bithash * trusted, vector<streampos> & starts, vector<unsigned long long> & counts, double ntnt_prob[Read::max_qual][4][4], double prior_prob[4]) { unsigned int ntnt_counts[Read::max_qual][4][4] = {0}; unsigned int samples = 0; unsigned int chunk = 0; #pragma omp parallel //shared(trusted) { unsigned int tchunk; string header,ntseq,strqual,corseq; int trim_length; char* nti; Read *r; ifstream reads_in(fqf.c_str()); while(chunk < threads*chunks_per_thread) { #pragma omp critical tchunk = chunk++; reads_in.seekg(starts[tchunk]); unsigned long long tcount = 0; while(getline(reads_in, header)) { //cout << header << endl; // get sequence getline(reads_in, ntseq); //cout << ntseq << endl; // convert ntseq to iseq vector<unsigned int> iseq; for(int i = 0; i < ntseq.size(); i++) { nti = strchr(nts, ntseq[i]); iseq.push_back(nti - nts); } // get quality values getline(reads_in,strqual); //cout << strqual << endl; getline(reads_in,strqual); //cout << strqual << endl; vector<int> untrusted; if(iseq.size() < trim_t) trim_length = 0; else { for(int i = 0; i < iseq.size()-k+1; i++) { if(!trusted->check(&iseq[i])) { untrusted.push_back(i); } } trim_length = quick_trim(strqual, untrusted); } // fix error reads if(untrusted.size() > 0) { // correct r = new Read(header, &iseq[0], strqual, untrusted, trim_length); corseq = r->correct(trusted, ntnt_prob, prior_prob, true); // if trimmed to long enough if(corseq.size() >= trim_t) { if(r->trusted_read != 0) { // else no guarantee there was a correction for(int c = 0; c < r->trusted_read->corrections.size(); c++) { correction cor = r->trusted_read->corrections[c]; if(iseq[cor.index] < 4) { // P(obs=o|actual=a,a!=o) for Bayes ntnt_counts[strqual[cor.index]-Read::quality_scale][cor.to][iseq[cor.index]]++; // P(actual=a|obs=o,a!=o) //ntnt_counts[iseq[cor.index]][cor.to]++; samples++; } } } } delete r; } if(++tcount == counts[tchunk] || samples > 200000) break; } } reads_in.close(); } regress_probs(ntnt_prob, ntnt_counts); output_model(ntnt_prob, ntnt_counts, fqf); }