void test_pontius () { size_t i, j; gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (pontius_n, pontius_p); gsl_multifit_robust_workspace * work_rob = gsl_multifit_robust_alloc (gsl_multifit_robust_ols, pontius_n, pontius_p); gsl_matrix * X = gsl_matrix_alloc (pontius_n, pontius_p); gsl_vector_view y = gsl_vector_view_array (pontius_y, pontius_n); gsl_vector * c = gsl_vector_alloc (pontius_p); gsl_vector * r = gsl_vector_alloc (pontius_n); gsl_matrix * cov = gsl_matrix_alloc (pontius_p, pontius_p); double chisq, chisq_res; double expected_c[3] = { 0.673565789473684E-03, 0.732059160401003E-06, -0.316081871345029E-14}; double expected_sd[3] = { 0.107938612033077E-03, 0.157817399981659E-09, 0.486652849992036E-16 }; double expected_chisq = 0.155761768796992E-05; gsl_vector_view diag = gsl_matrix_diagonal (cov); gsl_vector_view exp_c = gsl_vector_view_array(expected_c, pontius_p); gsl_vector_view exp_sd = gsl_vector_view_array(expected_sd, pontius_p); for (i = 0 ; i < pontius_n; i++) { for (j = 0; j < pontius_p; j++) { gsl_matrix_set(X, i, j, pow(pontius_x[i], j)); } } /* test unweighted least squares */ gsl_multifit_linear (X, &y.vector, c, cov, &chisq, work); gsl_multifit_linear_residuals(X, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq_res); test_pontius_results("pontius gsl_multifit_linear", c, &exp_c.vector, &diag.vector, &exp_sd.vector, chisq, chisq_res, expected_chisq); /* test robust least squares */ gsl_multifit_robust (X, &y.vector, c, cov, work_rob); test_pontius_results("pontius gsl_multifit_robust", c, &exp_c.vector, &diag.vector, &exp_sd.vector, 1.0, 1.0, 1.0); /* test weighted least squares */ { gsl_vector * w = gsl_vector_alloc (pontius_n); double expected_cov[3][3] ={ {2.76754385964916e-01 , -3.59649122807024e-07, 9.74658869395731e-14}, {-3.59649122807024e-07, 5.91630591630603e-13, -1.77210703526497e-19}, {9.74658869395731e-14, -1.77210703526497e-19, 5.62573661988878e-26} }; gsl_vector_set_all (w, 1.0); gsl_multifit_wlinear (X, w, 
&y.vector, c, cov, &chisq, work); gsl_multifit_linear_residuals(X, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq_res); test_pontius_results("pontius gsl_multifit_wlinear", c, &exp_c.vector, NULL, NULL, chisq, chisq_res, expected_chisq); for (i = 0; i < pontius_p; i++) { for (j = 0; j < pontius_p; j++) { gsl_test_rel (gsl_matrix_get(cov,i,j), expected_cov[i][j], 1e-10, "pontius gsl_multifit_wlinear cov(%d,%d)", i, j) ; } } gsl_vector_free(w); } gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_matrix_free(X); gsl_multifit_linear_free (work); gsl_multifit_robust_free (work_rob); }
void test_longley () { gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (longley_n, longley_p); gsl_multifit_robust_workspace * work_rob = gsl_multifit_robust_alloc (gsl_multifit_robust_ols, longley_n, longley_p); gsl_matrix_view X = gsl_matrix_view_array (longley_x, longley_n, longley_p); gsl_vector_view y = gsl_vector_view_array (longley_y, longley_n); gsl_vector * c = gsl_vector_alloc (longley_p); gsl_vector * r = gsl_vector_alloc (longley_n); gsl_matrix * cov = gsl_matrix_alloc (longley_p, longley_p); double chisq, chisq_res; double expected_c[7] = { -3482258.63459582, 15.0618722713733, -0.358191792925910E-01, -2.02022980381683, -1.03322686717359, -0.511041056535807E-01, 1829.15146461355 }; double expected_sd[7] = { 890420.383607373, 84.9149257747669, 0.334910077722432E-01, 0.488399681651699, 0.214274163161675, 0.226073200069370, 455.478499142212 } ; double expected_chisq = 836424.055505915; gsl_vector_view diag = gsl_matrix_diagonal (cov); gsl_vector_view exp_c = gsl_vector_view_array(expected_c, longley_p); gsl_vector_view exp_sd = gsl_vector_view_array(expected_sd, longley_p); /* test unweighted least squares */ gsl_multifit_linear (&X.matrix, &y.vector, c, cov, &chisq, work); gsl_multifit_linear_residuals(&X.matrix, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq_res); test_longley_results("longley gsl_multifit_linear", c, &exp_c.vector, &diag.vector, &exp_sd.vector, chisq, chisq_res, expected_chisq); /* test robust least squares */ gsl_multifit_robust (&X.matrix, &y.vector, c, cov, work_rob); test_longley_results("longley gsl_multifit_robust", c, &exp_c.vector, &diag.vector, &exp_sd.vector, 1.0, 1.0, 1.0); /* test weighted least squares */ { size_t i, j; gsl_vector * w = gsl_vector_alloc (longley_n); double expected_cov[7][7] = { { 8531122.56783558, -166.727799925578, 0.261873708176346, 3.91188317230983, 1.1285582054705, -0.889550869422687, -4362.58709870581}, {-166.727799925578, 0.0775861253030891, -1.98725210399982e-05, 
-0.000247667096727256, -6.82911920718824e-05, 0.000136160797527761, 0.0775255245956248}, {0.261873708176346, -1.98725210399982e-05, 1.20690316701888e-08, 1.66429546772984e-07, 3.61843600487847e-08, -6.78805814483582e-08, -0.00013158719037715}, {3.91188317230983, -0.000247667096727256, 1.66429546772984e-07, 2.56665052544717e-06, 6.96541409215597e-07, -9.00858307771567e-07, -0.00197260370663974}, {1.1285582054705, -6.82911920718824e-05, 3.61843600487847e-08, 6.96541409215597e-07, 4.94032602583969e-07, -9.8469143760973e-08, -0.000576921112208274}, {-0.889550869422687, 0.000136160797527761, -6.78805814483582e-08, -9.00858307771567e-07, -9.8469143760973e-08, 5.49938542664952e-07, 0.000430074434198215}, {-4362.58709870581, 0.0775255245956248, -0.00013158719037715, -0.00197260370663974, -0.000576921112208274, 0.000430074434198215, 2.23229587481535 }} ; gsl_vector_set_all (w, 1.0); gsl_multifit_wlinear (&X.matrix, w, &y.vector, c, cov, &chisq, work); gsl_multifit_linear_residuals(&X.matrix, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq_res); test_longley_results("longley gsl_multifit_wlinear", c, &exp_c.vector, NULL, NULL, chisq, chisq_res, expected_chisq); for (i = 0; i < longley_p; i++) { for (j = 0; j < longley_p; j++) { gsl_test_rel (gsl_matrix_get(cov,i,j), expected_cov[i][j], 1e-7, "longley gsl_multifit_wlinear cov(%d,%d)", i, j) ; } } gsl_vector_free(w); } gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_multifit_linear_free (work); gsl_multifit_robust_free (work_rob); } /* test_longley() */
void test_pontius () { size_t i, j; { gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (pontius_n, pontius_p); gsl_matrix * X = gsl_matrix_alloc (pontius_n, pontius_p); gsl_vector_view y = gsl_vector_view_array (pontius_y, pontius_n); gsl_vector * c = gsl_vector_alloc (pontius_p); gsl_vector * r = gsl_vector_alloc (pontius_n); gsl_matrix * cov = gsl_matrix_alloc (pontius_p, pontius_p); gsl_vector_view diag; double chisq; double expected_c[3] = { 0.673565789473684E-03, 0.732059160401003E-06, -0.316081871345029E-14}; double expected_sd[3] = { 0.107938612033077E-03, 0.157817399981659E-09, 0.486652849992036E-16 }; double expected_chisq = 0.155761768796992E-05; for (i = 0 ; i < pontius_n; i++) { for (j = 0; j < pontius_p; j++) { gsl_matrix_set(X, i, j, pow(pontius_x[i], j)); } } gsl_multifit_linear (X, &y.vector, c, cov, &chisq, work); gsl_test_rel (gsl_vector_get(c,0), expected_c[0], 1e-10, "pontius gsl_fit_multilinear c0") ; gsl_test_rel (gsl_vector_get(c,1), expected_c[1], 1e-10, "pontius gsl_fit_multilinear c1") ; gsl_test_rel (gsl_vector_get(c,2), expected_c[2], 1e-10, "pontius gsl_fit_multilinear c2") ; diag = gsl_matrix_diagonal (cov); gsl_test_rel (gsl_vector_get(&diag.vector,0), pow(expected_sd[0],2.0), 1e-10, "pontius gsl_fit_multilinear cov00") ; gsl_test_rel (gsl_vector_get(&diag.vector,1), pow(expected_sd[1],2.0), 1e-10, "pontius gsl_fit_multilinear cov11") ; gsl_test_rel (gsl_vector_get(&diag.vector,2), pow(expected_sd[2],2.0), 1e-10, "pontius gsl_fit_multilinear cov22") ; gsl_test_rel (chisq, expected_chisq, 1e-10, "pontius gsl_fit_multilinear chisq") ; gsl_multifit_linear_residuals(X, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq); gsl_test_rel (chisq, expected_chisq, 1e-10, "pontius gsl_fit_multilinear residuals") ; gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_matrix_free(X); gsl_multifit_linear_free (work); } { gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (pontius_n, pontius_p); gsl_matrix * X = 
gsl_matrix_alloc (pontius_n, pontius_p); gsl_vector_view y = gsl_vector_view_array (pontius_y, pontius_n); gsl_vector * w = gsl_vector_alloc (pontius_n); gsl_vector * c = gsl_vector_alloc (pontius_p); gsl_vector * r = gsl_vector_alloc (pontius_n); gsl_matrix * cov = gsl_matrix_alloc (pontius_p, pontius_p); double chisq; double expected_c[3] = { 0.673565789473684E-03, 0.732059160401003E-06, -0.316081871345029E-14}; double expected_chisq = 0.155761768796992E-05; double expected_cov[3][3] ={ {2.76754385964916e-01 , -3.59649122807024e-07, 9.74658869395731e-14}, {-3.59649122807024e-07, 5.91630591630603e-13, -1.77210703526497e-19}, {9.74658869395731e-14, -1.77210703526497e-19, 5.62573661988878e-26} }; for (i = 0 ; i < pontius_n; i++) { for (j = 0; j < pontius_p; j++) { gsl_matrix_set(X, i, j, pow(pontius_x[i], j)); } } gsl_vector_set_all (w, 1.0); gsl_multifit_wlinear (X, w, &y.vector, c, cov, &chisq, work); gsl_test_rel (gsl_vector_get(c,0), expected_c[0], 1e-10, "pontius gsl_fit_multilinear c0") ; gsl_test_rel (gsl_vector_get(c,1), expected_c[1], 1e-10, "pontius gsl_fit_multilinear c1") ; gsl_test_rel (gsl_vector_get(c,2), expected_c[2], 1e-10, "pontius gsl_fit_multilinear c2") ; for (i = 0; i < pontius_p; i++) { for (j = 0; j < pontius_p; j++) { gsl_test_rel (gsl_matrix_get(cov,i,j), expected_cov[i][j], 1e-10, "pontius gsl_fit_wmultilinear cov(%d,%d)", i, j) ; } } gsl_test_rel (chisq, expected_chisq, 1e-10, "pontius gsl_fit_wmultilinear chisq") ; gsl_multifit_linear_residuals(X, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq); gsl_test_rel (chisq, expected_chisq, 1e-10, "pontius gsl_fit_wmultilinear residuals") ; gsl_vector_free(w); gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_matrix_free(X); gsl_multifit_linear_free (work); } }
/*
gsl_multifit_robust()
  Robust linear fit of y ~ X c by iteratively reweighted least squares.

  The routine starts from an ordinary least squares solution, then
  repeatedly (1) scales the residuals by a leverage factor, a MAD based
  sigma estimate and the tuning constant, (2) turns them into weights with
  the workspace's weight function, and (3) solves the weighted problem,
  until robust_test_convergence() reports convergence or more than
  w->maxiter iterations have been attempted.

Inputs: X   - observation matrix; must be w->n by w->p
        y   - observation vector; size must equal the rows of X
        c   - (output) fitted coefficients; size must equal the columns of X
        cov - (output) covariance matrix; square, size equal to c->size
        w   - workspace; also receives the fit statistics in w->stats
              (sigma_ols, sigma_mad, sigma_rob, sigma, numit, Rsq,
              adj_Rsq, rmse, sse)

Return: the first non-zero status returned by one of the checked helper
        calls, otherwise the value produced on the final lines below.
        NOTE(review): the size checks and the "maximum iterations
        exceeded" case rely on GSL_ERROR (defined elsewhere) to report
        and return the given error code - confirm.
*/
int
gsl_multifit_robust(const gsl_matrix * X,
                    const gsl_vector * y,
                    gsl_vector * c,
                    gsl_matrix * cov,
                    gsl_multifit_robust_workspace *w)
{
  /* check matrix and vector sizes */
  if (X->size1 != y->size)
    {
      GSL_ERROR
        ("number of observations in y does not match rows of matrix X",
         GSL_EBADLEN);
    }
  else if (X->size2 != c->size)
    {
      GSL_ERROR ("number of parameters c does not match columns of matrix X",
                 GSL_EBADLEN);
    }
  else if (cov->size1 != cov->size2)
    {
      GSL_ERROR ("covariance matrix is not square", GSL_ENOTSQR);
    }
  else if (c->size != cov->size1)
    {
      GSL_ERROR
        ("number of parameters does not match size of covariance matrix",
         GSL_EBADLEN);
    }
  else if (X->size1 != w->n || X->size2 != w->p)
    {
      GSL_ERROR
        ("size of workspace does not match size of observation matrix",
         GSL_EBADLEN);
    }
  else
    {
      int s;          /* status of the most recent helper call */
      double chisq;   /* filled in by the least squares solvers; not used otherwise */
      const double tol = GSL_SQRT_DBL_EPSILON; /* tolerance handed to robust_test_convergence() */
      int converged = 0;
      size_t numit = 0; /* number of reweighting iterations started */
      const size_t n = y->size;
      double sigy = gsl_stats_sd(y->data, y->stride, n);
      double sig_lower;
      size_t i;

      /*
       * if the initial fit is very good, then finding outliers by comparing
       * them to the residual standard deviation is difficult. Therefore we
       * set a lower bound on the standard deviation estimate that is a small
       * fraction of the standard deviation of the data values
       */
      sig_lower = 1.0e-6 * sigy;
      /* constant data (sigy == 0) would give a zero bound; use 1.0 instead */
      if (sig_lower == 0.0)
        sig_lower = 1.0;

      /* compute initial estimates using ordinary least squares */
      s = gsl_multifit_linear(X, y, c, cov, &chisq, w->multifit_p);
      if (s)
        return s;

      /* save Q S^{-1} of original matrix */
      /* NOTE(review): presumably needed because the weighted solves below
       * reuse w->multifit_p and overwrite its QSI and D - confirm */
      gsl_matrix_memcpy(w->QSI, w->multifit_p->QSI);
      gsl_vector_memcpy(w->D, w->multifit_p->D);

      /* compute statistical leverage of each data point */
      s = gsl_linalg_SV_leverage(w->multifit_p->A, w->resfac);
      if (s)
        return s;

      /* correct residuals with factor 1 / sqrt(1 - h) */
      for (i = 0; i < n; ++i)
        {
          double h = gsl_vector_get(w->resfac, i);

          /* cap the leverage so that 1 / sqrt(1 - h) stays finite */
          if (h > 0.9999)
            h = 0.9999;

          gsl_vector_set(w->resfac, i, 1.0 / sqrt(1.0 - h));
        }

      /* compute residuals from OLS fit r = y - X c */
      s = gsl_multifit_linear_residuals(X, y, c, w->r);
      if (s)
        return s;

      /* compute estimate of sigma from ordinary least squares */
      w->stats.sigma_ols = gsl_blas_dnrm2(w->r) / sqrt((double) w->stats.dof);

      /*
       * numit is incremented only when the previous pass did not converge;
       * if the loop ends because the iteration limit was hit, numit is
       * left at w->maxiter + 1, which is what the check at the end of
       * this function tests for
       */
      while (!converged && ++numit <= w->maxiter)
        {
          double sig;

          /* adjust residuals by statistical leverage (see DuMouchel and O'Brien) */
          s = gsl_vector_mul(w->r, w->resfac);
          if (s)
            return s;

          /* compute estimate of standard deviation using MAD */
          sig = robust_madsigma(w->r, w);

          /* scale residuals by standard deviation and tuning parameter */
          gsl_vector_scale(w->r, 1.0 / (GSL_MAX(sig, sig_lower) * w->tune));

          /* compute weights using these residuals */
          s = w->type->wfun(w->r, w->weights);
          if (s)
            return s;

          /* keep the previous coefficients for the convergence test */
          gsl_vector_memcpy(w->c_prev, c);

          /* solve weighted least squares with new weights */
          s = gsl_multifit_wlinear(X, w->weights, y, c, cov, &chisq, w->multifit_p);
          if (s)
            return s;

          /* compute new residuals r = y - X c */
          s = gsl_multifit_linear_residuals(X, y, c, w->r);
          if (s)
            return s;

          converged = robust_test_convergence(w->c_prev, c, tol);
        }

      /* compute final MAD sigma */
      w->stats.sigma_mad = robust_madsigma(w->r, w);

      /* compute robust estimate of sigma */
      w->stats.sigma_rob = robust_robsigma(w->r, w->stats.sigma_mad, w->tune, w);

      /* compute final estimate of sigma */
      w->stats.sigma = robust_sigma(w->stats.sigma_ols, w->stats.sigma_rob, w);

      /* store number of iterations */
      w->stats.numit = numit;

      {
        double dof = (double) w->stats.dof;
        double rnorm = w->stats.sigma * sqrt(dof); /* see DuMouchel, sec 4.2 */
        double ss_err = rnorm * rnorm;
        double ss_tot = gsl_stats_tss(y->data, y->stride, n);

        /* compute R^2 */
        w->stats.Rsq = 1.0 - ss_err / ss_tot;

        /* compute adjusted R^2 */
        w->stats.adj_Rsq = 1.0 - (1.0 - w->stats.Rsq) * (n - 1.0) / dof;

        /* compute rmse */
        w->stats.rmse = sqrt(ss_err / dof);

        /* store SSE */
        w->stats.sse = ss_err;
      }

      /* calculate covariance matrix = sigma^2 (X^T X)^{-1} */
      s = robust_covariance(w->stats.sigma, cov, w);
      if (s)
        return s;

      /* raise an error if not converged */
      if (numit > w->maxiter)
        {
          GSL_ERROR("maximum iterations exceeded", GSL_EMAXITER);
        }

      /* s is zero here: every non-zero status has already been returned */
      return s;
    }
} /* gsl_multifit_robust() */
int main(int argc, char* argv[]) { // parameters that you can set. string delim = "\t "; string chipFile = ""; vector<string> ctrlFiles; string outFile = ""; int readLen = 50; int chunkSize = 100000; int windowSize = 5; int interval = 5; bool talk = false; string errorLine = "usage " + string(argv[0]) + " [Parameters]\n" + "\t-i <infile, BED-formated file containing the ChIP-reads, sorted on chromosome and position.>\n" + "\t-c <space\\tab separataed list of infile(s), BED-formated file(s) \n" + "\t containing the control-reads (e.g. Input/IgG et cetera), sorted as the file given in '-i' \n" + "\t-o <outfile, BED-formated file of resulting reads after normalization, \n" + "\t with read lengths as defined by -l>\n" + "\t-rl <read length, defaults to 50 >\n" + "\t-cs <chunk size, number of bp considered at a time when building the model>\n" + "\t-ws <window size, at every point used to build the model a window of +/- \n" + "\t this size is averaged to create an observed data point.>\n" + "\t-iv <interval, the step size determining the distance between points \n" + "\t used as observations in the regression model.>\n" + "\t-v <set verbose>\n" + "example: \n" + string(argv[0]) + " -i myreads.bed -c input.bed igg.bed noise.bed -o normalized.bed -rl 50 -cs 100000 -ws 5 -iv 5 \n" ; bool fail = false; bool ctrlfiles = false; string failmessage = ""; for (int i=1;i<argc;i++) { if(strcmp(argv[i],"-i") == 0) { chipFile.assign(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-o") == 0) { outFile.assign(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-c") == 0) { ctrlfiles = true; } else if(strcmp(argv[i],"-rl") == 0) { readLen = atoi(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-cs") == 0) { chunkSize = atoi(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-ws") == 0) { windowSize = atoi(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-iv") == 0) { interval = atoi(argv[++i]); ctrlfiles = false; } else if(strcmp(argv[i],"-v") == 0) 
{ talk = true; ctrlfiles = false; } else { if(ctrlfiles) // assume that all things not parsable after -c are control files. Check for existance/readability below. { ctrlFiles.push_back(argv[i]); } else { failmessage.assign("Unknown argument: "); failmessage.append(argv[i]); failmessage.append("\n"); fail = true; } } } // Check infile and readability. if(chipFile == "") { failmessage.append("infile (-i) must be specified.\n"); fail = true; } ifstream inf; inf.open(chipFile.c_str()); if(!inf) { failmessage.append("Could not open infile '" + chipFile + "' (does the file exist?)\n"); fail = true; } // Check control files. if(ctrlFiles.size() < 1) { failmessage.append("at least one control file (-c) must be specified.\n"); fail = true; } ifstream infc[ctrlFiles.size()]; if(!fail) for (int i = 0;i<ctrlFiles.size();i++) { infc[i].open(ctrlFiles[i].c_str()); if(!infc[i]) { failmessage.append("Could not open ctrlfile '" + ctrlFiles[i] + "' (does the file exist?)\n"); fail = true; } } // Check outfile and readability. if(outFile == "") { failmessage.append("outfile (-o) must be specified.\n"); fail = true; } ofstream outf; if(!fail) outf.open(outFile.c_str(),ios::trunc); if(!outf) { failmessage.append("Could not open outfile '" + outFile + "' (do we have permission ?)\n"); fail = true; } // are we ok so far? if (fail) { cerr << endl << failmessage.c_str() << endl << errorLine << endl; //try and close any opened files inf.close(); for (int i = 0;i<ctrlFiles.size();i++) infc[i].close(); outf.close(); return(1); } /* * Get some initial parameters. 
*/ map <string,seqStats> seqMapChip; map <string,seqStats> *seqMapCtrls; seqMapCtrls = new map<string,seqStats>[ctrlFiles.size()]; map <string,seqStats>::iterator it; map <int,int*>::iterator valIt; // Read the reference sequences and the range of each file cout<<"Reading BED-files."<<endl; int nlinesChIP, nlinesCtrl=0; cout<<"ChIP file.."<<endl; nlinesChIP = initControlBEDlite(&inf,&seqMapChip,0,1,2,5,true); cout<<"Control file(s) .."<<endl; for (int i = 0;i<ctrlFiles.size();i++) nlinesCtrl += initControlBEDlite(&infc[i],&seqMapCtrls[i],0,1,2,5,true); cout<<"ChIP-data consists of "<<nlinesChIP<<" mapped fragments."<<endl; cout<<"Control-data consists of "<<nlinesCtrl<<" mapped fragments."<<endl; // print some stats. cout <<"ChIP Read Statistics::"<<endl; cout <<setw(10)<<"Name\t"<<setw(10)<<"minCrd\t"<<setw(10)<<"maxCrd\t"<<setw(10)<<"F_counts\t"<<setw(10)<<"R_counts\t"<<endl; for ( it=seqMapChip.begin() ; it != seqMapChip.end(); it++ ) { cout <<setw(10)<< (*it).first << "\t" <<setw(10)<< (*it).second.minPos << "\t" << setw(10)<<(*it).second.maxPos<<"\t"; cout <<setw(10)<< (*it).second.countF << "\t" <<setw(10)<< (*it).second.countR<<endl; } cout <<"Control Statistics::"<<endl; for (int i = 0;i<ctrlFiles.size();i++) { cout<<ctrlFiles[i]<<endl; cout <<setw(10)<<"Name\t"<<setw(10)<<"minCrd\t"<<setw(10)<<"maxCrd\t"<<setw(10)<<"F_counts\t"<<setw(10)<<"R_counts\t"<<endl; for ( it=seqMapCtrls[i].begin() ; it != seqMapCtrls[i].end(); it++ ) { cout <<setw(10)<< (*it).first << "\t" <<setw(10)<< (*it).second.minPos << "\t" << setw(10)<<(*it).second.maxPos<<"\t"; cout <<setw(10)<< (*it).second.countF << "\t" <<setw(10)<< (*it).second.countR<<endl; } } cout<<"Processing reads in chunks of "<<chunkSize<<" bp."<<endl; int lowPos,highPos; int chunkRange; int chunkObs; int obsCount; int *winSumF = new int[ctrlFiles.size()+1]; int *winSumR = new int[ctrlFiles.size()+1]; double chisqF,chisqR; gsl_matrix *XF, *covF,*XR, *covR; gsl_vector *yF,*cF,*rF,*yR,*cR,*rR; double 
*resiF,*resiR; int *cntF,*cntR; int posOff; // initialize. string line; inputLine chipLine; inputLine *ctrlLines; ctrlLines = new inputLine[ctrlFiles.size()]; // read first line from each file. check position and chr. assume that the files are ordered inside chr. i.e don't loop // over chr by seqMap but over info in the files. retrieve min/max position from the seqMap depending on file contents. // also assume that the chr ordering is the same in chip & control files. getline(inf,line); parseBEDline(line,&chipLine,0,1,2,5); for(int i=0;i<ctrlFiles.size();i++) { getline(infc[i],line); parseBEDline(line,&ctrlLines[i],0,1,2,5); } // initialize with the "first" chromosome and its min/max pos. string currChr = chipLine.seq; int chrMinPos,chrMaxPos; int chrMinPosCtrl,chrMaxPosCtrl; int currLine = 1; int memNeeded; int chrPosChip,chrPosCtrl; int ctrlIndex; // tmp. storage for the chip/control signals. unsigned short *chipF,*chipR,*ctrlF,*ctrlR; // introduce curr pos, curr Chr etc. and a loop on !EOF in the chip file. // no point in normalizing where there are no signals in chip... while(currLine <= nlinesChIP) { chrMinPos = seqMapChip.find(currChr)->second.minPos; chrMaxPos = seqMapChip.find(currChr)->second.maxPos;; if(talk) cout<<"ChIP: "<<chipLine.seq<<" "<<chrMinPos<<" "<<chrMaxPos<<endl; chrPosChip = chrMaxPos - chrMinPos +1; // check the min/max for this chr in ctrl-data chrMinPosCtrl = INT_MAX; chrMaxPosCtrl = -1; for(int i = 0;i<ctrlFiles.size();i++) { if(seqMapCtrls[i].count(currChr)) { chrMinPosCtrl = min(chrMinPosCtrl,seqMapCtrls[i][currChr].minPos); chrMaxPosCtrl = max(chrMaxPosCtrl,seqMapCtrls[i][currChr].maxPos); } } if(talk) cout<<"Control: "<<chipLine.seq<<" "<<chrMinPosCtrl<<" "<<chrMaxPosCtrl<<endl; chrPosCtrl = chrMaxPosCtrl - chrMinPosCtrl +1; memNeeded = sizeof(unsigned short)*(chrPosChip + ctrlFiles.size()*chrPosCtrl); // allocate memory to hold the entire chromosome, do the regression in chunks. 
try{ cout<<"Trying to allocate: "; if(memNeeded > 1000000000) cout<<memNeeded/1000000000<<" Gb for "<<currChr<<"."; else if (memNeeded > 1000000) cout<<memNeeded/1000000<<" Mb for "<<currChr<<"."; else if (memNeeded > 1000) cout<<memNeeded/1000<<" kb for "<<currChr<<"."; else cout<<memNeeded<<" bytes for raw signals"<<currChr<<"."; chipF = new unsigned short[chrPosChip]; chipR = new unsigned short[chrPosChip]; ctrlF = new unsigned short[chrPosCtrl*ctrlFiles.size()]; // these will need to be accessed in a "[i + chrPosCtrl*j]"-type of fashion. ctrlR = new unsigned short[chrPosCtrl*ctrlFiles.size()]; cout<<" Done."<<endl; }catch (std::bad_alloc &f){ cerr<<string(argv[0])<<" couldn't allocate as much memory as it wanted. Failure: '"<<f.what()<<endl; // close files. inf.close(); for (int i = 0;i<ctrlFiles.size();i++) infc[i].close(); outf.close(); delete[] chipF; delete[] chipR; delete[] ctrlF; delete[] ctrlR; delete[] resiF; delete[] resiR; delete[] cntF; delete[] cntR; return(-1); } // make sure it's all zeroes. for(int i=0;i<chrPosChip;i++) { chipF[i] = 0; chipR[i] = 0; } for(int i=0;i<chrPosCtrl;i++) for(int j = 0;j<ctrlFiles.size();j++) { ctrlF[i + j*chrPosCtrl] = 0; ctrlR[i + j*chrPosCtrl] = 0; } // read in the sought chip-data while((chipLine.seq == currChr) && !(inf.eof())) // chip-file { //cout<<chipLine.seq<<"\t"<<line<<endl; // update previous line's data. if(chipLine.strand == 1) chipF[chipLine.pos-chrMinPos]++; else chipR[chipLine.pos-chrMinPos+chipLine.len]++; // read in the nextline. getline(inf,line); parseBEDline(line,&chipLine,0,1,2,5); currLine++; } if((chipLine.seq == currChr) && (inf.eof())) // chip-file, last read, ok chr, use. 
{ //cout<<"Last line of the ChipFIle"<<endl; //cout<<chipLine.seq<<"\t"<<line<<endl; if(chipLine.strand == 1) chipF[chipLine.pos-chrMinPos]++; else chipR[chipLine.pos-chrMinPos+chipLine.len-1]++; } // read in the sought ctrl-data for(int i = 0;i<ctrlFiles.size();i++) { // is there data at all for this chr in this control file? if(seqMapCtrls[i].count(currChr) == 1) { // we're assuming that the chromosomes are in the same order in the chip & ctrl files. // cases: // chr on current line is not the same as in chip // => we know that we should have chr data on this chr & that chrs comes in the same order. this can prob. only // happen for a chr-specific chromosome, e.g. its safe to read past and check again. // chr on current line is the same as in chip // => this is good. last time (either preFirst or not) should have read prev. chr completely. so just start reading until we hit // another chr. // if(ctrlLines[i].seq != currChr) { // read past th "wrong" chromosome(s). while(!infc[i].eof() && ctrlLines[i].seq != currChr) { getline(infc[i],line); parseBEDline(line,&ctrlLines[i],0,1,2,5); } } // now we have the first line of the the correct chr in 'ctrlLines[i]' // Read in the complete chr and store the data accordingly. while(!infc[i].eof() && ctrlLines[i].seq == currChr) { if(ctrlLines[i].strand == 1) ctrlF[ctrlLines[i].pos-chrMinPosCtrl + i*chrPosCtrl]++; else ctrlR[ctrlLines[i].pos-chrMinPosCtrl+ctrlLines[i].len + i*chrPosCtrl-1]++; getline(infc[i],line); parseBEDline(line,&ctrlLines[i],0,1,2,5); } } } cout<<"Analysing "<<currChr<<endl; currChr = chipLine.seq; // store "next" chromosome // now all data for this chr is read. Start analysing in chunks. lowPos = chrMinPos; while(lowPos < chrMaxPos) // loop over this chromosome data in chunks. { if(!talk) { cout<<lowPos<<" of "<<chrMaxPos<<"\r"; } highPos = lowPos + chunkSize-1; if(highPos >= (chrMaxPos - 0.5*chunkSize)) // less than 0.8 of a chunk left. merge. 
highPos = chrMaxPos; chunkRange = highPos - lowPos + 1; if(talk) cout<<"["<<lowPos<<","<<highPos<<"]\tsize: "<<chunkRange<<endl; resiF = new double[chunkRange]; resiR = new double[chunkRange]; cntF = new int[chunkRange]; cntR = new int[chunkRange]; for(int i=0;i<chunkRange;i++) { resiF[i] = 0.0; resiR[i] = 0.0; cntF[i] = 0; cntR[i] = 0; } // for each chunk, step forward in 'interval' steps and average signals in that window. chunkObs = (chunkRange-2*windowSize)/interval + 1; if (talk) cout<<"\tsampling this chunk at "<<chunkObs<<" positions."<<endl; // Storage for the signals on '+' XF = gsl_matrix_alloc (chunkObs, ctrlFiles.size()); yF = gsl_vector_alloc (chunkObs); rF = gsl_vector_alloc (chunkObs); cF = gsl_vector_alloc (ctrlFiles.size()); covF = gsl_matrix_alloc (ctrlFiles.size(), ctrlFiles.size()); // Storage for the signals on '-' XR = gsl_matrix_alloc (chunkObs, ctrlFiles.size()); yR = gsl_vector_alloc (chunkObs); rR = gsl_vector_alloc (chunkObs); cR = gsl_vector_alloc (ctrlFiles.size()); covR = gsl_matrix_alloc (ctrlFiles.size(), ctrlFiles.size()); // loop over the signals in interval steps and average in +/- windowSize. fill in the matrices. obsCount = 0; for (int i = lowPos+windowSize;i<(highPos-windowSize);) { // collect sums over each signal in the sough window for (int j = 0;j < ctrlFiles.size() + 1;j++) { winSumF[j] = 0; winSumR[j] = 0; } for (int j = -windowSize;j<=windowSize;j++) { winSumF[0] += chipF[i - chrMinPos + j]; winSumR[0] += chipR[i - chrMinPos + j]; for(int k = 0;k<ctrlFiles.size();k++) { ctrlIndex = i - chrMinPosCtrl + j + k*chrPosCtrl; if(ctrlIndex >= 0 && ctrlIndex <chrPosCtrl) // is there ctrl data for this position? 
{ winSumF[1+k] += ctrlF[ctrlIndex]; winSumR[1+k] += ctrlR[ctrlIndex]; } } } // the chip signal gsl_vector_set (yF, obsCount, (double)winSumF[0]/(double)(2*windowSize+1)); gsl_vector_set (yR, obsCount, (double)winSumR[0]/(double)(2*windowSize+1)); // the control signals for (int j = 0;j < ctrlFiles.size();j++) { gsl_matrix_set (XF, obsCount, j, (double)winSumF[j+1]/(double)(2*windowSize+1)); gsl_matrix_set (XR, obsCount, j, (double)winSumR[j+1]/(double)(2*windowSize+1)); } obsCount++; i+=interval; } // fit the models. gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (chunkObs, ctrlFiles.size()); /* * '+' Strand */ gsl_multifit_linear (XF, yF, cF, covF,&chisqF, work); if(talk) { cout<<"\t'+' chisq: "<<chisqF<<"\t"<<"c's:"; for (int j = 0;j < ctrlFiles.size();j++) cout<<gsl_vector_get(cF,j)<<" "; cout<<endl; } /* * '-' Strand */ gsl_multifit_linear (XR, yR, cR, covR,&chisqR, work); if(talk) { cout<<"\t'-' chisq: "<<chisqR<<"\t"<<"c's:"; for (int j = 0;j < ctrlFiles.size();j++) cout<<gsl_vector_get(cR,j)<<" "; cout<<endl; } gsl_multifit_linear_free (work); // calculate residuals. if(talk) cout<<"\tCaclulating residuals.."; gsl_multifit_linear_residuals (XF,yF,cF,rF); gsl_multifit_linear_residuals (XR,yR,cR,rR); if(talk) cout<<"done."<<endl<<"\tRebuilding signal.."; // rebuild a per-bp-signal for (int i=0;i<chunkObs;i++) { // center of this observation. posOff = 1+(i+1)*interval; if(posOff > chunkRange) // outside of our chunk, should never happen. 
continue; if(gsl_vector_get(rF,i) > 0.5) // original R-code used 'round' on the residuals, ceiling(x-0.5) does the same thing for (int j = -windowSize;j<=windowSize;j++) { resiF[j+posOff] += ceil(gsl_vector_get(rF,i)-0.5); cntF[j+posOff] += 1; } if(gsl_vector_get(rR,i) > 0.5) for (int j = -windowSize;j<=windowSize;j++) { resiR[j+posOff] += ceil(gsl_vector_get(rR,i)-0.5); cntR[j+posOff] += 1; } } if(talk) cout<<"done."<<endl<<"\tWriting output.."; for (int i=0;i<chunkRange;i++) { if(cntF[i] > 0) { resiF[i] = resiF[i]/(double)cntF[i]; if(resiF[i] > 0) { for(int j=0;j<ceil(resiF[i]);j++) { outf<<currChr<<"\t"<<lowPos + i-1<<"\t"<<lowPos+i+readLen-2<<"\tDUMMY\t"; // bed is zero-based, halfopen (ie.-1/-2) outf<<resiF[i]<<"\t+"<<endl; } } } if(cntR[i] > 0) { resiR[i] = resiR[i]/(double)cntR[i]; if(resiR[i] > 0) { for(int j=0;j<ceil(resiR[i]);j++) { outf<<currChr<<"\t"<<lowPos + i-readLen-2<<"\t"<<lowPos+i-1<<"\tDUMMY\t"; outf<<resiR[i]<<"\t-"<<endl; } } } } if(talk) cout<<"done."<<endl; lowPos = highPos+1; delete[] resiF; delete[] resiR; delete[] cntF; delete[] cntR; gsl_matrix_free(XF); gsl_vector_free(yF); gsl_vector_free(rF); gsl_vector_free(cF); gsl_matrix_free(covF); gsl_matrix_free(XR); gsl_vector_free(yR); gsl_vector_free(rR); gsl_vector_free(cR); gsl_matrix_free(covR); } if(!talk) cout<<endl; } // close files. 
inf.close(); for (int i = 0;i<ctrlFiles.size();i++) infc[i].close(); outf.close(); string statFname = "readStats.txt"; bool writeStats = true; ofstream ofc; ofc.open(statFname.c_str(),ios::trunc); if (ofc.fail()) { failmessage.clear(); failmessage.append("ERROR: Output file \""); failmessage.append(statFname.c_str()); failmessage.append("\" could not be created, skipping.\n"); writeStats = false; } if(writeStats) { ofc <<"Chip reads"<<endl<<"Name\t"<<"minCrd\t"<<"maxCrd\t"<<"F_counts\t"<<"R_counts\t"<<endl; for ( it=seqMapChip.begin() ; it != seqMapChip.end(); it++ ) { ofc << (*it).first << "\t" << (*it).second.minPos << "\t" << (*it).second.maxPos<<"\t"; ofc << (*it).second.countF << "\t" << (*it).second.countR; ofc <<endl; } for (int i = 0;i<ctrlFiles.size();i++) { ofc<<ctrlFiles[i]<<endl; ofc <<"Control reads"<<endl<<"Name\t"<<"minCrd\t"<<"maxCrd\t"<<"F_counts\t"<<"R_counts\t"<<endl; for ( it=seqMapCtrls[i].begin() ; it != seqMapCtrls[i].end(); it++ ) { ofc << (*it).first << "\t" << (*it).second.minPos << "\t" << (*it).second.maxPos<<"\t"; ofc << (*it).second.countF << "\t" << (*it).second.countR; ofc <<endl; } } }else{ cerr<<failmessage.c_str()<<endl; } ofc.close(); return(0); }
void test_longley () { size_t i, j; { gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (longley_n, longley_p); gsl_matrix_view X = gsl_matrix_view_array (longley_x, longley_n, longley_p); gsl_vector_view y = gsl_vector_view_array (longley_y, longley_n); gsl_vector * c = gsl_vector_alloc (longley_p); gsl_vector * r = gsl_vector_alloc (longley_n); gsl_matrix * cov = gsl_matrix_alloc (longley_p, longley_p); gsl_vector_view diag; double chisq; double expected_c[7] = { -3482258.63459582, 15.0618722713733, -0.358191792925910E-01, -2.02022980381683, -1.03322686717359, -0.511041056535807E-01, 1829.15146461355 }; double expected_sd[7] = { 890420.383607373, 84.9149257747669, 0.334910077722432E-01, 0.488399681651699, 0.214274163161675, 0.226073200069370, 455.478499142212 } ; double expected_chisq = 836424.055505915; gsl_multifit_linear (&X.matrix, &y.vector, c, cov, &chisq, work); gsl_test_rel (gsl_vector_get(c,0), expected_c[0], 1e-10, "longley gsl_fit_multilinear c0") ; gsl_test_rel (gsl_vector_get(c,1), expected_c[1], 1e-10, "longley gsl_fit_multilinear c1") ; gsl_test_rel (gsl_vector_get(c,2), expected_c[2], 1e-10, "longley gsl_fit_multilinear c2") ; gsl_test_rel (gsl_vector_get(c,3), expected_c[3], 1e-10, "longley gsl_fit_multilinear c3") ; gsl_test_rel (gsl_vector_get(c,4), expected_c[4], 1e-10, "longley gsl_fit_multilinear c4") ; gsl_test_rel (gsl_vector_get(c,5), expected_c[5], 1e-10, "longley gsl_fit_multilinear c5") ; gsl_test_rel (gsl_vector_get(c,6), expected_c[6], 1e-10, "longley gsl_fit_multilinear c6") ; diag = gsl_matrix_diagonal (cov); gsl_test_rel (gsl_vector_get(&diag.vector,0), pow(expected_sd[0],2.0), 1e-10, "longley gsl_fit_multilinear cov00") ; gsl_test_rel (gsl_vector_get(&diag.vector,1), pow(expected_sd[1],2.0), 1e-10, "longley gsl_fit_multilinear cov11") ; gsl_test_rel (gsl_vector_get(&diag.vector,2), pow(expected_sd[2],2.0), 1e-10, "longley gsl_fit_multilinear cov22") ; gsl_test_rel (gsl_vector_get(&diag.vector,3), 
pow(expected_sd[3],2.0), 1e-10, "longley gsl_fit_multilinear cov33") ; gsl_test_rel (gsl_vector_get(&diag.vector,4), pow(expected_sd[4],2.0), 1e-10, "longley gsl_fit_multilinear cov44") ; gsl_test_rel (gsl_vector_get(&diag.vector,5), pow(expected_sd[5],2.0), 1e-10, "longley gsl_fit_multilinear cov55") ; gsl_test_rel (gsl_vector_get(&diag.vector,6), pow(expected_sd[6],2.0), 1e-10, "longley gsl_fit_multilinear cov66") ; gsl_test_rel (chisq, expected_chisq, 1e-10, "longley gsl_fit_multilinear chisq") ; gsl_multifit_linear_residuals(&X.matrix, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq); gsl_test_rel (chisq, expected_chisq, 1e-10, "longley gsl_fit_multilinear residuals") ; gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_multifit_linear_free (work); } { gsl_multifit_linear_workspace * work = gsl_multifit_linear_alloc (longley_n, longley_p); gsl_matrix_view X = gsl_matrix_view_array (longley_x, longley_n, longley_p); gsl_vector_view y = gsl_vector_view_array (longley_y, longley_n); gsl_vector * w = gsl_vector_alloc (longley_n); gsl_vector * c = gsl_vector_alloc (longley_p); gsl_vector * r = gsl_vector_alloc (longley_n); gsl_matrix * cov = gsl_matrix_alloc (longley_p, longley_p); double chisq; double expected_c[7] = { -3482258.63459582, 15.0618722713733, -0.358191792925910E-01, -2.02022980381683, -1.03322686717359, -0.511041056535807E-01, 1829.15146461355 }; double expected_cov[7][7] = { { 8531122.56783558, -166.727799925578, 0.261873708176346, 3.91188317230983, 1.1285582054705, -0.889550869422687, -4362.58709870581}, {-166.727799925578, 0.0775861253030891, -1.98725210399982e-05, -0.000247667096727256, -6.82911920718824e-05, 0.000136160797527761, 0.0775255245956248}, {0.261873708176346, -1.98725210399982e-05, 1.20690316701888e-08, 1.66429546772984e-07, 3.61843600487847e-08, -6.78805814483582e-08, -0.00013158719037715}, {3.91188317230983, -0.000247667096727256, 1.66429546772984e-07, 2.56665052544717e-06, 6.96541409215597e-07, -9.00858307771567e-07, 
-0.00197260370663974}, {1.1285582054705, -6.82911920718824e-05, 3.61843600487847e-08, 6.96541409215597e-07, 4.94032602583969e-07, -9.8469143760973e-08, -0.000576921112208274}, {-0.889550869422687, 0.000136160797527761, -6.78805814483582e-08, -9.00858307771567e-07, -9.8469143760973e-08, 5.49938542664952e-07, 0.000430074434198215}, {-4362.58709870581, 0.0775255245956248, -0.00013158719037715, -0.00197260370663974, -0.000576921112208274, 0.000430074434198215, 2.23229587481535 }} ; double expected_chisq = 836424.055505915; gsl_vector_set_all (w, 1.0); gsl_multifit_wlinear (&X.matrix, w, &y.vector, c, cov, &chisq, work); gsl_test_rel (gsl_vector_get(c,0), expected_c[0], 1e-10, "longley gsl_fit_wmultilinear c0") ; gsl_test_rel (gsl_vector_get(c,1), expected_c[1], 1e-10, "longley gsl_fit_wmultilinear c1") ; gsl_test_rel (gsl_vector_get(c,2), expected_c[2], 1e-10, "longley gsl_fit_wmultilinear c2") ; gsl_test_rel (gsl_vector_get(c,3), expected_c[3], 1e-10, "longley gsl_fit_wmultilinear c3") ; gsl_test_rel (gsl_vector_get(c,4), expected_c[4], 1e-10, "longley gsl_fit_wmultilinear c4") ; gsl_test_rel (gsl_vector_get(c,5), expected_c[5], 1e-10, "longley gsl_fit_wmultilinear c5") ; gsl_test_rel (gsl_vector_get(c,6), expected_c[6], 1e-10, "longley gsl_fit_wmultilinear c6") ; for (i = 0; i < longley_p; i++) { for (j = 0; j < longley_p; j++) { gsl_test_rel (gsl_matrix_get(cov,i,j), expected_cov[i][j], 1e-7, "longley gsl_fit_wmultilinear cov(%d,%d)", i, j) ; } } gsl_test_rel (chisq, expected_chisq, 1e-10, "longley gsl_fit_wmultilinear chisq") ; gsl_multifit_linear_residuals(&X.matrix, &y.vector, c, r); gsl_blas_ddot(r, r, &chisq); gsl_test_rel (chisq, expected_chisq, 1e-10, "longley gsl_fit_wmultilinear residuals") ; gsl_vector_free(w); gsl_vector_free(c); gsl_vector_free(r); gsl_matrix_free(cov); gsl_multifit_linear_free (work); } }