// Relabel the three states of 'Z' in place: after the call, state 'k'
// holds what was stored for state 'map[k]' before the call. The
// permutation is applied to the 3 x 3 array 'Q', to the three rows of
// 'p' (each of length r+1), and to every triplet of 'phi' and 'pem'
// (one triplet per observation).
//
// NB: the local copy of 'p' holds 3*64 values, so r+1 must not exceed
// 64 (the caller is expected to have checked the value of 'r').
void reorder ( zerone_t * Z, int * map ) {

   const unsigned int dim = Z->ChIP->r;
   const unsigned int nwin = nobs(Z->ChIP);
   const unsigned int rowlen = dim + 1;

   // Permute rows and columns of 'Q' through a local copy.
   double newQ[9] = {0};
   for (int col = 0 ; col < 3 ; col++) {
      const int src_col = map[col];
      for (int row = 0 ; row < 3 ; row++) {
         newQ[row + col*3] = Z->Q[map[row] + src_col*3];
      }
   }
   memcpy(Z->Q, newQ, sizeof(newQ));

   // Permute the rows of 'p', one whole row at a time.
   double newp[3*64] = {0};
   for (int k = 0 ; k < 3 ; k++) {
      memcpy(newp + k*rowlen, Z->p + map[k]*rowlen,
            rowlen * sizeof(double));
   }
   memcpy(Z->p, newp, 3*rowlen * sizeof(double));

   // Permute every triplet of 'phi' and 'pem'. Triplets are
   // independent of each other, so a single pass does both arrays.
   double triplet[3] = {0};
   for (unsigned int w = 0 ; w < nwin ; w++) {
      double *phi_w = Z->phi + 3*w;
      double *pem_w = Z->pem + 3*w;

      triplet[0] = phi_w[map[0]];
      triplet[1] = phi_w[map[1]];
      triplet[2] = phi_w[map[2]];
      memcpy(phi_w, triplet, sizeof(triplet));

      triplet[0] = pem_w[map[0]];
      triplet[1] = pem_w[map[1]];
      triplet[2] = pem_w[map[2]];
      memcpy(pem_w, triplet, sizeof(triplet));
   }

}
// Fit the 3-state jahmm model to the observations of 'ChIP' and
// compute the Viterbi path.
//
// Steps: the first profile of 'ChIP->y' is fitted with 'mle_zinb()'
// to obtain starting values, the parameters are estimated with
// 'bw_zinm()', the states are relabeled so that the value of p0
// decreases with the state index, and the path computed by
// 'block_viterbi()' is stored in the 'path' member of the result.
//
// Returns the new 'jahmm_t', or NULL on failure (in which case a
// message is printed on stderr).
jahmm_t * do_jahmm ( ChIP_t *ChIP ) {

   // Jahmm uses 3 states.
   const unsigned int m = 3;

   // Extract the dimensions of the observations.
   const unsigned int r = ChIP->r;
   const unsigned int n = nobs(ChIP);

   // Extract the first ChIP profile, which is the sum of
   // negative controls.
   int *ctrl = malloc(n * sizeof *ctrl);
   if (ctrl == NULL) {
      fprintf(stderr, "memory error %s:%d\n", __FILE__, __LINE__);
      return NULL;
   }

   for (size_t i = 0 ; i < n ; i++) {
      ctrl[i] = ChIP->y[0+i*r];
   }

   zinb_par_t *z = mle_zinb(ctrl, n);
   // The profile is no longer needed, whether the fit worked or not.
   free(ctrl);
   if (z == NULL) {
      fprintf(stderr, "jahmm failure %s:%d\n", __FILE__, __LINE__);
      apologize();
      return NULL;
   }
   // NOTE(review): 'z' is never released in this function. How it
   // must be released depends on 'mle_zinb()', which is not visible
   // here -- confirm and free it on every path below.

   // Allocate every heap buffer up front, so that no allocation can
   // fail once the 'jahmm_t' exists. 'Q' and 'p' hold the initial
   // values and are reused as scratch space to reorder the states.
   double *Q = malloc(m*m * sizeof *Q);
   double *p = malloc(m*(r+1) * sizeof *p);
   int *path = malloc(n * sizeof *path);
   if (Q == NULL || p == NULL || path == NULL) {
      fprintf(stderr, "memory error: %s:%d\n", __FILE__, __LINE__);
      free(Q);
      free(p);
      free(path);
      return NULL;
   }

   // Set initial values of 'Q'.
   for (size_t i = 0 ; i < m ; i++) {
   for (size_t j = 0 ; j < m ; j++) {
      Q[i+j*m] = (i==j) ? .95 : .05/(m-1);
   }
   }

   // Set initial values of 'p'. They are not normalized, but the
   // call to 'bw_zinm' will normalize them.
   for (size_t i = 0 ; i < m ; i++) {
      p[0+i*(r+1)] = z->p;
      p[1+i*(r+1)] = 1 - z->p;
      for (size_t j = 2 ; j < r+1 ; j++) {
         p[j+i*(r+1)] = i + 0.5;
      }
   }

   jahmm_t *jahmm = new_jahmm(m, ChIP);
   if (jahmm == NULL) {
      fprintf(stderr, "memory error %s:%d\n", __FILE__, __LINE__);
      free(Q);
      free(p);
      free(path);
      return NULL;
   }
   set_jahmm_par(jahmm, Q, z->a, z->pi, p);

   // Run the Baum-Welch algorithm.
   bw_zinm(jahmm);

   // Reorder the states in case they got scrambled.
   // We use the value of p0 as a sort key: 'map[i]' is the new index
   // of state 'i', i.e. the number of states that come before it when
   // sorting by decreasing p0 (ties are broken by state index).
   int map[3] = {0,1,2};
   unsigned int seen = 0;
   for (size_t i = 0 ; i < m ; i++) {
      const double key_i = jahmm->p[i*(r+1)];
      int rank = 0;
      for (size_t j = 0 ; j < m ; j++) {
         const double key_j = jahmm->p[j*(r+1)];
         if (key_j > key_i || (key_j == key_i && j < i)) rank++;
      }
      map[i] = rank;
      seen |= 1u << rank;
   }
   if (seen != 7u) {
      // The ranks are not a permutation (this can only happen if
      // some p0 is NaN). Leave the states as they are.
      map[0] = 0; map[1] = 1; map[2] = 2;
   }

   if (map[0] != 0 || map[1] != 1) {
      // The states have been scrambled. We need to reorder
      // 'Q', 'p', 'phi' and 'pem'.
      double *phi = jahmm->phi;
      double *pem = jahmm->pem;

      for (size_t j = 0 ; j < m ; j++) {
      for (size_t i = 0 ; i < m ; i++) {
         Q[map[i]+map[j]*m] = jahmm->Q[i+j*m];
      }
      }
      memcpy(jahmm->Q, Q, m*m * sizeof(double));

      for (size_t j = 0 ; j < m ; j++) {
         memcpy(p + map[j]*(r+1), jahmm->p + j*(r+1),
               (r+1) * sizeof(double));
      }
      memcpy(jahmm->p, p, m*(r+1) * sizeof(double));

      // 'phi' and 'pem' are reordered one observation at a time,
      // which only requires a buffer of 3 values.
      for (size_t i = 0 ; i < n ; i++) {
         double row[3];
         for (size_t j = 0 ; j < m ; j++) row[map[j]] = phi[j+i*m];
         memcpy(phi + i*m, row, sizeof(row));
         for (size_t j = 0 ; j < m ; j++) row[map[j]] = pem[j+i*m];
         memcpy(pem + i*m, row, sizeof(row));
      }
   }

   // The parameters now live in 'jahmm'.
   free(Q);
   free(p);

   // Run the Viterbi algorithm.
   double initp[3];
   double log_Q[9];
   for (size_t i = 0 ; i < m ; i++) initp[i] = log(1.0/m);
   for (size_t i = 0 ; i < m*m ; i++) log_Q[i] = log(jahmm->Q[i]);

   block_viterbi(m, ChIP->nb, ChIP->size, log_Q, initp, jahmm->pem, path);

   jahmm->path = path;

   return jahmm;

}
// Fit the 3-state Zerone model to the observations of 'ChIP' and
// compute the Viterbi path.
//
// Steps: the first profile of 'ChIP->y' (the sum of mock controls) is
// fitted with 'mle_zinb()' to obtain starting values, the parameters
// are estimated with 'bw_zinm()', the states are relabeled with
// 'reorder()' so that state 0 has the highest p0 and state 2 the
// lowest, and the path computed by 'block_viterbi()' is stored in
// 'Z->path'.
//
// At most 63 profiles are supported (the local parameter buffer has
// room for 3*64 values).
//
// Returns the new 'zerone_t', or NULL on failure.
zerone_t * do_zerone ( ChIP_t * ChIP ) {

   // The number of state in Zerone is an important constant.
   // So much depends on it that it may be frozen in the code.
   const unsigned int m = 3;

   int        * mock = NULL; // The mock ChIP profile.
   int        * path = NULL; // The Viterbi path.
   zinb_par_t * par  = NULL; // The parameter estimates.
   zerone_t   * Z    = NULL; // The Zerone instance.

   // Extract the dimensions of the observations.
   const unsigned int r = ChIP->r;
   const unsigned int n = nobs(ChIP);

   if (r > 63) {
      fprintf(stderr, "maximum number of profiles exceeded\n");
      goto clean_and_return;
   }

   // Extract the first ChIP profile (the sum of mock controls).
   mock = malloc(n * sizeof *mock);
   if (mock == NULL) {
      fprintf(stderr, "memory error %s:%d\n", __FILE__, __LINE__);
      goto clean_and_return;
   }

   // Copy data to 'mock'.
   for (size_t i = 0 ; i < n ; i++) {
      mock[i] = ChIP->y[0+i*r];
   }

   par = mle_zinb(mock, n);
   if (par == NULL) {
      // TODO: Change parametrization so that failure does not happen.
      fprintf(stderr, "zerone failure %s:%d\n", __FILE__, __LINE__);
      apologize();
      goto clean_and_return;
   }

   free(mock);
   mock = NULL;

   // Allocate the Viterbi path before the Zerone instance is created,
   // so that a memory error cannot leave a half-initialized 'Z'
   // (without path) in the hands of the caller.
   path = malloc(n * sizeof *path);
   if (path == NULL) {
      fprintf(stderr, "memory error %s:%d\n", __FILE__, __LINE__);
      goto clean_and_return;
   }

   double Q[9] = {0};
   double p[3*64] = {0};

   // Set initial values of 'Q'.
   for (size_t i = 0 ; i < 3 ; i++) {
   for (size_t j = 0 ; j < 3 ; j++) {
      Q[i+j*3] = (i==j) ? .95 : .025;
   }
   }

   // Set initial values of 'p'. They are not normalized,
   // but the call to 'bw_zinm' will normalize them.
   for (size_t i = 0 ; i < 3 ; i++) {
      p[0+i*(r+1)] = par->p;
      p[1+i*(r+1)] = 1 - par->p;
      for (size_t j = 2 ; j < r+1 ; j++) {
         p[j+i*(r+1)] = i + 0.5;
      }
   }

   Z = new_zerone(3, ChIP);

   if (Z == NULL) {
      // TODO: handle this case properly.
      fprintf(stderr, "error in function '%s()' %s:%d\n",
            __func__, __FILE__, __LINE__);
      goto clean_and_return;
   }

   set_zerone_par(Z, Q, par->a, par->pi, p);

   // Run the Baum-Welch algorithm.
   bw_zinm(Z);

   // Reorder the states in case they got scrambled.
   // We use the value of p0 as a sort key (high p0
   // means low average signal and vice versa).
   // 'map[0]' is the state with the highest p0 and
   // 'map[2]' the state with the lowest p0.
   int map[3] = {0,1,2};
   for (int i = 1 ; i < 3 ; i++) {
      if (Z->p[i*(r+1)] > Z->p[map[0]*(r+1)]) map[0] = i;
      if (Z->p[(i-1)*(r+1)] < Z->p[map[2]*(r+1)]) map[2] = i-1;
   }
   // The middle state is the remaining one.
   map[1] = 3-map[0]-map[2];

   debug_print("map: [%d, %d, %d]\n", map[0], map[1], map[2]);

   if (map[0] != 0 || map[1] != 1 || map[2] != 2) reorder(Z, map);

   // Run the Viterbi algorithm.
   double log_Q[9] = {0};
   double initp[3] = {0};

   for (size_t i = 0 ; i < 3 ; i++) initp[i] = log(1.0/3);
   for (size_t i = 0 ; i < 9 ; i++) log_Q[i] = log(Z->Q[i]);

   // Find Viterbi path.
   block_viterbi(m, ChIP->nb, ChIP->sz, log_Q, initp, Z->pem, path);

   Z->path = path;

clean_and_return:
   // 'mock' is NULL unless the fit failed ('free(NULL)' is a no-op).
   free(mock);
   // 'path' now belongs to 'Z' if the function succeeded.
   if (Z == NULL) free(path);
   // NOTE(review): 'par' is never released. How it must be released
   // depends on 'mle_zinb()', which is not visible here -- confirm
   // and free it in this section.
   return Z;

}
int main(int argc, char **argv) { debug_print("%s (DEBUG)\n", VERSION); if (argc == 1) { say_usage(); exit(EXIT_SUCCESS); } // Input file names (mock and ChIP). char *mock_fnames[MAXNARGS+1] = {0}; char *ChIP_fnames[MAXNARGS+1] = {0}; int n_mock_files = 0; int n_ChIP_files = 0; int no_mock_specified = 1; int no_ChIP_specified = 1; static int list_flag = 0; static int minmapq = 20; static int window = 300; static int mock_flag = 1; static double minconf = 0.0; // Needed to check 'strtoul()'. char *endptr; // Parse options. debug_print("%s", "arguments:'\n"); while(1) { int option_index = 0; static struct option long_options[] = { {"chip", required_argument, 0, '1'}, {"confidence", required_argument, 0, 'c'}, {"help", no_argument, 0, 'h'}, {"list-output", no_argument, &list_flag, 1 }, {"mock", required_argument, 0, '0'}, {"no-mock", no_argument, &mock_flag, 0 }, {"quality", required_argument, 0, 'q'}, {"version", no_argument, 0, 'v'}, {"window", required_argument, 0, 'w'}, {0, 0, 0, 0} }; int c = getopt_long(argc, argv, "0:1:c:hlq:vw:", long_options, &option_index); // Done parsing named options. 
// if (c == -1) break; switch (c) { case 0: break; case '0': debug_print("| mock files(s): %s\n", optarg); parse_fname(mock_fnames, optarg, &n_mock_files); no_mock_specified = 0; break; case '1': debug_print("| ChIP files(s): %s\n", optarg); parse_fname(ChIP_fnames, optarg, &n_ChIP_files); no_ChIP_specified = 0; break; case 'h': say_usage(); return EXIT_SUCCESS; case 'l': list_flag = 1; break; case 'c': // Decode argument with 'strtod()' errno = 0; endptr = NULL; minconf = strtod(optarg, &endptr); if (!check_strtoX(optarg, endptr) || minconf < 0 || minconf > 1) { fprintf(stderr, "zerone error: confidence must be " "a float between 0 and 1\n"); say_usage(); return EXIT_FAILURE; } debug_print("| minconf: %f\n", minconf); break; case 'q': // Decode argument with 'strtoul()' errno = 0; endptr = NULL; minmapq = strtoul(optarg, &endptr, 10); if (!check_strtoX(optarg, endptr) || minmapq < 0 || minmapq > 254) { fprintf(stderr, "zerone error: minimum mapping quality must be " "an integer between 0 and 254\n"); say_usage(); return EXIT_FAILURE; } debug_print("| minmapq: %d\n", minmapq); break; case 'v': say_version(); return EXIT_SUCCESS; case 'w': window = atoi(optarg); if (window <= 0) { fprintf(stderr, "zerone error: window must be a " "positive integer\n"); say_usage(); return EXIT_FAILURE; } debug_print("| window: %d\n", window); break; default: // Cannot parse. // say_usage(); return EXIT_FAILURE; } } // Now parse positional arguments (file names). while (optind < argc) { parse_fname(ChIP_fnames, argv[optind++], &n_ChIP_files); } debug_print("%s", "done parsing arguments\n"); // Check options. if (no_mock_specified && mock_flag) { fprintf(stderr, "zerone error: specify a file for mock control experiment\n"); say_usage(); return EXIT_FAILURE; } if (no_ChIP_specified) { fprintf(stderr, "zerone error: specify a file for ChIP-seq experiment\n"); say_usage(); return EXIT_FAILURE; } // Process input files. 
zerone_parser_args_t args; args.window = window; args.minmapq = minmapq; ChIP_t *ChIP = parse_input_files(mock_fnames, ChIP_fnames, args); if (ChIP == NULL) { fprintf(stderr, "error while reading input\n"); exit(EXIT_FAILURE); } // debug info // { debug_print("%s", "done reading input files\n"); debug_print("%s", "ChIP:\n"); debug_print("| r = %ld (dimension)\n", ChIP->r); debug_print("| nb = %d (block number)\n", ChIP->nb); for (int j = 0 ; j < ChIP->nb ; j++) { debug_print("| block %s (size: %d)\n", ChIP->nm + 32*j, ChIP->sz[j]); } // Sum reads of all blocks. size_t *nreads = calloc(ChIP->r, sizeof(size_t)); if (nreads == NULL) { fprintf(stderr, "memory error\n"); exit(EXIT_FAILURE); } for (int i = 0 ; i < nobs(ChIP) ; i++) { for (int j = 0 ; j < ChIP->r ; j++) { nreads[j] += ChIP->y[j + i*ChIP->r]; } } debug_print("| aggregated mock: %ld reads\n", nreads[0]); for (int j = 0 ; j < ChIP->r-1 ; j++) { debug_print("| %s: %ld reads\n", ChIP_fnames[j], nreads[j+1]); } free(nreads); } // Do zerone. debug_print("%s", "starting zerone\n"); zerone_t *Z = do_zerone(ChIP); if (Z == NULL) { fprintf(stderr, "run time error (sorry)\n"); exit(EXIT_FAILURE); } // debug info // { debug_print("%s", "Q:\n"); debug_print("%.3f %.3f %.3f\n", Z->Q[0], Z->Q[3], Z->Q[6]); debug_print("%.3f %.3f %.3f\n", Z->Q[1], Z->Q[4], Z->Q[7]); debug_print("%.3f %.3f %.3f\n", Z->Q[2], Z->Q[5], Z->Q[8]); debug_print("%s", "p:\n"); for (int j = 0 ; j < 3 ; j++) { int off = 0; char debuf[512]; for (int i = 0 ; i < Z->r+1 ; i++) { off += sprintf(debuf + off, "%.3f ", Z->p[i+j*(Z->r+1)]); if (off > 499) break; } debug_print("%s\n", debuf); } } // Quality control. double feat[5]; double QC = zerone_qc(Z, feat); fprintf(stdout, "# QC score: %.3f\n", QC); fprintf(stdout, "# features: %.3f, %.3f, %.3f, %.3f, %.3f\n", feat[0], feat[1], feat[2], feat[3], feat[4]); fprintf(stdout, "# advice: %s discretization.\n", QC >= 0 ? "accept" : "reject"); // List output. 
if (list_flag) { int wid = 0; int target = 0; double best = 0.0; for (int i = 0 ; i < ChIP->nb ; i++) { char *name = ChIP->nm + 32*i; // Do not print the last bin because it may extend // beyond the limit of the chromosome. for (int j = 0 ; j < ChIP->sz[i]-1 ; j++) { // Toggle on target state. double conf = Z->phi[2+wid*3]; if (!target && Z->path[wid] == 2 && conf > minconf) { fprintf(stdout, "%s\t%d\t", name, window*j + 1); best = conf; target = 1; } // Toggle off target state. else if (target) { // Update best score. if (conf > best) best = conf; if (Z->path[wid] != 2 || conf < minconf) { fprintf(stdout, "%d\t%.5f\n", window*(j+1), best); best = 0.0; target = 0; } } wid++; } // In case the end of the block is a target. if (target) { fprintf(stdout, "%d\t%.5f\n", window * ChIP->sz[i], best); best = 0.0; target = 0; } } } // Table output. else { // Use 'offset' to navigate in the ChIP blocks. uint64_t offset = 0; // In case no mock was provided, skip the column. const int skipmock = mock_flag ? 0 : 1; for (int i = 0 ; i < ChIP->nb ; i++) { char *name = ChIP->nm + 32*i; // Do not print the last bin because it may extend // beyond the limit of the chromosome. for (int j = 0 ; j < ChIP->sz[i]-1 ; j++) { // Skip if 'confidence' too low. if (Z->phi[2+(offset+j)*3] < minconf) continue; fprintf(stdout, "%s\t%d\t%d\t%d", name, window*j + 1, // Block name, window start, end, state. window*(j+1), Z->path[offset+j] == 2 ? 1 : 0); for (int k = skipmock ; k < Z->ChIP->r ; k++) { fprintf(stdout, "\t%d", // Read numbers of each file. Z->ChIP->y[(offset+j)*Z->ChIP->r+k]); } fprintf(stdout, "\t%.5f\n", // Confidence score. Z->phi[2+(offset+j)*3]); } // End of the block. Update 'offset' before // local window number is reset to 0. offset += Z->ChIP->sz[i]; } } destroy_zerone_all(Z); // Also frees ChIP. for (int i = 0 ; i < MAXNARGS ; i++) { free(mock_fnames[i]); free(ChIP_fnames[i]); } return 0; }