int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; kstring_t s, t[N_TMPSTR]; int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; long m_cigar = 0, n_cigar = 0; unsigned *af, *cigar = 0; while ((c = getopt(argc, argv, "pc")) >= 0) { switch (c) { case 'p': is_padded = 1; break; case 'c': write_cns = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); fprintf(stderr, "Options: -p output padded SAM\n"); fprintf(stderr, " -c write the contig sequence in SAM\n\n"); fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); return 1; } s.l = s.m = 0; s.s = 0; af_n = af_max = af_i = 0; af = 0; for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; fp = strcmp(argv[1], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { if (strcmp(s.s, "CO") == 0) { // contig sequence kstring_t *cns; t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line af_n = af_i = 0; // reset the af array ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name ks_getuntil(ks, '\n', &s, &dret); // read the whole line while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence remove_pads(&t[1], &t[2]); // construct the unpadded sequence // compute the array for mapping padded positions to unpadded positions p2u = realloc(p2u, t[1].m * sizeof(int)); for (i = k = 0; i < t[1].l; ++i) { p2u[i] = k; if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { fputs("S ", stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) fputc(cns->s[i + k], stderr); fputc('\n', stderr); } #define __padded2cigar(sp) do { \ int i, l_M = 0, l_D = 0; \ for (i = 0; i < sp.l; ++i) { \ if (sp.s[i] == '*') { \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ ++l_D; l_M = 0; \ } else { \ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ ++l_M; l_D = 0; \ } \ } \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ } while (0) if (write_cns) { // write the consensus SAM line (dummy read) n_cigar = 0; if (is_padded) __padded2cigar(t[1]); else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]); for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &t[4]); 
kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); } kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); } } else if (strcmp(s.s, "BQ") == 0) { // contig quality
/*
 * Iterates through all sequences in a contig, realigning each one against
 * the consensus vector.
 *
 * Each sequence is removed from the multiple alignment, aligned against a
 * window of the remaining consensus, edited according to that alignment and
 * then added back. Where the edit introduced pads, the scores are recomputed
 * over the union of the sequence's old and new extents.
 * To do this we may need to shuffle the start position of sequences
 * downstream, and hence also move consensus tags.
 *
 * contig  Not referenced by this function; kept for interface compatibility.
 * malign  The multiple alignment to process; modified in place.
 * band    Alignment band size. It is passed to set_align_params() and also
 *         determines how far (band/2+1 bases) the consensus window extends
 *         beyond each end of the sequence.
 *
 * Returns the same 'malign' pointer that was passed in.
 */
MALIGN *realign_seqs(int contig, MALIGN *malign, int band) {
    CONTIGL *lastl = NULL, *contigl;
    int r;
    int old_start, old_end, new_start, new_end;
    /* Number of consensus bases allowed to overhang each end of a sequence */
    const int overhang = band / 2 + 1;

    (void)contig; /* unused */

    /* Loop through all sequences in the contig */
    contigl = malign->contigl;
    while (contigl) {
        int len;
        MOVERLAP *o;
        ALIGN_PARAMS *p;
        int cons_pos;
        int npads;

        /* Note the sequence length before it is taken out of the malign */
        len = contigl->mseg->length;

        /* Remove sequence from malign */
        malign_remove_contigl(malign, lastl, contigl);

        /* Align sequence to malign */
        p = create_align_params();
        set_align_params(p, band,
                         8, /*gap_open*/
                         8, /*gap_extend*/
                         /* EDGE_GAPS_COUNT, */
                         EDGE_GAPS_ZEROX | BEST_EDGE_TRACE,
                         RETURN_EDIT_BUFFERS | RETURN_SEQ | RETURN_NEW_PADS,
                         0, /*seq1_start*/
                         0, /*seq2_start*/
                         0, /*old pad sym*/
                         0, /*new pad sym*/
                         0  /*set_job*/);

        o = create_moverlap();
        init_moverlap(o, malign, contigl->mseg->seq, malign->length, len);

        /*
         * Restrict the alignment to a window of the consensus starting at
         * cons_pos and running for o->malign_len bases.
         */
        cons_pos = contigl->mseg->offset;
        o->malign_len = malign->length - cons_pos;
#if 1
        /* Allow at most 'overhang' bases of consensus to the right */
        if (o->malign_len > contigl->mseg->length + overhang)
            o->malign_len = contigl->mseg->length + overhang;

        /* And the same to the left, clipped at the start of the consensus */
        if (cons_pos > overhang) {
            cons_pos -= overhang;
            o->malign_len += overhang;
            contigl->mseg->offset -= overhang;
        } else {
            o->malign_len += cons_pos;
            contigl->mseg->offset -= cons_pos;
            cons_pos = 0;
        }
#else
        if (o->malign_len > contigl->mseg->length)
            o->malign_len = contigl->mseg->length;
#endif

        {
            /*
             * Temporarily advance the consensus/counts/scores pointers so
             * that the aligner sees the window starting at cons_pos, then
             * restore them afterwards.
             */
            char *old_cons = malign->consensus;
            int **old_scores = malign->scores;
            int **old_counts = malign->counts;

            malign->consensus += cons_pos;
            malign->counts += cons_pos;
            malign->scores += cons_pos;

            /* fixed_malign(o, p); */
            r = realigner_malign(o, p); /* o->score = alignment score */

            /*
            if (!r)
                print_moverlap(malign, o, cons_pos);
            else
                puts("FAILED");
            */

            malign->consensus = old_cons;
            malign->counts = old_counts;
            malign->scores = old_scores;
        }

        /* Edit the sequence with the alignment */
        old_start = contigl->mseg->offset;
        old_end = contigl->mseg->offset + contigl->mseg->length - 1;

        /*
         * NOTE(review): mseg->offset was shifted left above and is not
         * restored here when the alignment fails (r != 0 or no o->S1).
         * Presumably edit_mseqs() re-derives the offset on success - confirm
         * that the failure path leaves the sequence correctly positioned.
         */
        if (r == 0 && o->S1)
            npads = edit_mseqs(malign, contigl, o, cons_pos);
        else
            npads = 0;

        new_start = contigl->mseg->offset;
        new_end = contigl->mseg->offset + contigl->mseg->length - 1;

        /* Put sequence back */
        malign_add_contigl(malign, lastl, contigl);

        /* Update the malign structure over the old and new extents */
        if (npads > 0) {
            malign_recalc_scores(malign,
                                 MIN(old_start, new_start),
                                 MAX(old_end, new_end));
        }

        /* TODO:
         *
         * X Realloc malign->consensus / malign->score
         * X Move malign->consensus from here to end right by npads.
         * X Move malign->score  "  ...
         * X Update malign->length
         * X Recompute consensus and score over the length of this reading.
         *
         * If contigl was doubly linked (sorted on left and right ends
         * separately) then we could chain left/right to only update
         * those readings which overlap this region. For now we can
         * just chain from left each time. Not optimal (O(N^2) for
         * full realignment method then) but workable perhaps.
         *
         * See get_malign_counts, scale_malign_scores and get_malign_consensus
         */

        /*
         * Check if the short-cut method gives the same result as rebuilding
         * from scratch.
         */
#if 0
        {
            int i, j;
            MALIGN *copy;

            copy = contigl_to_malign(malign->contigl, -4, -4);
            for (i = 0; i < copy->length; i++) {
                for (j = 0; j < copy->charset_size+2; j++) {
                    if (copy->scores[i][j] != malign->scores[i][j]) {
                        printf("[%d][%d] = %d (should be %d)\n",
                               i, j,
                               malign->scores[i][j],
                               copy->scores[i][j]);
                    }
                }
            }
            copy->contigl = NULL;
            destroy_malign(copy, 0);
        }
#endif

        destroy_moverlap(o);
        destroy_alignment_params(p);

        lastl = contigl;
        contigl = contigl->next;
    }

    resort_contigl(malign);
    remove_pads(malign);

    return malign;
}