Ejemplo n.º 1
0
int main(int argc, char *argv[])
{
    gzFile fp;
    kstream_t *ks;
    kstring_t s, t[N_TMPSTR];
    int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
    long m_cigar = 0, n_cigar = 0;
    unsigned *af, *cigar = 0;

    while ((c = getopt(argc, argv, "pc")) >= 0) {
        switch (c) {
            case 'p': is_padded = 1; break;
            case 'c': write_cns = 1; break;
        }
    }
    if (argc == optind) {
        fprintf(stderr, "\nUsage:   ace2sam [-pc] <in.ace>\n\n");
        fprintf(stderr, "Options: -p     output padded SAM\n");
        fprintf(stderr, "         -c     write the contig sequence in SAM\n\n");
        fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
        fprintf(stderr, "       2. The order of reads in AF and in RD must be identical\n");
        fprintf(stderr, "       3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
        fprintf(stderr, "       4. This program writes the headerless SAM to stdout and header to stderr\n\n");
        return 1;
    }

    s.l = s.m = 0; s.s = 0;
    af_n = af_max = af_i = 0; af = 0;
    for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
    fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
    ks = ks_init(fp);
    while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
        if (strcmp(s.s, "CO") == 0) { // contig sequence
            kstring_t *cns;
            t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
            af_n = af_i = 0; // reset the af array
            ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
            ks_getuntil(ks, '\n', &s, &dret); // read the whole line
            while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
            remove_pads(&t[1], &t[2]); // construct the unpadded sequence
            // compute the array for mapping padded positions to unpadded positions
            p2u = realloc(p2u, t[1].m * sizeof(int));
            for (i = k = 0; i < t[1].l; ++i) {
                p2u[i] = k;
                if (t[1].s[i] != '*') ++k;
            }
            // write out the SAM header and contig sequences
            fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
            cns = &t[is_padded?1:2];
            fprintf(stderr, "S >%s\n", t[0].s);
            for (i = 0; i < cns->l; i += LINE_LEN) {
                fputs("S ", stderr);
                for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
                    fputc(cns->s[i + k], stderr);
                fputc('\n', stderr);
            }

#define __padded2cigar(sp) do { \
        int i, l_M = 0, l_D = 0; \
        for (i = 0; i < sp.l; ++i) { \
            if (sp.s[i] == '*') { \
                if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
                ++l_D; l_M = 0; \
            } else { \
                if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
                ++l_M; l_D = 0; \
            } \
        } \
        if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
        else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
    } while (0)

            if (write_cns) { // write the consensus SAM line (dummy read)
                n_cigar = 0;
                if (is_padded) __padded2cigar(t[1]);
                else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
                kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
                for (i = 0; i < n_cigar; ++i) {
                    kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
                }
                kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
            }
        } else if (strcmp(s.s, "BQ") == 0) { // contig quality
Ejemplo n.º 2
0
/*
 * Iterates through all sequences in a contig realigning them against the
 * consensus vector.
 *
 * Each sequence is, in turn, removed from the multiple alignment, aligned
 * against a window of the remaining consensus/score vectors, edited according
 * to that alignment and then added back. Once every sequence has been
 * processed the list is re-sorted and remove_pads() is called on the result.
 *
 * To do this we may need to shuffle the start position of sequences
 * downstream, and hence also move consensus tags.
 *
 * Arguments:
 *	contig	Not used by this function; kept so existing callers still
 *		compile.
 *	malign	The multiple alignment to realign; modified in place.
 *	band	Passed to set_align_params() as the band size; also
 *		determines how far (band/2+1 columns) the consensus window
 *		extends beyond each end of the sequence.
 *
 * Returns:
 *	The same 'malign' pointer that was passed in.
 */
MALIGN *realign_seqs(int contig, MALIGN *malign, int band) {
    CONTIGL *lastl = NULL, *contigl;
    int r;
    int old_start, old_end, new_start, new_end;
    /* Number of extra consensus columns given to either side of a sequence */
    int overhang = band/2+1;

    /* Loop through all sequences in the contig */
    contigl = malign->contigl;
    while (contigl) {
	int len;
	MOVERLAP *o;
	ALIGN_PARAMS *p;
	int cons_pos;
	int npads;
	int orig_offset;

	/*
	 * Remember the sequence length and its starting offset. The offset
	 * is moved below to match the alignment window and needs restoring
	 * if the alignment cannot be applied.
	 */
	len = contigl->mseg->length;
	orig_offset = contigl->mseg->offset;


	/* Remove sequence from malign */
	malign_remove_contigl(malign, lastl, contigl);


	/* Align sequence to malign */
	p = create_align_params();
	set_align_params (p,
			  band,
			  8, /*gap_open*/
			  8, /*gap_extend*/
			  /* EDGE_GAPS_COUNT, */
			  EDGE_GAPS_ZEROX | BEST_EDGE_TRACE,
			  RETURN_EDIT_BUFFERS | RETURN_SEQ |
			  RETURN_NEW_PADS,
			  0,  /*seq1_start*/
			  0,  /*seq2_start*/
			  0,  /*old pad sym*/
			  0,  /*new pad sym*/
			  0   /*set_job*/);

	o = create_moverlap();
	init_moverlap(o, malign, contigl->mseg->seq, malign->length, len);

	cons_pos = contigl->mseg->offset;
	o->malign_len = malign->length - cons_pos;

	/* Allow 'overhang' columns beyond the right end of the sequence */
	if (o->malign_len > contigl->mseg->length+overhang)
	    o->malign_len = contigl->mseg->length+overhang;

	/* And 'overhang' to the left, clipped at the consensus start */
	if (cons_pos > overhang) {
	    cons_pos -= overhang;
	    o->malign_len += overhang;
	    contigl->mseg->offset -= overhang;
	} else {
	    o->malign_len += cons_pos;
	    contigl->mseg->offset -= cons_pos;
	    cons_pos = 0;
	}

	{
	    char *old_cons   = malign->consensus;
	    int **old_scores = malign->scores;
	    int **old_counts = malign->counts;

	    /*
	     * Temporarily advance the vectors so the aligner sees the
	     * window starting at cons_pos as position zero.
	     */
	    malign->consensus += cons_pos;
	    malign->counts    += cons_pos;
	    malign->scores    += cons_pos;

	    /* fixed_malign(o, p); */
	    r = realigner_malign(o, p); /* o->score = alignment score */
	    
	    /*
	    if (!r)
		print_moverlap(malign, o, cons_pos);
	    else
		puts("FAILED");
	    */

	    malign->consensus = old_cons;
	    malign->counts    = old_counts;
	    malign->scores    = old_scores;
	}

	/* Edit the sequence with the alignment */
	old_start = contigl->mseg->offset;
	old_end   = contigl->mseg->offset + contigl->mseg->length-1;
	if (r == 0 && o->S1) {
	    npads = edit_mseqs(malign, contigl, o, cons_pos);
	} else {
	    /*
	     * No usable alignment. Undo the window shift applied to the
	     * offset above so that the sequence goes back exactly where it
	     * came from rather than up to 'overhang' columns to the left.
	     */
	    contigl->mseg->offset = orig_offset;
	    npads = 0;
	}
	new_start = contigl->mseg->offset;
	new_end   = contigl->mseg->offset + contigl->mseg->length-1;


	/* Put sequence back */
	malign_add_contigl(malign, lastl, contigl);


	/* Update the malign structure */
	if (npads > 0) {
	    malign_recalc_scores(malign,
				 MIN(old_start, new_start),
				 MAX(old_end, new_end));
	}
	    
	/* TODO:
	 *
	 * X Realloc malign->consensus / malign->score
	 * X Move malign->consensus from here to end right by npads.
	 * X Move malign->score      " ...
	 * X Update malign->length
	 * X Recompute consensus and score over the length of this reading.
	 *
	 * If contigl was doubly linked (sorted on left and right ends
	 * separately) then we could chain left/right to only update
	 * those readings which overlap this region. For now we can
	 * just chain from left each time.  Not optimal (O(N^2) for
	 * full realignment method then) but workable perhaps.
	 *
	 * See get_malign_counts, scale_malign_scores and get_malign_consensus
	 */


	/*
	 * Check if the short-cut method gives the same result as rebuilding
	 * from scratch.
	 */
#if 0
	{
	    int i, j;
	    MALIGN *copy;
	    copy = contigl_to_malign(malign->contigl, -4, -4);

	    for (i = 0; i < copy->length; i++) {
		for (j = 0; j < copy->charset_size+2; j++) {
		    if (copy->scores[i][j] != malign->scores[i][j]) {
			printf("[%d][%d] = %d (should be %d)\n",
			       i, j,
			       malign->scores[i][j],
			       copy->scores[i][j]);
		    }
		}
	    }
	    copy->contigl = NULL;
	    destroy_malign(copy, 0);
	}
#endif

	destroy_moverlap(o);
	destroy_alignment_params(p); 

	lastl = contigl;
	contigl = contigl->next;
    }

    resort_contigl(malign);
    remove_pads(malign);

    return malign;
}