コード例 #1
0
/*
 * Exports Scaffold information to an AGP file.
 *
 * For every scaffold listed in io->scaffold this writes one "W" (component)
 * line per member contig, preceded by an "N" (gap) line of m->gap_size
 * bases for every member except the first. Object coordinates and part
 * numbers both start at 1 for each scaffold.
 *
 * io   Database handle holding the scaffolds.
 * fn   Output filename; opened with mode "w+".
 *
 * Returns 0 on success
 *        -1 on failure (the reason is reported through verror)
 */
int scaffold_to_agp(GapIO *io, char *fn) {
    FILE *fp;
    int i, j;

    if (NULL == (fp = fopen(fn, "w+"))) {
	verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
	return -1;
    }

    for (i = 0; io->scaffold && i < ArrayMax(io->scaffold); i++) {
	scaffold_t *f = cache_search(io, GT_Scaffold,
				     arr(tg_rec, io->scaffold, i));
	int start = 1;	/* Next free coordinate within this scaffold */
	int k = 1;	/* AGP part number, counts gaps and contigs */

	if (!f) {
	    verror(ERR_WARN, "scaffold_to_agp", "Failed to load scaffold\n");
	    fclose(fp);
	    return -1;
	}

	/* Keep the scaffold referenced while we make further cache calls */
	cache_incr(io, f);

	for (j = 0; f->contig && j < ArrayMax(f->contig); j++) {
	    scaffold_member_t *m = arrp(scaffold_member_t, f->contig, j);
	    contig_t *c = cache_search(io, GT_Contig, m->rec);
	    int ustart, uend;
	    int len;

	    if (!c) {
		verror(ERR_WARN, "scaffold_to_agp",
		       "Failed to load contig\n");
		cache_decr(io, f);
		fclose(fp);
		return -1;
	    }

	    /*
	     * c->name is read after the consensus_* calls below, so hold a
	     * reference on the contig for the same reason as the scaffold.
	     */
	    cache_incr(io, c);

	    /* Get the unpadded clipped contig length */
	    if (consensus_valid_range(io, m->rec, &ustart, &uend) != 0) {
		verror(ERR_WARN, "scaffold_to_agp",
		       "Failure in consensus_valid_range()\n");
		cache_decr(io, c);
		cache_decr(io, f);
		fclose(fp);
		return -1;
	    }
	    /*
	     * NOTE(review): only uend is converted to an unpadded position;
	     * ustart is used as returned above - confirm this is intended.
	     */
	    consensus_unpadded_pos(io, m->rec, uend, &uend);
	    len = uend - ustart + 1;

	    if (j) {
		/* Gap line between the previous member and this one */
		int gap = m->gap_size;
		fprintf(fp, "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n",
			f->name, start, start+gap-1, k++, gap);
		start += gap;
	    }
	    fprintf(fp, "%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n",
		    f->name, start, start + len-1,
		    k++, c->name, ustart, uend);
	    start += len;

	    cache_decr(io, c);
	}

	cache_decr(io, f);
    }

    /* fclose flushes buffered output, so a failure here means data loss */
    if (0 != fclose(fp)) {
	verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
	return -1;
    }

    return 0;
}
コード例 #2
0
/*
 * Realigns ("shuffles pads" in) each contig listed in contigs[].
 *
 * For every entry the contig is temporarily moved so that its first valid
 * consensus base is at position 1, a multiple alignment is built over
 * contigs[i].start..end, and realign_seqs() is repeated for as long as the
 * malign_diffs() score keeps decreasing. If the final score is better than
 * the initial one the result is written back with update_io() and pad
 * columns are removed. The contig is then moved back to where it was.
 *
 * io        Database handle.
 * ncontigs  Number of entries in contigs[].
 * contigs   Contig record plus the start/end range to process. Note that
 *           start/end are modified (shifted) for entries whose score
 *           improved; see the NOTE(review) below.
 * band      Passed through to realign_seqs().
 * flush     If non-zero, UpdateTextOutput() is called after each score
 *           message and cache_flush() after each contig.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int shuffle_contigs_io(GapIO *io, int ncontigs, contig_list_t *contigs,
		       int band, int flush) {
    int i;
    int ret = -1;
    Array indels;

    set_malign_lookup(5);

    indels = ArrayCreate(sizeof(con_indel_t), 0);
    if (NULL == indels) {
	verror(ERR_WARN, "shuffle_contigs_io", "Failure in ArrayCreate()");
	return -1;
    }

    for (i = 0; i < ncontigs; i++) {
	tg_rec cnum = contigs[i].contig;
	int64_t old_score, new_score, tot_score, orig_score;
	MALIGN *malign;
	int c_start, c_shift;

	vmessage("Shuffling pads for contig %s\n", get_contig_name(io, cnum));

	/*
	 * The shuffle pads code (malign) comes from gap4 and has lots of
	 * assumptions that the contig goes from base 1 to base N.
	 * Fixing these assumptions is a lot of work, so for now we will take
	 * the cheat route of moving the contig to ensure the assumption
	 * is valid.
	 */
	if (-1 == consensus_valid_range(io, cnum, &c_start, NULL)) {
	    verror(ERR_WARN, "shuffle_contigs_io",
		   "Failure in consensus_valid_range()");
	    goto cleanup;
	}
	c_shift = 1-c_start;
	if (c_shift != 0) {
	    if (move_contig(io, cnum, c_shift) != 0)
		goto cleanup;
	}

	malign = build_malign(io, cnum,
			      contigs[i].start + c_shift,
			      contigs[i].end   + c_shift);
	if (NULL == malign) {
	    verror(ERR_WARN, "shuffle_contigs_io",
		   "Failure in build_malign()");
	    /* Best effort: do not leave the contig at its shifted position */
	    if (c_shift != 0)
		(void)move_contig(io, cnum, -c_shift);
	    goto cleanup;
	}
	resort_contigl(malign);

	malign_add_region(malign,
			  contigs[i].start + c_shift,
			  contigs[i].end + c_shift);

	/* Reuse the same indel array for every contig; just empty it */
	ArrayMax(indels) = 0;
	orig_score = new_score = malign_diffs(malign, &tot_score);
	vmessage("Initial score %.2f%% mismatches (%"PRId64" mismatches)\n",
		 (100.0 * orig_score)/tot_score, orig_score/128);
	if (flush)
	    UpdateTextOutput();

	/* Keep realigning for as long as each pass lowers the score */
	do {
	    old_score = new_score;
	    malign = realign_seqs(cnum, malign, band, indels);
	    new_score = malign_diffs(malign, &tot_score);
	    vmessage("  Consensus difference score: %"PRId64"\n", new_score);
	    if (flush)
		UpdateTextOutput();
	} while (new_score < old_score);

	if (new_score < orig_score) {
	    update_io(io, cnum, malign, indels);

	    /*
	     * It's possible the contig ends could move if a sequence that
	     * was previously the end of a contig has been moved such that
	     * it's no longer the contig end. This can lead to tags off the
	     * end of the contig, so trim them (reusing break_contig
	     * code).
	     */
	    contig_visible_start(io, cnum, CITER_CSTART);
	    contig_visible_end(io, cnum, CITER_CEND);

	    /*
	     * Remove pad columns.
	     * NOTE(review): contigs[i].start/end are left in shifted
	     * coordinates even after the contig is moved back below -
	     * confirm callers do not reuse them.
	     */
	    contigs[i].start += c_shift;
	    contigs[i].end += c_shift;
	    remove_pad_columns(io, 1, &contigs[i], 100, 1);
	} else {
	    vmessage("Could not reduce number of consensus differences.\n");
	}

	destroy_malign(malign, 1);

	vmessage("Final score %.2f%% mismatches\n",
		 (100.0 * new_score)/tot_score);

	/*
	 * Sequences like
	 *   AGCT**GATGC
	 *             TGGATCGA
	 * can end up causing holes. Breaking the contig in this case
	 * (remove_contig_holes) is currently disabled.
	 */

	/* Shift contig back */
	if (c_shift != 0) {
	    if (move_contig(io, cnum, -c_shift) != 0)
		goto cleanup;
	}

	if (flush)
	    cache_flush(io);
    }

    ret = 0;

 cleanup:
    /* Single exit so that indels is released on error paths too */
    ArrayDestroy(indels);

    return ret;
}
コード例 #3
0
/*
 * Extends the right hand end of a single contig.
 *
 * The reads overlapping the last valid consensus base are collected. Reads
 * whose right-hand cutoff starts before the contig end are only used if the
 * cutoff is shorter than CSZ bases and disagrees with the consensus in at
 * most 5% of the bases up to the contig end. The remaining reads vote on up
 * to ESZ new consensus bases beyond the end, and each read's clip point is
 * then moved as far into that new consensus as its own match/mismatch
 * score justifies.
 *
 * dir is only used to label the progress message ("left" when non-zero);
 * the code below always works on the end reported by
 * consensus_valid_range(). Presumably the caller complements the contig to
 * process the other end - TODO confirm.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including "unable to extend")
 *        -1 on failure
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
				int match_score, int mismatch_score) {
    int end;
    rangec_t *r = NULL;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;
    int ret = -1;

    vmessage("Processing contig #%"PRIrec", %s end\n",
	     crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
	freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
	depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c) return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0) {
	cache_decr(io, c);
	return -1;
    }

    /* cons[] holds the last CSZ consensus bases; cons[CSZ-1] is at 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r) {
	cache_decr(io, c);
	return -1;
    }

    /* Pass 1: filter the reads and accumulate base counts past the end */
    for (i = 0; i < nr; i++) {
	seq_t *s = cache_search(io, GT_Seq, r[i].rec);
	seq_t *sorig = s;
	int cend;
	int j, k, slen;

	if (NULL == s)
	    goto done;

	/* Work on a complemented copy so s->seq is in contig orientation */
	if ((s->len < 0) ^ r[i].comp) {
	    s = dup_seq(s);
	    if (NULL == s)
		goto done;
	    complement_seq_t(s);
	}

	/* Contig position of the last base inside the right clip point */
	cend = r[i].start + s->right-1;

	/* Does cutoff extend to contig end, if so does it match cons? */
	if (cend < end) {
	    int mis = 0, len = 0;
	    if (end - cend >= CSZ) {
		/* Cutoff is longer than the consensus window we hold */
		if (sorig != s)
		    free(s);
		r[i].rec = 0; /* Mark for removal */
		continue;
	    }

	    for (k = s->right, j = cend+1; j <= end; j++, k++) {
		if (s->seq[k] != cons[j-(end-(CSZ-1))])
		    mis++;
	    }
	    len = end - cend;
	    if (100*mis/len > 5) {
		/* Too much disagreement with the existing consensus */
		if (sorig != s)
		    free(s);
		r[i].rec = 0;
		continue;
	    }
	}

	/* So we got here, let's accumulate extension stats */
	slen = ABS(s->len);
	for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
	    int b;

	    if(s->seq[j] == 'N')
		continue;

	    /*
	     * freqs[] only has room for four base types, so ignore any
	     * symbol that dna_lookup does not map to 0..3 rather than
	     * writing outside the row.
	     */
	    b = dna_lookup[(uint8_t) s->seq[j]];
	    if (b < 0 || b > 3)
		continue;

	    freqs[k][b]++;
	    depth[k]++;
	}

	if (sorig != s)
	    free(s);
    }

    /*
     * Call a new consensus base per column and keep a running score of
     * (agreeing - disagreeing) / depth. best_pos is the number of columns
     * up to and including the last one where the running score was at
     * its maximum.
     */
    score = best_score = 0;
    best_pos = 0;

    for (i = 0; i < ESZ; i++) {
	int call = 0, best = 0, others = 0, j;
	double dd;

	/* depth <= 0 would leave 'call' meaningless and divide by zero */
	if (depth[i] < min_depth || depth[i] <= 0)
	    break;

	for (j = 0; j < 4; j++) {
	    if (best < freqs[i][j]) {
		best = freqs[i][j];
		call = j;
	    }
	}
	new_cons[i] = "ACGT"[call];

	for (j = 0; j < 4; j++) {
	    if (j != call)
		others += freqs[i][j];
	}

	dd = (double)depth[i];
	score += freqs[i][call] / dd;
	score -= others / dd;

	if (best_score <= score) {
	    best_score = score;
	    best_pos = i+1;
	}
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
	int furthest_left = end;

	/* Pass 2: move the clip point of every read that survived pass 1 */
	for (i = 0; i < nr; i++) {
	    seq_t *s;
	    int r_pos;
	    int rscore;

	    if (r[i].rec == 0)
		continue;

	    s = cache_search(io, GT_Seq, r[i].rec);
	    if (NULL == s)
		goto done;
	    s = cache_rw(io, s);
	    if (NULL == s)
		goto done;

	    if (furthest_left > r[i].start)
		furthest_left = r[i].start;

	    /*
	     * end + best_pos is the furthest right we can go, but this
	     * specific read may not be justified in reaching that far
	     * if it has too many disagreements.
	     *
	     * NOTE(review): if no base matches, r_pos stays 0 and the clip
	     * becomes 0 (forward) or 2 (reverse) - confirm this is intended.
	     */
	    if ((s->len > 0) ^ r[i].comp) {
		/* Stored in contig orientation: walk right, move s->right */
		int best_r = 0, j, k;
		int len = ABS(s->len);

		r_pos = 0;
		rscore = 0;
		for (k = end - r[i].start + 1, j = 0; j < best_pos && k < len; j++, k++) {
		    /* toupper() needs an unsigned char value */
		    if (new_cons[j] == toupper((unsigned char) s->seq[k])) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k+1;
			}
		    } else {
			rscore += mismatch_score;
		    }
		}

		if (s->right != r_pos) {
		    s->right  = r_pos;
		    nseq++;
		}
	    } else {
		/* Stored complemented: walk left, move s->left */
		int best_r = 0, j, k;

		r_pos = 0;
		rscore = 0;
		for (k = r[i].end - end - 1, j = 0; j < best_pos && k >= 0; j++, k--) {
		    char b = complement_base(s->seq[k]);
		    if (new_cons[j] == b) {
			rscore += match_score;
			if (best_r <= rscore) {
			    best_r  = rscore;
			    r_pos = k-1;
			}
		    } else {
			rscore += mismatch_score;
		    }
		}

		if (s->left != r_pos+2) {
		    s->left  = r_pos+2;
		    nseq++;
		}
	    }
	}

	vmessage("    Extended by %d, adjusting %d sequence clip%s\n",
		 best_pos, nseq, nseq == 1 ? "" : "s");

	bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
	vmessage("    Unable to extend contig\n");
    }

    ret = 0;

 done:
    /* Shared exit: release the range list and our contig reference */
    free(r);

    cache_decr(io, c);
    if (ret == 0)
	cache_flush(io);
    return ret;
}