/*
 * Exports Scaffold information to an AGP file.
 *
 * Every scaffold listed in io->scaffold is written as a sequence of
 * component ("W") lines, one per member contig, with a gap ("N") line of
 * m->gap_size bases emitted before every member except the first.
 * Coordinates within the scaffold start at 1 and the part number (4th
 * column) restarts at 1 for each scaffold.
 *
 * io   The database handle.
 * fn   Name of the file to create (truncated if it already exists).
 *
 * Returns 0 on success
 *        -1 on failure (file could not be opened/closed, or a scaffold or
 *           contig record could not be loaded or measured).  The output
 *           file may have been partially written in this case.
 */
int scaffold_to_agp(GapIO *io, char *fn) {
    FILE *fp;
    scaffold_t *f = NULL;
    contig_t *c = NULL;
    int i, j;

    if (NULL == (fp = fopen(fn, "w+"))) {
        verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
        return -1;
    }

    for (i = 0; io->scaffold && i < ArrayMax(io->scaffold); i++) {
        int start = 1;  /* next free base position within the scaffold */
        int k = 1;      /* AGP part number */

        f = cache_search(io, GT_Scaffold, arr(tg_rec, io->scaffold, i));
        if (!f) {
            verror(ERR_WARN, "scaffold_to_agp", "Failed to load scaffold\n");
            goto fail;
        }
        /* Pin the scaffold while we load other records */
        cache_incr(io, f);

        for (j = 0; f->contig && j < ArrayMax(f->contig); j++) {
            scaffold_member_t *m = arrp(scaffold_member_t, f->contig, j);
            int ustart, uend;
            int len;

            c = cache_search(io, GT_Contig, m->rec);
            if (!c) {
                verror(ERR_WARN, "scaffold_to_agp",
                       "Failed to load contig\n");
                goto fail;
            }
            /* Pin the contig too: c->name is read after further cache use */
            cache_incr(io, c);

            /* Get the unpadded clipped contig length */
            if (consensus_valid_range(io, m->rec, &ustart, &uend) != 0) {
                verror(ERR_WARN, "scaffold_to_agp",
                       "Failure in consensus_valid_range()");
                goto fail;
            }
            consensus_unpadded_pos(io, m->rec, uend, &uend);
            len = uend - ustart + 1;

            /* Gap line precedes every member except the first */
            if (j) {
                int gap = m->gap_size;
                fprintf(fp, "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n",
                        f->name, start, start+gap-1, k++, gap);
                start += gap;
            }

            fprintf(fp, "%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n",
                    f->name, start, start + len-1, k++,
                    c->name, ustart, uend);
            start += len;

            cache_decr(io, c);
            c = NULL;
        }

        cache_decr(io, f);
        f = NULL;
    }

    if (0 != fclose(fp)) {
        verror(ERR_WARN, "scaffold_to_agp", "%s: %s", fn, strerror(errno));
        return -1;
    }

    return 0;

 fail:
    /* Release whatever is still pinned and close the partial output */
    if (c)
        cache_decr(io, c);
    if (f)
        cache_decr(io, f);
    fclose(fp);
    return -1;
}
/*
 * Realigns ("shuffles pads in") a set of contig regions to reduce the
 * number of consensus differences, writing the result back to the database
 * when the score improves.
 *
 * For each contig the region contigs[i].start..contigs[i].end is loaded
 * into a multiple alignment, realign_seqs() is iterated until the
 * difference score stops decreasing, and if the final score beats the
 * initial one the changes are applied with update_io() and all-pad columns
 * are removed.
 *
 * io        The database handle.
 * ncontigs  Number of entries in contigs[].
 * contigs   Contig records and regions to process.
 * band      Passed through to realign_seqs().
 * flush     If non-zero, text output is updated after each scoring step
 *           and the cache is flushed after each contig.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int shuffle_contigs_io(GapIO *io, int ncontigs, contig_list_t *contigs,
                       int band, int flush) {
    int i;
    int ret = -1;
    Array indels;

    set_malign_lookup(5);

    indels = ArrayCreate(sizeof(con_indel_t), 0);
    if (NULL == indels)
        return -1;

    for (i = 0; i < ncontigs; i++) {
        tg_rec cnum = contigs[i].contig;
        int64_t old_score, new_score, tot_score, orig_score;
        MALIGN *malign;
        int c_start, c_shift;

        vmessage("Shuffling pads for contig %s\n", get_contig_name(io, cnum));

        /*
         * The shuffle pads code (malign) comes from gap4 and has lots of
         * assumptions that the contig goes from base 1 to base N.
         * Fixing these assumptions is a lot of work, so for now we will take
         * the cheat route of moving the contig to ensure the assumption
         * is valid.
         */
        if (-1 == consensus_valid_range(io, cnum, &c_start, NULL)) {
            verror(ERR_WARN, "shuffle_contigs_io",
                   "Failure in consensus_valid_range()");
            goto cleanup;
        }

        c_shift = 1-c_start;
        if (c_shift != 0) {
            if (move_contig(io, cnum, c_shift) != 0)
                goto cleanup;
        }

        malign = build_malign(io, cnum,
                              contigs[i].start + c_shift,
                              contigs[i].end + c_shift);
        resort_contigl(malign);
        malign_add_region(malign,
                          contigs[i].start + c_shift,
                          contigs[i].end + c_shift);

        /* Reuse the indel list; it only describes the current contig */
        ArrayMax(indels) = 0;
        orig_score = new_score = malign_diffs(malign, &tot_score);
        vmessage("Initial score %.2f%% mismatches (%"PRId64" mismatches)\n",
                 (100.0 * orig_score)/tot_score, orig_score/128);
        if (flush)
            UpdateTextOutput();

        /* Keep realigning for as long as each pass lowers the score */
        do {
            old_score = new_score;
            malign = realign_seqs(cnum, malign, band, indels);
            new_score = malign_diffs(malign, &tot_score);
            vmessage(" Consensus difference score: %"PRId64"\n", new_score);
            if (flush)
                UpdateTextOutput();
        } while (new_score < old_score);

        if (new_score < orig_score) {
            update_io(io, cnum, malign, indels);

            /*
             * It's possible the contig ends could move if a sequence that
             * was previously the end of a contig has been moved such that
             * it's no longer the contig end. This can lead to tags off the
             * end of the contig, so trim them (reusing break_contig
             * code).
             */
            contig_visible_start(io, cnum, CITER_CSTART);
            contig_visible_end(io, cnum, CITER_CEND);

            /*
             * Remove pad columns.
             *
             * NOTE(review): the caller's contigs[i].start/end are left
             * shifted by c_shift after this call even though the contig
             * itself is moved back below - confirm callers do not reuse
             * the region afterwards.
             */
            contigs[i].start += c_shift;
            contigs[i].end += c_shift;
            remove_pad_columns(io, 1, &contigs[i], 100, 1);
        } else {
            vmessage("Could not reduce number of consensus differences.\n");
        }

        destroy_malign(malign, 1);

        vmessage("Final score %.2f%% mismatches\n",
                 (100.0 * new_score)/tot_score);

        /*
         * Sequences like
         *    AGCT**GATGC
         *             TGGATCGA
         * can end up causing holes.  Hole removal (breaking the contig to
         * avoid minor database inconsistencies) is currently disabled here.
         */

        /* Shift contig back */
        if (c_shift != 0) {
            if (move_contig(io, cnum, -c_shift) != 0)
                goto cleanup;
        }

        if (flush)
            cache_flush(io);
    }

    ret = 0;

 cleanup:
    /* Single exit so the indel list is released on error paths too */
    ArrayDestroy(indels);
    return ret;
}
/*
 * Extends the right hand end of a single contig.
 *
 * The reads overlapping the last valid consensus base are examined.  Reads
 * whose clipped end stops short of the contig end are only used if their
 * cutoff data up to the contig end agrees with the consensus (at most 5%
 * mismatches over fewer than CSZ bases).  The cutoff data beyond the contig
 * end of the remaining reads is piled up (at most ESZ columns) to call an
 * extension consensus, and the clip points of those reads are then moved
 * outwards as far as each read individually agrees with that consensus.
 *
 * dir is currently only used to label the end ("left" when non-zero,
 * otherwise "right") in the progress message.
 *
 * Min_depth is the minimum depth for extension. If lower then even if the
 * data matches we'll not extend further.
 *
 * Match_score (+ve) and mismatch_score (-ve) are accumulated during
 * extension to ensure that we don't extend into junk mismatching DNA.
 *
 * Returns 0 on success (including "unable to extend")
 *        -1 on failure
 */
static int contig_extend_single(GapIO *io, tg_rec crec, int dir, int min_depth,
                                int match_score, int mismatch_score) {
    int end;
    rangec_t *r = NULL;
    int nr, i;
    contig_t *c;
    char cons[CSZ], new_cons[ESZ];
    int freqs[ESZ][4], depth[ESZ];
    double score, best_score;
    int best_pos, nseq;
    int ret = 0;

    vmessage("Processing contig #%"PRIrec", %s end\n",
             crec, dir ? "left" : "right");

    for (i = 0; i < ESZ; i++) {
        freqs[i][0] = freqs[i][1] = freqs[i][2] = freqs[i][3] = 0;
        depth[i] = 0;
    }

    c = cache_search(io, GT_Contig, crec);
    if (NULL == c)
        return -1;
    cache_incr(io, c);

    if (consensus_valid_range(io, crec, NULL, &end) != 0)
        goto fail;

    /* Last CSZ bases of existing consensus; cons[CSZ-1] is position 'end' */
    calculate_consensus_simple(io, crec, end-(CSZ-1), end, cons, NULL);

    /* Start */
    /* Not implemented for now: rev complement and go again! */

    /* End */
    r = contig_seqs_in_range(io, &c, end, end, 0, &nr);
    if (!r)
        goto fail;

    for (i = 0; i < nr; i++) {
        seq_t *s = cache_search(io, GT_Seq, r[i].rec);
        seq_t *sorig = s;
        int cend;
        int j, k, slen;

        if (NULL == s)
            goto fail;

        /* Work on a private copy in contig orientation where needed */
        if ((s->len < 0) ^ r[i].comp) {
            s = dup_seq(s);
            if (NULL == s)
                goto fail;
            complement_seq_t(s);
        }

        cend = r[i].start + s->right-1;

        /* Does cutoff extend to contig end, if so does it match cons? */
        if (cend < end) {
            int mis = 0, len;

            if (end - cend >= CSZ) {
                /* Too much cutoff to validate against cons[] */
                if (sorig != s)
                    free(s);
                r[i].rec = 0; /* Mark for removal */
                continue;
            }

            for (k = s->right, j = cend+1; j <= end; j++, k++) {
                if (s->seq[k] != cons[j-(end-(CSZ-1))])
                    mis++;
            }

            len = end - cend;
            if (100*mis/len > 5) {
                /* High disagreement with consensus */
                if (sorig != s)
                    free(s);
                r[i].rec = 0;
                continue;
            }
        }

        /* So we got here, let's accumulate extension stats */
        slen = ABS(s->len);
        for (k = 0, j = end+1 - r[i].start; j < slen && k < ESZ; j++, k++) {
            int base;

            if (s->seq[j] == 'N')
                continue;

            /*
             * freqs[] rows are only 4 wide; ignore any symbol whose lookup
             * value falls outside 0..3 rather than writing out of bounds.
             */
            base = dna_lookup[(uint8_t) s->seq[j]];
            if (base < 0 || base > 3)
                continue;

            freqs[k][base]++;
            depth[k]++;
        }

        if (sorig != s)
            free(s);
    }

    /*
     * Call the extension consensus column by column, tracking the running
     * agreement score and the length at which it was (last) maximal.
     */
    score = best_score = 0;
    best_pos = 0;
    for (i = 0; i < ESZ; i++) {
        int call = 0, best = 0, j;
        double dd;

        /* An empty column can never be called (and would divide by 0) */
        if (depth[i] < min_depth || depth[i] <= 0)
            break;

        for (j = 0; j < 4; j++) {
            if (best < freqs[i][j]) {
                best = freqs[i][j];
                call = j;
            }
        }

        new_cons[i] = "ACGT"[call];

        /* +fraction agreeing with the call, -fraction disagreeing */
        dd = (double)depth[i];
        score += freqs[i][call] / dd;
        score -= (depth[i] - freqs[i][call]) / dd;

        if (best_score <= score) {
            best_score = score;
            best_pos = i+1;
        }
    }

    /* Extend */
    nseq = 0;
    if (best_pos > 0) {
        int furthest_left = end;

        for (i = 0; i < nr; i++) {
            seq_t *s;
            int r_pos;
            int read_score;
            int found = 0;   /* set once r_pos holds a real position */
            int best_r = 0, j, k;

            if (r[i].rec == 0)
                continue;

            s = cache_search(io, GT_Seq, r[i].rec);
            if (s)
                s = cache_rw(io, s);
            if (NULL == s) {
                /* Still invalidate the consensus for reads already edited */
                ret = -1;
                break;
            }

            if (furthest_left > r[i].start)
                furthest_left = r[i].start;

            /*
             * end + best_pos is the furthest right we can go, but this
             * specific read may not be justified in reaching that far
             * if it has too many disagreements.
             */
            r_pos = 0;
            read_score = 0;
            if ((s->len > 0) ^ r[i].comp) {
                /* Stored in contig orientation: walk forwards, move 'right' */
                int len = ABS(s->len);

                for (k = end - r[i].start + 1, j = 0;
                     j < best_pos && k < len;
                     j++, k++) {
                    if (new_cons[j] == toupper((unsigned char) s->seq[k])) {
                        read_score += match_score;
                        if (best_r <= read_score) {
                            best_r = read_score;
                            r_pos = k+1;
                            found = 1;
                        }
                    } else {
                        read_score += mismatch_score;
                    }
                }

                /*
                 * Only move the clip when a supported position was found;
                 * otherwise r_pos is still 0 and would wipe out the read.
                 */
                if (found && s->right != r_pos) {
                    s->right = r_pos;
                    nseq++;
                }
            } else {
                /* Stored reversed: walk backwards, move 'left' */
                for (k = r[i].end - end - 1, j = 0;
                     j < best_pos && k >= 0;
                     j++, k--) {
                    char b = complement_base(s->seq[k]);

                    /* Case-fold as in the forward branch; new_cons is upper */
                    if (new_cons[j] == toupper((unsigned char) b)) {
                        read_score += match_score;
                        if (best_r <= read_score) {
                            best_r = read_score;
                            r_pos = k-1;
                            found = 1;
                        }
                    } else {
                        read_score += mismatch_score;
                    }
                }

                if (found && s->left != r_pos+2) {
                    s->left = r_pos+2;
                    nseq++;
                }
            }
        }

        if (0 == ret) {
            vmessage(" Extended by %d, adjusting %d sequence clip%s\n",
                     best_pos, nseq, nseq == 1 ? "" : "s");
        }

        bin_invalidate_consensus(io, crec, furthest_left, end + best_pos);
    } else {
        vmessage(" Unable to extend contig\n");
    }

    free(r);
    cache_decr(io, c);
    cache_flush(io);
    return ret;

 fail:
    /* Early failure: nothing has been modified yet, so no flush needed */
    free(r);
    cache_decr(io, c);
    return -1;
}