//----------
//
// filter_aligns_by_num_gaps--
//	Filter a list of alignments, discarding (and freeing) every alignment
//	that contains more than a given number of separate gaps.
//
// Arguments:
//	alignel*	alignList:				Head of the list to filter, linked
//										.. through ->next.  May be NULL.
//	s32			maxSeparateGapsCount:	The largest number of gaps an
//										.. alignment may have and still be kept.
//
// Returns:
//	The head of the list of surviving alignments, in their original relative
//	order (NULL if none survive).  Discarded elements and their edit scripts
//	are released with free_if_valid, so the caller must use only the returned
//	pointer afterwards.
//
// Notes:
//	A "gap" here is one call to edit_script_indel_len between two runs of
//	substitutions;  its length does not affect the count.
//
//----------

alignel* filter_aligns_by_num_gaps (alignel* alignList, s32 maxSeparateGapsCount)
{
    alignel* a, *next;
    alignel* head, *prev;
    unspos   height, width, i, j, run;
    u32      opIx;
    s32      numGaps;

    // process each alignment, collecting a list of those that pass

    head = prev = NULL;
    for (a=alignList ; a!=NULL ; a=next) {
        // grab the successor now;  a is either freed or relinked below
        next = a->next;

        // count the gaps, giving up as soon as the limit is exceeded

        numGaps = 0;
        height  = a->end1 - a->beg1 + 1;
        width   = a->end2 - a->beg2 + 1;

        opIx = 0;
        for (i=j=0 ; (i<height)||(j<width) ; ) {
            // handle the next run of substitutions
            run = edit_script_run_of_subs (a->script, &opIx);
            i += run; j += run;

            // handle the next indel, if the alignment hasn't ended
            if ((i < height) || (j < width)) {
                edit_script_indel_len (a->script, &opIx, &i, &j);
                if (++numGaps > maxSeparateGapsCount) break;
            }
        }

        if (numGaps > maxSeparateGapsCount) {
            // (unwanted alignment, discard it);  the id strings name this
            // routine so that allocation diagnostics point to the right place
            free_if_valid ("filter_aligns_by_num_gaps a->script", a->script);
            free_if_valid ("filter_aligns_by_num_gaps a",         a);
            continue;
        }

        // this alignment is ok, add it to the end of the new list we're
        // building

        if (head == NULL)
            head = prev = a;
        else {
            prev->next = a;
            prev       = a;
        }
        a->next = NULL;
    }

    return head;
}
//----------
//
// print_lav_align--
//	Write one gapped alignment to a file as a lav "a {...}" stanza:  a score
//	line, begin and end lines, and one "l" line per run of substitutions.
//
// Arguments:
//	FILE*		f:			The file to write to.
//	const u8*	seq1, seq2:	The two sequences' bases;  passed to
//							.. edit_script_run_of_subs_match to count matches.
//	unspos		beg1, end1:	Aligned interval in sequence 1.  beg1 is
//							.. incremented on entry so that the interval is
//							.. origin-1, inclusive.
//	unspos		beg2, end2:	Aligned interval in sequence 2, same convention.
//	editscript*	script:		The alignment's edit script.
//	score		s:			The alignment's score.
//
// Returns:
//	(nothing)
//
//----------

void print_lav_align (FILE* f, const u8* seq1, unspos beg1, unspos end1, const u8* seq2, unspos beg2, unspos end2, editscript* script, score s)
{
    unspos rows, cols;              // extent of the alignment in each sequence
    unspos row, col;                // progress through the alignment
    unspos segRow, segCol;          // where the current ungapped segment began
    unspos segLen, segMatches;
    u32    scriptIx;

    // (internally, we want origin 1, inclusive)
    beg1 += 1;
    beg2 += 1;

    rows = end1 - beg1 + 1;
    cols = end2 - beg2 + 1;

    // stanza header:  score, begin and end

    fprintf (f, "a {\n s " scoreFmtSimple "\n"
                " b " unsposFmt " " unsposFmt "\n"
                " e " unsposFmt " " unsposFmt "\n",
                s, beg1, beg2, end1, end2);

    // one "l" line per ungapped segment;  the indel (if any) that follows a
    // segment is stepped over without being printed

    scriptIx = 0;
    row = col = 0;
    while ((row < rows) || (col < cols)) {
        segRow = row;
        segCol = col;

        segLen = edit_script_run_of_subs_match (script, &scriptIx,
                                                seq1+beg1+row-1, seq2+beg2+col-1,
                                                &segMatches);
        row += segLen;
        col += segLen;

        fprintf (f, " l " unsposFmt " " unsposFmt " " unsposFmt " " unsposFmt " %d\n",
                    beg1+segRow, beg2+segCol, beg1+row-1, beg2+col-1,
                    align_match_percent (segLen, segMatches));

        if ((row >= rows) && (col >= cols))
            break;

        edit_script_indel_len (script, &scriptIx, &row, &col);
    }

    fprintf (f, "}\n");
}
//----------
//
// print_align_list_segments--
//	For every alignment in a list, report each of its runs of substitutions
//	as a separate match, by way of print_match.
//
// Arguments:
//	alignel*	alignList:	Head of the list of alignments (linked through
//							.. ->next).  May be NULL.
//
// Returns:
//	(nothing)
//
// Notes:
//	Each segment is scored with score_match, using the scoring set and the
//	two sequences found in currParams.
//
//----------

void print_align_list_segments (alignel* alignList)
{
    alignel* a;
    unspos   beg1, end1, beg2, end2;
    unspos   height, width, i, j, prevI, prevJ, run;
    u32      opIx;
    score    s;

    a = alignList;
    while (a != NULL) {
        beg1 = a->beg1;
        beg2 = a->beg2;
        end1 = a->end1;
        end2 = a->end2;

        height = end1 - beg1 + 1;
        width  = end2 - beg2 + 1;

        // (local names above are unchanged in case this debugging macro
        // refers to any of them)
        snoopGenpaf_1;

        // print the alignment's segments

        opIx = 0;
        i = j = 0;
        while ((i < height) || (j < width)) {
            // remember where this segment starts
            prevI = i;
            prevJ = j;

            run = edit_script_run_of_subs (a->script, &opIx);
            i += run;
            j += run;

            // step over the indel that follows, unless we've hit the end
            if ((i < height) || (j < width))
                edit_script_indel_len (a->script, &opIx, &i, &j);

            s = score_match (currParams->scoring,
                             currParams->seq1, beg1-1+prevI,
                             currParams->seq2, beg2-1+prevJ,
                             run);
            print_match (beg1-1+prevI, beg2-1+prevJ, run, s);
        }

        a = a->next;
    }
}
//----------
//
// alignment_gap_rate--
//	Compute the gap rate of an alignment, as a fraction.
//
// Arguments:
//	alignel*	a:		The alignment of interest.
//	unspos*		_numer:	Place to return the numerator:  the number of
//						.. positions, summed over both sequences, that are
//						.. not part of any run of substitutions.
//	unspos*		_denom:	Place to return the denominator:  the total length
//						.. of all the runs of substitutions.
//
// Returns:
//	(nothing;  the values are returned through _numer and _denom)
//
// Notes:
//	If the alignment contains no substitutions at all, both values are set
//	to zero, so the caller must check the denominator before dividing.
//
//----------

void alignment_gap_rate (alignel* a, unspos* _numer, unspos* _denom)
{
    unspos beg1 = a->beg1;
    unspos beg2 = a->beg2;
    unspos height, width, i, j;
    u32    opIx;
    unspos run;
    unspos denom, gappedBases;

    height = a->end1 - beg1 + 1;
    width  = a->end2 - beg2 + 1;

    // total up the lengths of the runs of substitutions;  indels are only
    // stepped over here, they are accounted for by subtraction below

    denom = 0;
    opIx  = 0;
    for (i=j=0 ; (i<height)||(j<width) ; ) {
        run = edit_script_run_of_subs (a->script, &opIx);
        i += run; j += run;
        denom += run;

        if ((i < height) || (j < width))
            edit_script_indel_len (a->script, &opIx, &i, &j);
    }

    if (denom == 0) {
        *_numer = *_denom = 0;
        return;
    }

    // whatever part of each sequence's interval isn't in a run of
    // substitutions must lie opposite a gap

    gappedBases = (height - denom) + (width - denom);

    *_numer = gappedBases;
    *_denom = denom;
}
void print_sam_align (FILE* f, seq* seq1, unspos beg1, unspos end1, seq* seq2, unspos beg2, unspos end2, editscript* script, arg_dont_complain(score s), int softMasked, char* rgTags) { seqpartition* sp1 = &seq1->partition; seqpartition* sp2 = &seq2->partition; partition* part; unspos height, width, i, j, prevI, prevJ, run; u32 opIx; unspos len2; char* name1, *name2; unspos offset1, offset2, start1, start2; unspos startLoc1, startLoc2; unspos seq2Len, seq2True; int flag; char maskCh; unspos preMask, postMask, tmp; if (seq1->revCompFlags != rcf_forward) suicide ("attempt to print - strand or complement for sequence 1 in print_sam_align"); beg1++; // (internally, we want origin 1, inclusive) beg2++; height = end1 - beg1 + 1; len2 = width = end2 - beg2 + 1; ////////// // figure out position offsets and names ////////// if (sp1->p == NULL) // sequence 1 is not partitioned { name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; offset1 = 0; startLoc1 = seq1->startLoc; } else // sequence 1 is partitioned { part = lookup_partition (seq1, beg1-1); name1 = &sp1->pool[part->header]; offset1 = part->sepBefore + 1; startLoc1 = part->startLoc; } if (sp2->p == NULL) // sequence 2 is not partitioned { name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; offset2 = 0; seq2Len = seq2->len; seq2True = seq2->trueLen; startLoc2 = seq2->startLoc; } else // sequence 2 is partitioned { part = lookup_partition (seq2, beg2-1); name2 = &sp2->pool[part->header]; offset2 = part->sepBefore + 1; seq2Len = part->sepAfter - offset2; seq2True = part->trueLen; startLoc2 = part->startLoc; } ////////// // print sam line (field names indicate below are per sam spec) ////////// start1 = beg1-1 - offset1 + startLoc1; if ((seq2->revCompFlags & rcf_rev) == 0) { start2 = beg2-1 - offset2 + startLoc2; end2 = start2-1 + len2; flag = 0; } else { start2 = startLoc2 + offset2 + (seq2Len - beg2) - (len2-1); end2 = startLoc2 + offset2 + (seq2Len - beg2); flag = BAM_FREVERSE; } // print qname, flag, rname, pos and mapq fprintf (f, "%s\t%d\t%s\t" unsposFmt "\t%d\t", name2, flag, name1, start1, 255); // print cigar maskCh = (softMasked)? 'S' : 'H'; preMask = postMask = 0; if (start2 > 1) preMask = start2 - 1; if (end2 < seq2True) postMask = seq2True - end2; if ((seq2->revCompFlags & rcf_rev) != 0) { tmp = preMask; preMask = postMask; postMask = tmp; } if (preMask != 0) fprintf (f, unsposFmt "%c", preMask, maskCh); opIx = 0; for (i=j=0 ; (i< height)||(j<width) ; ) { run = edit_script_run_of_subs (script, &opIx); fprintf (f, unsposFmt "M", run); i += run; j += run; if ((i < height) || (j < width)) { prevI = i; prevJ = j; edit_script_indel_len (script, &opIx, &i, &j); if (i > prevI) fprintf (f, unsposFmt "D", i - prevI); if (j > prevJ) fprintf (f, unsposFmt "I", j - prevJ); } } if (postMask != 0) fprintf (f, unsposFmt "%c", postMask, maskCh); // print mrnm, mpos, and isize fprintf (f, "\t%s\t%d\t%d\t", "*", 0, 0); // print seq (data from sequence 2) print_query_bases (f, seq2, beg2-1, len2, softMasked); // print qual (if we have no qual data, we print "*") if (seq2->vq == NULL) fprintf (f, "\t%s", "*"); else { fprintf (f, "\t"); print_query_quals (f, seq2, beg2-1, 
len2, softMasked); } // print tags if (rgTags != NULL) fprintf (f, "\t%s", rgTags); fprintf (f, "\n"); }
void print_maf_align (FILE* f, seq* seq1, unspos beg1, unspos end1, seq* seq2, unspos beg2, unspos end2, editscript* script, score s) { seqpartition* sp1 = &seq1->partition; seqpartition* sp2 = &seq2->partition; partition* part; unspos height, width, i, j, run; u32 opIx; u8* p, *q; unspos ix; char* name1, *name2, *pref2, *suff1, *suff2; unspos offset1, offset2, start1, start2; unspos startLoc1, startLoc2; unspos seq1Len, seq2Len, seq1True, seq2True; char strand1, strand2; unspos startI, startJ; int len1, len2, nameW, startW, endW, lenW; #ifdef debugSeq1Beg if ((beg1 < debugSeq1Beg) || (end1 > debugSeq1End)) return; #endif // debugSeq1Beg beg1++; // (internally, we want origin 1, inclusive) beg2++; height = end1 - beg1 + 1; width = end2 - beg2 + 1; // report diagonal if (maf_dbgReportDiag) fprintf (f, "# diagonal=" sgnposFmt "\n", diagNumber(beg1,beg2)); ////////// // figure out position offsets and names ////////// if (sp1->p == NULL) // sequence 1 is not partitioned { name1 = (seq1->useFullNames)? seq1->header : seq1->shortHeader; if ((name1 == NULL) || (name1[0] == 0)) name1 = "seq1"; offset1 = 0; startLoc1 = seq1->startLoc; seq1Len = seq1->len; seq1True = seq1->trueLen; } else // sequence 1 is partitioned { part = lookup_partition (seq1, beg1-1); name1 = &sp1->pool[part->header]; offset1 = part->sepBefore + 1; startLoc1 = part->startLoc; seq1Len = part->sepAfter - offset1; seq1True = part->trueLen; } if (sp2->p == NULL) // sequence 2 is not partitioned { name2 = (seq2->useFullNames)? 
seq2->header : seq2->shortHeader; if ((name2 == NULL) || (name2[0] == 0)) name2 = "seq2"; offset2 = 0; startLoc2 = seq2->startLoc; seq2Len = seq2->len; seq2True = seq2->trueLen; } else // sequence 2 is partitioned { part = lookup_partition (seq2, beg2-1); name2 = &sp2->pool[part->header]; startLoc2 = part->startLoc; offset2 = part->sepBefore + 1; seq2Len = part->sepAfter - offset2; seq2True = part->trueLen; } ////////// // print summary line ////////// fprintf (f, "a score=" scoreFmt "\n", s); ////////// // print aligning path in sequence 1 ////////// // figure out fields and widths pref2 = ((maf_distinguishNames) && (strcmp (name1, name2) == 0))? "~" : ""; suff1 = rcfSuffix[seq1->revCompFlags]; suff2 = rcfSuffix[seq2->revCompFlags]; if ((seq1->revCompFlags & rcf_rev) == 0) { start1 = beg1-1 - offset1 + startLoc1; strand1 = '+'; } else { start1 = beg1-1 - offset1 + seq1True+2 - (startLoc1 + seq1Len); strand1 = '-'; } if ((seq2->revCompFlags & rcf_rev) == 0) { start2 = beg2-1 - offset2 + startLoc2; strand2 = '+'; } else { start2 = beg2-1 - offset2 + seq2True+2 - (startLoc2 + seq2Len); strand2 = '-'; } len1 = strlen (name1) + strlen (suff1); len2 = strlen (pref2) + strlen (name2) + strlen (suff2); nameW = (len1 >= len2)? 
len1 : len2; startW = max_digits (start1, start2); endW = max_digits (end1+1-beg1, end2+1-beg2); lenW = max_digits (seq1True, seq2True); // print aligning path in sequence 1 (non-printables are printed as '*' // but such should never be seen unless there is a problem elsewhere) fprintf (f, "s %s%s%*s" unsposStarFmt " " unsposStarFmt " %c " unsposStarFmt " ", name1, suff1, nameW+1-len1, " ", startW, start1-1, endW, end1+1-beg1, strand1, lenW, seq1True); opIx = 0; for (i=j=0 ; (i<height)||(j<width) ; ) { // handle the next run run = edit_script_run_of_subs (script, &opIx); p = seq1->v+beg1+i-1; q = seq2->v+beg2+j-1; for (ix=0 ; ix<run ; ix++) { fprintf (f, "%c", dna_toprint(*p)); p++; q++; } i += run; j += run; // handle the next indel if ((i < height) || (j < width)) { startI = i; p = seq1->v+beg1+i-1; startJ = j; q = seq2->v+beg2+j-1; edit_script_indel_len (script, &opIx, &i, &j); if (i != startI) { for ( ; startI<i ; startI++) { fprintf (f, "%c", dna_toprint(*p)); p++; } } if (j != startJ) { for ( ; startJ<j ; startJ++) { fprintf (f, "-"); q++; } } } } fprintf (f, "\n"); ////////// // print aligning path in sequence 2 ////////// fprintf (f, "s %s%s%s%*s" unsposStarFmt " " unsposStarFmt " %c " unsposStarFmt " ", pref2, name2, suff2, nameW+1-len2, " ", startW, start2-1, endW, end2+1-beg2, strand2, lenW, seq2True); opIx = 0; for (i=j=0 ; (i<height)||(j<width) ; ) { // handle the next run run = edit_script_run_of_subs (script, &opIx); p = seq1->v+beg1+i-1; q = seq2->v+beg2+j-1; for (ix=0 ; ix<run ; ix++) { fprintf (f, "%c", dna_toprint(*q)); p++; q++; } i += run; j += run; // handle the next indel if ((i < height) || (j < width)) { startI = i; p = seq1->v+beg1+i-1; startJ = j; q = seq2->v+beg2+j-1; edit_script_indel_len (script, &opIx, &i, &j); if (i != startI) { for ( ; startI<i ; startI++) { fprintf (f, "-"); p++; } } if (j != startJ) { for ( ; startJ<j ; startJ++) { fprintf (f, "%c", dna_toprint(*q)); q++; } } } } fprintf (f, "\n\n"); }
//----------
//
// print_maf_align_list--
//	Write every alignment in a list to a file in maf format, optionally
//	preceding each one with "#" comment lines that give its identity,
//	coverage, continuity and cigar string.
//
// Arguments:
//	FILE*		f:				The file to write to.
//	alignel*	alignList:		Head of the list of alignments (linked
//								.. through ->next).  May be NULL.
//	seq*		seq1, seq2:		The two sequences that were aligned.
//	int			withComments:	Non-zero => write the comment lines too.
//
// Returns:
//	(nothing)
//
//----------

void print_maf_align_list (FILE* f, alignel* alignList, seq* seq1, seq* seq2, int withComments)
{
    alignel* aln;
    unspos   top, bottom;           // a statistic, as a fraction top/bottom

    aln = alignList;
    while (aln != NULL) {
        if (withComments) {
            unspos rows, cols, row, col, rowBefore, colBefore, subsLen;
            u32    scriptIx;

            // report identity

            alignment_identity (seq1, seq2, aln, &top, &bottom);
            fprintf (f, "# identity=" unsposSlashFmt, top, bottom);
            if (bottom != 0)
                fprintf (f, " (%.1f%%)", (100.0*top) / bottom);
            fprintf (f, "\n");

            // report coverage

            alignment_coverage (seq1, seq2, aln, &top, &bottom);
            fprintf (f, "# coverage=" unsposSlashFmt, top, bottom);
            if (bottom != 0)
                fprintf (f, " (%.1f%%)", (100.0*top) / bottom);
            fprintf (f, "\n");

            // report continuity

            alignment_continuity (aln, &top, &bottom);
            fprintf (f, "# continuity=" unsposSlashFmt, top, bottom);
            if (bottom != 0)
                fprintf (f, " (%.1f%%)", (100.0*top) / bottom);
            fprintf (f, "\n");

            // report alignment path:  "m" for a run of substitutions, "d"
            // for an advance in sequence 1 only, "i" for one in sequence 2

            fprintf (f, "# cigar=");

            rows = aln->end1 - aln->beg1 + 1;
            cols = aln->end2 - aln->beg2 + 1;

            scriptIx = 0;
            row = col = 0;
            while ((row < rows) || (col < cols)) {
                subsLen = edit_script_run_of_subs (aln->script, &scriptIx);
                fprintf (f, unsposFmt "m", subsLen);
                row += subsLen;
                col += subsLen;

                if ((row >= rows) && (col >= cols))
                    break;

                rowBefore = row;
                colBefore = col;
                edit_script_indel_len (aln->script, &scriptIx, &row, &col);
                if (row > rowBefore) fprintf (f, unsposFmt "d", row - rowBefore);
                if (col > colBefore) fprintf (f, unsposFmt "i", col - colBefore);
            }

            fprintf (f, "\n");
        }

        // the alignment itself;  one is subtracted from each start position
        // because print_maf_align adds one on entry

        print_maf_align (f,
                         seq1, aln->beg1-1, aln->end1,
                         seq2, aln->beg2-1, aln->end2,
                         aln->script, aln->s);

        aln = aln->next;
    }
}