// stitchAlignToTranscript
//
// Appends align B to the last exon (A) of transcript *trA, filling/scoring whatever lies
// between them, and returns the score contributed by B plus the gap.
//
// Parameters:
//   rAend, gAend     - read / genome coordinate of the last base of A
//   rBstart, gBstart - read / genome coordinate of the first base of B
//                      (both are shifted locally if B overlaps A in the read)
//   L                - length of B (reduced locally when B is trimmed)
//   iFragB           - fragment (mate) index of B, compared with EX_iFrag of the last exon
//   sjAB             - junction-database index carried by B, or (uint)-1 for none
//   P                - run parameters: score constants, intron/mate limits, sjdb* arrays
//   R, Q, G          - read bases, per-base score contribution, genome bases;
//                      bases with code >=4 are not scored inside gaps.
//                      The motif tests below imply the coding A=0, C=1, G=2, T=3.
//   trA              - transcript modified in place (exons, nExons, nMatch, nMM, gap/indel
//                      counters, canonSJ, sjAnnot, sjStr, shiftSJ)
//   outFilterMismatchNmaxTotal - upper limit on trA->nMM + mismatches added here
//
// Returns the (positive or negative) score on success, or one of these failure codes
// (callers in this file treat anything <= -1000000 as "not stitched"):
//   -1000001  B ends at or before A in the read
//   -1000002  B ends at or before A in the genome
//   -1000003  genomic gap exceeds P->alignIntronMax
//   -1000004  gap between mates exceeds P->alignMatesGapMax
//   -1000005  flushing the junction left would consume the whole of exon A
//   -1000006  repeat around an annotated non-canonical junction is as long as an exon
//   -1000007  mismatch limits exceeded, stitch rejected
//   -1000008  B is on a different fragment and fails the position test
//
// Junction type codes written to trA->canonSJ: 1..6 = GT/AG, CT/AC, GC/AG, CT/GC, AT/AC, GT/AT;
// 0 = non-canonical; -1 = deletion; -2 = insertion; -3 = junction between fragments.
//
// NOTE: on several failure paths trA has already been partially modified (sjAnnot/sjStr are
// cleared first; shiftSJ/canonSJ are written before the mismatch test), so callers should
// pass a scratch copy if they need the original back.
intScore stitchAlignToTranscript(uint rAend, uint gAend, uint rBstart, uint gBstart, uint L, uint iFragB, uint sjAB, Parameters* P, char* R, char* Q, char* G, Transcript *trA, const uint outFilterMismatchNmaxTotal) {
    //stitch together A and B, extend in the gap, returns max score
    //Q is assumed modified already

    int Score=0;

    if (sjAB!=((uint) -1) && trA->exons[trA->nExons-1][EX_sjA]==sjAB \
            && trA->exons[trA->nExons-1][EX_iFrag]==iFragB && rBstart==rAend+1 && gAend+1<gBstart ) {//simple stitching if junction belongs to a database
        //A and B carry the same database junction and are adjacent in the read: no search needed
        if (P->sjdbMotif[sjAB]==0 && (L<=P->sjdbShiftRight[sjAB] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjAB]) ) {
            return -1000006; //too large repeats around non-canonical junction
        };
        trA->exons[trA->nExons][EX_L] = L; //new exon length
        trA->exons[trA->nExons][EX_R] = rBstart; //new exon r-start
        trA->exons[trA->nExons][EX_G] = gBstart; //new exon g-start
        trA->canonSJ[trA->nExons-1]=P->sjdbMotif[sjAB]; //mark sj-db
        trA->shiftSJ[trA->nExons-1][0]=P->sjdbShiftLeft[sjAB];
        trA->shiftSJ[trA->nExons-1][1]=P->sjdbShiftRight[sjAB];
        trA->sjAnnot[trA->nExons-1]=1;
        trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjAB];;
        trA->nExons++;
        trA->nMatch+=L;
        for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions
        Score+=P->sjdbScore;
    } else {//general stitching
        //these are written before any of the failure returns below
        trA->sjAnnot[trA->nExons-1]=0;
        trA->sjStr[trA->nExons-1]=0;

        if (trA->exons[trA->nExons-1][EX_iFrag]==iFragB) {//stitch aligns on the same fragment
            uint gBend=gBstart+L-1;
            uint rBend=rBstart+L-1;

            //check if r-overlapping fully and exit
            if (rBend<=rAend) return -1000001;
            if (gBend<=gAend && trA->exons[trA->nExons-1][EX_iFrag]==iFragB) return -1000002;

            //shift the B 5' if overlaps A 3'
            if (rBstart<=rAend) {
                gBstart+=rAend-rBstart+1;
                rBstart=rAend+1;
                L=rBend-rBstart+1;
            };

            for (uint ii=rBstart;ii<=rBend;ii++) Score+=int(Q[ii]); //add QS for mapped portions

            int gGap=gBstart-gAend-1; //could be < 0 for insertions
            int rGap=rBstart-rAend-1; //>=0 always since the overlap was removed above

            uint nMatch=L;
            uint nMM=0;
            uint Del=0, Ins=0;
            uint nIns=0, nDel=0;
            int jR=0; //junction location in R-space, relative to rAend
            int jCan=999; //canonical junction type; 999 = no junction evaluated
            uint gBstart1=gBstart-rGap-1;//the last base of the intron if all read gap belongs to acceptor, i.e. jR=0

            // check all the different combinations of gGap and rGap
            if ( gGap==0 && rGap==0 ) {//just joined the pieces, w/o stitching or gaps
                //do nothing for now
            } else if ( gGap>0 && rGap>0 && rGap==gGap ) {//no gaps, just try to fill space
                //simple stitching, assuming no insertion in the read

                for (int ii=1;ii<=rGap;ii++) {
                    if (G[gAend+ii]<4 && R[rAend+ii]<4) {//only score genome bases that are not Ns
                        if ( R[rAend+ii]==G[gAend+ii] ) {
                            Score+=int(Q[rAend+ii]);
                            nMatch++;
                        } else {
                            Score-=int(Q[rAend+ii]);
                            nMM++;
                        };
                    };
                };

            } else if ( gGap>rGap ) {//genomic gap (Deletion)
                nDel=1;
                Del=gGap-rGap; //gGap>0 here

                if (Del>P->alignIntronMax && P->alignIntronMax>0) {
                    return -1000003; //large gaps not allowed
                };

                int Score1=0;
                int jR1=1; //junction location in R-space
                do { // 1. move left, until the score for MM is less than canonical advantage
                    jR1--;
                    if ( R[rAend+jR1]!=G[gBstart1+jR1] && G[gBstart1+jR1]<4 && R[rAend+jR1]==G[gAend+jR1]) Score1 -= int(Q[rAend+jR1]);
                } while ( Score1+P->scoreStitchSJshift >= 0 && int(trA->exons[trA->nExons-1][EX_L]) + jR1 > 1);//also check that we are still within the exon

                int maxScore2=-999999;
                Score1=0;
                int jPen=0;
                do { // 2. scan to the right to find the best junction locus
                    // ?TODO? if genome base is N, how to score?
                    //Score1 tracks the gain of assigning base jR1 to the donor side rather than the acceptor side
                    if ( R[rAend+jR1]==G[gAend+jR1] && R[rAend+jR1]!=G[gBstart1+jR1] ) Score1+=int(Q[rAend+jR1]);
                    if ( R[rAend+jR1]!=G[gAend+jR1] && R[rAend+jR1]==G[gBstart1+jR1] ) Score1-=int(Q[rAend+jR1]);

                    int jCan1=-1; //this marks Deletion
                    int jPen1=0;
                    int Score2=Score1;

                    if (Del>=P->alignIntronMin) {//only check intron motif for large gaps = non-Dels
                        //check if the intron is canonical, or semi-canonical
                        //first two intron bases (donor side) and last two intron bases (acceptor side)
                        if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GTAG
                            jCan1=1;
                        } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//CTAC
                            jCan1=2;
                        } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==1 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==2 ) {//GCAG
                            jCan1=3;
                            jPen1=P->scoreGapGCAG;
                        } else if ( G[gAend+jR1+1]==1 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==2 && G[gBstart1+jR1]==1 ) {//CTGC
                            jCan1=4;
                            jPen1=P->scoreGapGCAG;
                        } else if ( G[gAend+jR1+1]==0 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==1 ) {//ATAC
                            jCan1=5;
                            jPen1=P->scoreGapATAC;
                        } else if ( G[gAend+jR1+1]==2 && G[gAend+jR1+2]==3 && G[gBstart1+jR1-1]==0 && G[gBstart1+jR1]==3 ) {//GTAT
                            jCan1=6;
                            jPen1=P->scoreGapATAC;
                        } else {
                            jCan1=0;
                            jPen1=P->scoreGapNoncan;
                        };

                        Score2 += jPen1;
                    };

                    if (maxScore2 < Score2 ) {//check if the score is the highest. TODO: record the next highest score
                        //strict '<': among equal scores the leftmost position is kept
                        maxScore2=Score2;
                        jR=jR1; //this is the last base of donor
                        jCan=jCan1;
                        jPen=jPen1;
                    };
                    jR1++;
                } while ( jR1 < int(rBend) - int(rAend) );//TODO: do not need to search the full B-transcript, can stop as soon as Score goes down by more than

                //repeat length: go back and forth around jR to find repeat length
                uint jjL=0,jjR=0;
                while ( gAend+jR>=jjL && G[gAend-jjL+jR]==G[gBstart1-jjL+jR] && G[gAend-jjL+jR]<4 && jjL<=MAX_SJ_REPEAT_SEARCH) {//go back
                    jjL++;
                };

                while ( gAend+jjR+jR+1<P->nGenome && G[gAend+jjR+jR+1]==G[gBstart1+jjR+jR+1] && G[gAend+jjR+jR+1]<4 && jjR<=MAX_SJ_REPEAT_SEARCH) {//go forward
                    jjR++;
                };

                if (jCan<=0) {//flush deletions and non-canonical junction to the left
                    jR-=jjL;
                    if (int(trA->exons[trA->nExons-1][EX_L])+jR<1) return -1000005;
                    jjR+=jjL;
                    jjL=0;
                };

                //TODO check here if the internal exon length < minDa, if so exit w/o stitching

                //the range covers the read gap plus any bases of A (ii<1) or B (ii>rGap) that changed sides
                for (int ii=min(1,jR+1);ii<=max(rGap,jR);ii++) {//score donor and acceptor
                    uint g1=(ii<=jR) ? (gAend+ii):(gBstart1+ii);
                    if (G[g1]<4 && R[rAend+ii]<4) {//only penalize non-N bases
                        if ( R[rAend+ii]==G[g1] ) {
                            if (ii>=1 && ii <=rGap) {//only add +score and matches within the gap
                                Score+=int(Q[rAend+ii]);
                                nMatch++;
                            };
                        } else {//add -score and MM for all bases
                            Score-=int(Q[rAend+ii]);
                            nMM++;
                            if (ii<1 || ii>rGap) {//subtract previously presumed matches
                                Score-=int(Q[rAend+ii]);
                                nMatch--;
                            };
                        };
                    };
                };

                //score the gap
                if (P->sjdbN>0) {//check if the junction is annotated
                    uint jS=gAend+jR+1, jE=gBstart1+jR;//intron start/end
                    int sjdbInd=binarySearch2(jS,jE,P->sjdbStart,P->sjdbEnd,P->sjdbN);

                    if (sjdbInd<0) {//not found in the database
                        if (Del>=P->alignIntronMin) {
                            Score += P->scoreGap + jPen; //genome gap penalty + non-canonical penalty
                        } else {//deletion
                            Score += Del*P->scoreDelBase + P->scoreDelOpen;
                            jCan=-1;
                            trA->sjAnnot[trA->nExons-1]=0;
                        };
                    } else {//annotated
                        jCan=P->sjdbMotif[sjdbInd];
                        if (P->sjdbMotif[sjdbInd]==0) {//shift to match annotations
                            if (L<=P->sjdbShiftLeft[sjdbInd] || trA->exons[trA->nExons-1][EX_L]<=P->sjdbShiftLeft[sjdbInd]) {
                                return -1000006;
                            };
                            jR += (int) P->sjdbShiftLeft[sjdbInd];
                            jjL=P->sjdbShiftLeft[sjdbInd];
                            jjR=P->sjdbShiftRight[sjdbInd];
                        };
                        trA->sjAnnot[trA->nExons-1]=1;
                        trA->sjStr[trA->nExons-1]=P->sjdbStrand[sjdbInd];
                        Score += P->sjdbScore;
                    };

                } else {//no annotation
                    if (Del>=P->alignIntronMin) {//junction, not short deletion
                        Score += P->scoreGap + jPen;
                    } else {
                        Score += Del*P->scoreDelBase + P->scoreDelOpen;
                        jCan=-1;
                        trA->sjAnnot[trA->nExons-1]=0;
                    };
                };

                //these are stored before the mismatch test below, i.e. even if the stitch is rejected
                trA->shiftSJ[trA->nExons-1][0]=jjL;
                trA->shiftSJ[trA->nExons-1][1]=jjR;
                trA->canonSJ[trA->nExons-1]=jCan;

                if (trA->sjAnnot[trA->nExons-1]==0) {//strand for unannotated junctions
                    if (jCan>0) {
                        trA->sjStr[trA->nExons-1]=2-jCan%2; //1=+,2=- : odd motif codes give 1, even give 2
                    } else {
                        trA->sjStr[trA->nExons-1]=0;
                    };
                };

            } else if ( rGap>gGap ) {//insertion: if also gGap>0, need to stitch
                Ins=rGap-gGap;
                nIns=1;
                if (gGap==0) {//simple insertion, no need to stitch
                    jR=0;
                } else if (gGap<0) {//reduce the score
                    //B overlaps A in the genome: the first -gGap bases of B are counted as inserted, undo their score
                    jR=0;
                    for (int ii=0; ii<-gGap; ii++) Score -= int(Q[rBstart+ii]);
                } else {//stitch: define the exon boundary jR
                    int Score1=0; int maxScore1=0;
                    for (int jR1=1;jR1<=gGap;jR1++) {//scan to the right to find the best score

                        if (G[gAend+jR1]<4) {//only penalize good genome bases
                            //gain from aligning genome base jR1 to the read base before the insertion rather than after it
                            Score1+=( R[rAend+jR1]==G[gAend+jR1] ) ? int(Q[rAend+jR1]):-int(Q[rAend+jR1]);
                            Score1+=( R[rAend+Ins+jR1]==G[gAend+jR1] ) ? -int(Q[rAend+Ins+jR1]):+int(Q[rAend+Ins+jR1]);
                        };

                        if (Score1>maxScore1) {
                            maxScore1=Score1;
                            jR=jR1;
                        };
                    };

                    for (int ii=1;ii<=gGap;ii++) {//score donor and acceptor
                        uint r1=rAend+ii+(ii<=jR ? 0:Ins);
                        if (G[gAend+ii]<4 && R[r1]<4) {
                            if ( R[r1]==G[gAend+ii] ) {
                                Score+=int(Q[r1]);
                                nMatch++;
                            } else {//add -score and MM for all bases
                                Score-=int(Q[r1]);
                                nMM++;
                            };
                        };
                    };
                };
                Score += Ins*P->scoreInsBase + P->scoreInsOpen;
                jCan=-3; //negative value => the per-motif mismatch test below is skipped
            }; //different types of gaps selection

        #ifdef COMPILE_FOR_LONG_READS
            if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal )
        #else
            if ( (trA->nMM + nMM)<=outFilterMismatchNmaxTotal \
                    && ( jCan<0 || (jCan<7 && nMM<= (uint) P->alignSJstitchMismatchNmax[(jCan+1)/2]) ) )
        #endif
            {//stitching accepted: total mismatches within limit and (non-long-read build) per-motif mismatch limit satisfied
                trA->nMM += nMM;
                trA->nMatch += nMatch;

                if (Del>=P->alignIntronMin) {
                    trA->nGap += nDel;
                    trA->lGap += Del;
                } else {
                    trA->nDel += nDel;
                    trA->lDel += Del;
                };

                //modify exons
                if (Del==0 && Ins==0) {//no gap => no new exon, extend the boundary of the previous exon
                    trA->exons[trA->nExons-1][EX_L] += rBend-rAend;
                } else if (Del>0) { //deletion: can only have Del>0 or Ins>0
                    trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary
                    trA->exons[trA->nExons][EX_L] = rBend-rAend-jR; //new exon length
                    trA->exons[trA->nExons][EX_R] = rAend+jR+1; //new exon r-start
                    trA->exons[trA->nExons][EX_G] = gBstart1+jR+1; //new exon g-start
                    trA->nExons++;
                } else if (Ins>0) { //Ins>0;
                    trA->nIns += nIns;
                    trA->lIns += Ins;
                    trA->exons[trA->nExons-1][EX_L] += jR; //correct the previous exon boundary
                    trA->exons[trA->nExons][EX_L] = rBend-rAend-jR-Ins; //new exon length
                    trA->exons[trA->nExons][EX_R] = rAend+jR+Ins+1; //new exon r-start
                    trA->exons[trA->nExons][EX_G] = gAend+1+jR; //new exon g-start
                    trA->canonSJ[trA->nExons-1]=-2; //mark insertion
                    trA->sjAnnot[trA->nExons-1]=0;
                    trA->nExons++;
                };
            } else {//stitching was not accepted
                return -1000007;
            };

        } else if (gBstart+trA->exons[0][EX_R] >= trA->exons[0][EX_G] || trA->exons[0][EX_G] < trA->exons[0][EX_R]){//stitch aligns from different fragments

            if (P->alignMatesGapMax>0 && gBstart > trA->exons[trA->nExons-1][EX_G] + trA->exons[trA->nExons-1][EX_L] + P->alignMatesGapMax) {
                return -1000004; //gap between mates too large
            };
            //extend the fragments inside

            //note, that this always works, i.e. Score>0
            for (uint ii=rBstart;ii<rBstart+L;ii++) Score+=int(Q[ii]); //add QS for mapped portions

            Transcript trExtend;

            //TODO: compare extensions to the left and right, pick the best one to be performed first
            //otherwise if a large nMM is reached in the 1st extension, it will prevent the 2nd extension
            //use the following example:
            //>1
            //TTCTGTGTCTCCCCCTCCCCCACTGGCTACATGGAGACAGGGGGGGGGGGCCGGGCGGTTCCCGGGCAGAAAAAAA
            //>1
            //AATATTTGGAACACTTATGTGAAAAATGATTTGTTTTTCTGAAATTTACGTTTCTCTCTGAGTCCTGTAACTGTCC

            //extend A forward (towards its fragment's end)
            trExtend.reset();
            if ( extendAlign(R, Q, G, rAend+1, gAend+1, 1, 1, DEF_readSeqLengthMax, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \
                             P->alignEndsType.ext[trA->exons[trA->nExons-1][EX_iFrag]][1], &trExtend) ) {

                trA->add(&trExtend);
                Score += trExtend.maxScore;

                trA->exons[trA->nExons-1][EX_L] += trExtend.extendL;
            };// if extendAlign for read A

            trA->exons[trA->nExons][EX_R] = rBstart;
            trA->exons[trA->nExons][EX_G] = gBstart;
            trA->exons[trA->nExons][EX_L] = L;
            trA->nMatch += L;

            //extend B backward (towards its fragment's start)
            trExtend.reset();
            //if end extension needs to be forced, use large length. Otherwise, only extend until the beginning of the transcript
            uint extlen=P->alignEndsType.ext[iFragB][1] ? DEF_readSeqLengthMax : gBstart-trA->exons[0][EX_G]+trA->exons[0][EX_R];
            if ( extendAlign(R, Q, G, rBstart-1, gBstart-1, -1, -1, extlen, trA->nMatch, trA->nMM, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax, \
                             P->alignEndsType.ext[iFragB][1], &trExtend) ) {

                trA->add(&trExtend);
                Score += trExtend.maxScore;

                trA->exons[trA->nExons][EX_R] -= trExtend.extendL;
                trA->exons[trA->nExons][EX_G] -= trExtend.extendL;
                trA->exons[trA->nExons][EX_L] += trExtend.extendL;
            }; //if extendAlign B

            trA->canonSJ[trA->nExons-1]=-3; //mark different fragments junction
            trA->sjAnnot[trA->nExons-1]=0;

            trA->nExons++;
        } else {//no stitching possible
            return -1000008;
        };
    };

    //reached only on success; when no new exon was created this re-tags the extended last exon
    trA->exons[trA->nExons-1][EX_iFrag]=iFragB; //the new exon belongs to fragment iFragB
    trA->exons[trA->nExons-1][EX_sjA]=sjAB;

    return Score;
};
// stitchWindowAligns
//
// Recursively enumerates combinations of the nA aligns of one window: for align iA it
// recurses once with the align stitched onto the transcript (if stitching succeeds) and once
// without it (unless the anchor rule below forbids exclusion). When iA reaches nA the
// accumulated transcript is extended to the read ends, filtered, and recorded in wTr.
//
// Parameters:
//   iA, nA      - index of the align considered at this level / number of aligns in WA
//   Score       - score accumulated so far
//   WAincl      - per-align inclusion flags, updated as the recursion proceeds
//   tR2, tG2    - read / genome coordinate of the last base of the transcript so far
//   trA         - transcript so far; passed BY VALUE so each branch works on its own copy
//   Lread       - read length used for the end extension and bounds checks
//   WA          - aligns of the window (fields WA_rStart, WA_gStart, WA_Length, WA_iFrag,
//                 WA_sjA, WA_Nrep, WA_Anchor are read here)
//   R, Q, G     - read bases, per-base score contribution, genome bases
//   sigG        - not used in this function other than being passed down the recursion
//   P           - run parameters
//   wTr, nWinTr - recorded transcripts of this window, kept ordered by decreasing score,
//                 and their count; wTr[*nWinTr] is used as the spare slot for a new entry
//   RA          - owning ReadAlign (mismatch limit, readLength, maxScoreMate)
void stitchWindowAligns(uint iA, uint nA, int Score, bool WAincl[], uint tR2, uint tG2, Transcript trA, \
                        uint Lread, uiWA* WA, char* R, char* Q, char* G, char* sigG,\
                        Parameters* P, Transcript** wTr, uint* nWinTr, ReadAlign *RA) {
    //recursively stitch aligns for one gene
    //*nWinTr - number of transcripts for the current window

    if (iA>=nA && tR2==0) return; //no aligns in the transcript

    if (iA>=nA) {//no more aligns to add, finalize the transcript

        //extend first
        Transcript trAstep1;

        int vOrder[2]; //order of the two end extensions: 0 = read start, 1 = read end

        #if EXTEND_ORDER==1
        if ( trA.roStr==0 ) {//decide in which order to extend: extend the 5' of the read first
            vOrder[0]=0; vOrder[1]=1;
        } else {
            vOrder[0]=1; vOrder[1]=0;
        };
        #elif EXTEND_ORDER==2
            vOrder[0]=0; vOrder[1]=1;
        #else
            #error "EXTEND_ORDER value unrecognized"
        #endif

        for (int iOrd=0;iOrd<2;iOrd++) {

            switch (vOrder[iOrd]) {

            case 0: //extend at start

            if (trA.rStart>0) {// if transcript does not start at base 0, extend to the read start

                //mismatch fraction limit for the extension; -1 is passed for the "Extend5pOfRead1" case
                double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[0][EX_iFrag]==0 && trA.Str==0) ? -1 : P->outFilterMismatchNoverLmax1;

                trAstep1.reset();
                //the min() avoids extending before the chromosome start
                if ( extendAlign(R, Q, G, trA.rStart-1, trA.gStart-1, -1, -1, min(trA.rStart, trA.gStart - P->chrStart[trA.Chr]), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend

                    trA.add(&trAstep1);
                    Score += trAstep1.maxScore;

                    trA.exons[0][EX_R] = trA.rStart = trA.rStart - trAstep1.extendL;
                    trA.exons[0][EX_G] = trA.gStart = trA.gStart - trAstep1.extendL;
                    trA.exons[0][EX_L] += trAstep1.extendL;

                };
                //TODO penalize the unmapped bases at the start
            };
            break;

            case 1: //extend at end

            if ( tR2+1<Lread ) {//extend alignment to the read end

                //mismatch fraction limit for the extension; -1 is passed for the "Extend5pOfRead1" case
                double pMMmax=(P->alignEndsType=="Extend5pOfRead1" && trA.exons[trA.nExons-1][EX_iFrag]==0 && trA.Str==0) ? -1 : P->outFilterMismatchNoverLmax1;

                trAstep1.reset();
                //the min() prevents extension past the chromosome end
                if ( extendAlign(R, Q, G, tR2+1, tG2+1, +1, +1, min(Lread-tR2-1,P->chrStart[trA.Chr+1]-tG2-2), tR2-trA.rStart+1, trA.nMM, RA->outFilterMismatchNmaxTotal, pMMmax, &trAstep1) ) {//if could extend

                    trA.add(&trAstep1);
                    Score += trAstep1.maxScore;

                    tR2 += trAstep1.extendL;
                    tG2 += trAstep1.extendL;

                    trA.exons[trA.nExons-1][EX_L] += trAstep1.extendL;//extend the length of the last exon

                };
                //TODO penalize unmapped bases at the end
            };
            };
        };

        if (P->alignSoftClipAtReferenceEnds=="No" && \
                ( (trA.exons[trA.nExons-1][EX_G] + Lread-trA.exons[trA.nExons-1][EX_R]) > (P->chrStart[trA.Chr]+P->chrLength[trA.Chr]) || \
                   trA.exons[0][EX_G]<(P->chrStart[trA.Chr]+trA.exons[0][EX_R]) ) ) {
            return; //no soft clipping past the ends of the chromosome
        };

        trA.rLength = 0;
        for (uint isj=0;isj<trA.nExons;isj++) {
            trA.rLength += trA.exons[isj][EX_L];
        };
        trA.gLength = tG2+1-trA.gStart; //genomic span from the first to the last aligned base

        //check exons lengths including repeats, do not report a transcript with short exons
        for (uint isj=0;isj<trA.nExons-1;isj++) {//check exons for min length, if they are not annotated and precede a junction
            if ( trA.canonSJ[isj]>=0 ) {//junction
                if (trA.sjAnnot[isj]==1) {//sjdb
                    //an exon next to an annotated junction is length-checked only when it is terminal,
                    //borders the mate gap, or its other side is an unannotated junction
                    if ( ( trA.exons[isj][EX_L] < P->alignSJDBoverhangMin && (isj==0 || trA.canonSJ[isj-1]==-3 || (trA.sjAnnot[isj-1]==0 && trA.canonSJ[isj-1]>=0) ) )\
                      || ( trA.exons[isj+1][EX_L] < P->alignSJDBoverhangMin && (isj==trA.nExons-2 || trA.canonSJ[isj+1]==-3 || (trA.sjAnnot[isj+1]==0 && trA.canonSJ[isj+1]>=0) ) ) )return;
                } else {//non-sjdb
                    if ( trA.exons[isj][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][0] \
                      || trA.exons[isj+1][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][1] ) return;
                };
            };
        };
        if (trA.nExons>1 && trA.sjAnnot[trA.nExons-2]==1 && trA.exons[trA.nExons-1][EX_L] < P->alignSJDBoverhangMin) return; //this exon was not checked in the cycle above

        //count junctions by motif class: [0] non-canonical, [1] odd motif codes, [2] even motif codes
        trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0;
        for (uint iex=0;iex<trA.nExons-1;iex++) {
            if (trA.canonSJ[iex]==0) {
                ++trA.intronMotifs[0];
            } else if (trA.canonSJ[iex]>0) {
                ++trA.intronMotifs[2-trA.canonSJ[iex]%2];
            };
        };

        //filter strand consistency
        trA.sjMotifStrand=0;
        uint sjN=0;
        for (uint iex=0;iex<trA.nExons-1;iex++) {
            if (trA.canonSJ[iex]>=0) sjN++;
            if (trA.sjStr[iex]>0) {//only these sjs have defined strand
                if (trA.sjMotifStrand==0) {
                    trA.sjMotifStrand=trA.sjStr[iex];
                } else if (trA.sjMotifStrand != trA.sjStr[iex]) {//inconsistent strand
                    return; //kill this transcript
                };
            };
        };

        if (sjN>0 && trA.sjMotifStrand==0 && P->outSAMstrandField=="intronMotif") {//strand not defined for a junction
            return;
        };

        if (P->outFilterIntronMotifs=="None") {//no filtering

        } else if (P->outFilterIntronMotifs=="RemoveNoncanonical") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0) return;
            };
        } else if (P->outFilterIntronMotifs=="RemoveNoncanonicalUnannotated") {
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]==0 && trA.sjAnnot[iex]==0) return;
            };
        } else {
            ostringstream errOut;
            errOut << "EXITING because of FATAL INPUT error: unrecognized value of --outFilterIntronMotifs=" <<P->outFilterIntronMotifs <<"\n";
            errOut << "SOLUTION: re-run STAR with --outFilterIntronMotifs = None -OR- RemoveNoncanonical -OR- RemoveNoncanonicalUnannotated\n";
            exitWithError(errOut.str(),std::cerr, P->inOut->logMain, EXIT_CODE_INPUT_FILES, *P);
        };

        {//check mapped length for each mate
            uint nsj=0,exl=0; //junction count and summed exon length of the current mate
            for (uint iex=0;iex<trA.nExons;iex++) {
                exl+=trA.exons[iex][EX_L];
                if (iex==trA.nExons-1 || trA.canonSJ[iex]==-3) {//mate is completed, make the checks
                    if (nsj>0 && (exl<P->alignSplicedMateMapLmin || exl < (uint) (P->alignSplicedMateMapLminOverLmate*RA->readLength[trA.exons[iex][EX_iFrag]])) ) {
                        return; //do not record this transcript
                    };
                    exl=0;nsj=0;
                } else if (trA.canonSJ[iex]>=0) {
                    nsj++;
                };
            };
        };

        if (P->outFilterBySJoutStage==2) {//junctions have to be present in the filtered set P->sjNovel*
            for (uint iex=0;iex<trA.nExons-1;iex++) {
                if (trA.canonSJ[iex]>=0 && trA.sjAnnot[iex]==0) {
                    uint jS=trA.exons[iex][EX_G]+trA.exons[iex][EX_L];
                    uint jE=trA.exons[iex+1][EX_G]-1;
                    if ( binarySearch2(jS,jE,P->sjNovelStart,P->sjNovelEnd,P->sjNovelN) < 0 ) return;
                };
            };
        };

        if ( trA.exons[0][EX_iFrag]!=trA.exons[trA.nExons-1][EX_iFrag] ) {//check for correct overlap between mates
            if (trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] <= trA.exons[0][EX_G]) return; //to avoid negative insert size
            uint iexM2=trA.nExons;
            for (uint iex=0;iex<trA.nExons-1;iex++) {//find the first exon of the second mate
                if (trA.canonSJ[iex]==-3) {
                    iexM2=iex+1;
                    break;
                };
            };
            //NOTE(review): if no -3 marker were found iexM2 would stay at nExons and exons[iexM2] below
            //would be one past the last exon; presumably a two-fragment transcript always has the marker - confirm

            if ( trA.exons[iexM2-1][EX_G] + trA.exons[iexM2-1][EX_L] > trA.exons[iexM2][EX_G] ) {//mates overlap - check consistency of junctions

                if (trA.exons[0][EX_G] > trA.exons[iexM2][EX_G]+trA.exons[0][EX_R]) return; //LeftMateStart > RightMateStart
                if (trA.exons[iexM2-1][EX_G]+trA.exons[iexM2-1][EX_L] > trA.exons[trA.nExons-1][EX_G]+Lread-trA.exons[trA.nExons-1][EX_R]) return; //LeftMateEnd > RightMateEnd

                //check for junctions consistency
                uint iex1=1, iex2=iexM2+1; //last exons of the junction
                for (; iex1<iexM2; iex1++) {//find first junction that overlaps 2nd mate
                    if (trA.exons[iex1][EX_G] >= trA.exons[iex2-1][EX_G] + trA.exons[iex2-1][EX_L]) break;
                };
                while (iex1<iexM2 && iex2<trA.nExons) {//cycle through all overlapping exons
                    if (trA.canonSJ[iex1-1]<0) {//skip non-junctions
                        iex1++;
                        continue;
                    };
                    if (trA.canonSJ[iex2-1]<0) {//skip non-junctions
                        iex2++;
                        continue;
                    };

                    //both the acceptor (exon start) and the donor (previous exon end) must coincide
                    if ( ( trA.exons[iex1][EX_G]!=trA.exons[iex2][EX_G] ) || ( (trA.exons[iex1-1][EX_G]+trA.exons[iex1-1][EX_L]) != (trA.exons[iex2-1][EX_G]+trA.exons[iex2-1][EX_L]) ) ) {
                        return; //inconsistent junctions on overlapping mates
                    };
                    iex1++;
                    iex2++;
                };//cycle through all overlapping exons
            };//mates overlap - check consistency of junctions
        };//check for correct overlap between mates

        if (P->scoreGenomicLengthLog2scale!=0) {//add gap length score
            Score += int(ceil( log2( (double) ( trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] - trA.exons[0][EX_G]) ) \
                     * P->scoreGenomicLengthLog2scale - 0.5));
            Score = max(0,Score);
        };

        //calculate some final values for the transcript
        trA.roStart = (trA.roStr == 0) ? trA.rStart : Lread - trA.rStart - trA.rLength;

        trA.maxScore=Score;

        if (trA.exons[0][EX_iFrag]==trA.exons[trA.nExons-1][EX_iFrag]) {//mark single fragment transcripts
            trA.iFrag=trA.exons[0][EX_iFrag];
            RA->maxScoreMate[trA.iFrag] = max (RA->maxScoreMate[trA.iFrag] , Score);
        } else {
            trA.iFrag=-1;
        };

        if ( Score+P->outFilterMultimapScoreRange >= wTr[0]->maxScore \
                || ( trA.iFrag>=0 && Score+P->outFilterMultimapScoreRange >= RA->maxScoreMate[trA.iFrag] ) \
                || P->chimSegmentMin>0) {
            //only record the transcripts within the window that are in the Score range
            //OR within the score range of each mate
            //OR all transcripts if chimeric detection is activated

            if (P->outFilterMismatchNoverLmax1<0) {//check that the alignment is end-to-end
                uint rTotal=trA.rLength+trA.lIns;
                if ( (trA.iFrag<0 && rTotal<(RA->readLength[0]+RA->readLength[1])) || (trA.iFrag>=0 && rTotal<RA->readLength[trA.iFrag])) return;
            };

            uint iTr=0; //transcript insertion/replacement place

            trA.mappedLength=0;
            for (uint iex=0;iex<trA.nExons;iex++) {//calculate total mapped length
                trA.mappedLength += trA.exons[iex][EX_L];
            };

            while (iTr < *nWinTr) {//scan through all recorded transcripts for this window - check for duplicates

                //uNew / uOld: mapped bases unique to the new / the recorded transcript
                uint nOverlap=blocksOverlap(trA,*wTr[iTr]);
                uint uNew=trA.mappedLength-nOverlap;
                uint uOld=wTr[iTr]->mappedLength-nOverlap;

                if (uNew==0 && Score < wTr[iTr]->maxScore) {//new transcript is a subset of the old ones
                    break; //leaves iTr < *nWinTr, so the new transcript is not inserted below
                } else if (uOld==0) {//old transcript is a subset of the new one, remove old transcript
                    Transcript *pTr=wTr[iTr];
                    for (uint ii=iTr+1;ii<*nWinTr;ii++) wTr[ii-1]=wTr[ii]; //shift transcripts
                    (*nWinTr)--;
                    wTr[*nWinTr]=pTr; //park the removed object in the first free slot for reuse
                } else if (uOld>0 && (uNew>0 || Score >= wTr[iTr]->maxScore) ) {//check next transcript
                    iTr++;
                };

            };

            if (iTr==*nWinTr) {//insert the new transcript
                for (iTr=0;iTr<*nWinTr;iTr++) {//find insertion location: higher score first, shorter gLength on ties
                    if (Score>wTr[iTr]->maxScore || (Score==wTr[iTr]->maxScore && trA.gLength<wTr[iTr]->gLength) ) break;
                };

                Transcript *pTr=wTr[*nWinTr]; //spare object in the slot just past the recorded ones
                for (int ii=*nWinTr; ii> int(iTr); ii--) {//shift all the transcript pointers down from iTr
                    wTr[ii]=wTr[ii-1];
                };
                wTr[iTr]=pTr; //move the spare object into position iTr
                *(wTr[iTr])=trA;
                if (*nWinTr<P->alignTranscriptsPerWindowNmax) {
                    (*nWinTr)++; //increment number of transcripts per window;
                } else {
                    //limit reached: the count stays put, so the entry shifted into slot *nWinTr is dropped
                    //"WARNING: too many recorded transcripts per window: iRead="<<RA->iRead<< "\n";
                };
            };
        };

        return;
    };

    ///////////////////////////////////////////////////////////////////////////////////
    int dScore=0;
    Transcript trAi=trA; //trA copy with this align included, to be used in the 1st recursive call of StitchAlign
    if (trA.nExons>0) {//stitch, a transcript has already been originated

        dScore=stitchAlignToTranscript(tR2, tG2, WA[iA][WA_rStart], WA[iA][WA_gStart], WA[iA][WA_Length], WA[iA][WA_iFrag], WA[iA][WA_sjA], P, R, Q, G, &trAi, RA->outFilterMismatchNmaxTotal);
        //TODO check if the new stitching creates too many MM, quit this transcript if so

    } else { //this is the first align in the transcript
        trAi.exons[0][EX_R]=trAi.rStart=WA[iA][WA_rStart]; //transcript start/end
        trAi.exons[0][EX_G]=trAi.gStart=WA[iA][WA_gStart];
        trAi.exons[0][EX_L]=WA[iA][WA_Length];
        trAi.exons[0][EX_iFrag]=WA[iA][WA_iFrag];
        trAi.exons[0][EX_sjA]=WA[iA][WA_sjA];

        trAi.nExons=1; //recorded first exon

        for (uint ii=0;ii<WA[iA][WA_Length];ii++) dScore+=int(Q [ WA[iA][WA_rStart] + ii ]); //sum all the scores

        trAi.nMatch=WA[iA][WA_Length]; //# of matches

        for (uint ii=0; ii<nA; ii++) WAincl[ii]=false; //a new transcript starts: clear all inclusion flags

    };

    if (dScore>-1000000) {//include this align; stitching failure codes are all below -1000000
        WAincl[iA]=true;

        if ( WA[iA][WA_Nrep]==1 ) trAi.nUnique++; //unique piece
        if ( WA[iA][WA_Anchor]>0 ) trAi.nAnchor++; //anchor piece

        stitchWindowAligns(iA+1, nA, Score+dScore, WAincl, WA[iA][WA_rStart]+WA[iA][WA_Length]-1, WA[iA][WA_gStart]+WA[iA][WA_Length]-1, trAi, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA);
    } else {

    };

    //also run a transcript w/o including this align
    if (WA[iA][WA_Anchor]!=2 || trA.nAnchor>0) {//only allow exclusion if this is not the last anchor, or other anchors have been used
        WAincl[iA]=false;
        stitchWindowAligns(iA+1, nA, Score, WAincl, tR2, tG2, trA, Lread, WA, R, Q, G, sigG, P, wTr, nWinTr, RA);
    };
    return;
};
void ReadAlign::stitchWindowSeeds (uint iW, uint iWrec, char* R, char* Q, char* G) {//stitches all seeds in one window: iW for (uint iS1=0;iS1<nWA[iW];iS1++) { scoreSeedBest[iS1]=0; scoreSeedBestMM[iS1]=0; scoreSeedBestInd[iS1]=-1; intScore score2=0; for (uint iS2=0;iS2<=iS1;iS2++) { trA1=*trInit;//initialize trA1 if (iS2<iS1) { trA1.nExons=1; trA1.nMM=scoreSeedBestMM[iS2]; trA1.exons[0][EX_R] = WA[iW][iS2][WA_rStart]; trA1.exons[0][EX_G] = WA[iW][iS2][WA_gStart]; trA1.exons[0][EX_L] = WA[iW][iS2][WA_Length]; trA1.exons[0][EX_iFrag]=WA[iW][iS2][WA_iFrag]; trA1.exons[0][EX_sjA]=WA[iW][iS2][WA_sjA]; score2=\ stitchAlignToTranscript(WA[iW][iS2][WA_rStart]+WA[iW][iS2][WA_Length]-1, WA[iW][iS2][WA_gStart]+WA[iW][iS2][WA_Length]-1,\ WA[iW][iS1][WA_rStart], WA[iW][iS1][WA_gStart], WA[iW][iS1][WA_Length], WA[iW][iS1][WA_iFrag], WA[iW][iS1][WA_sjA], \ P, R, Q, G, &trA1, outFilterMismatchNmaxTotal); if (score2>0 && score2+scoreSeedBest[iS2] > scoreSeedBest[iS1] ) { scoreSeedBest[iS1]=score2+scoreSeedBest[iS2]; scoreSeedBestMM[iS1]=trA1.nMM; scoreSeedBestInd[iS1]=iS2; }; } else {//extend to the left score2=WA[iW][iS1][WA_Length]; if ( WA[iW][iS1][WA_rStart]>0 \ && extendAlign(R, Q, G, WA[iW][iS1][WA_rStart]-1, WA[iW][iS1][WA_gStart]-1, -1, -1, min(WA[iW][iS1][WA_rStart], WA[iW][iS1][WA_gStart] - P->chrStart[WC[iW][WC_Str]]), 100000, 0, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax1, &trA1) ) {//if could extend score2 += trA1.maxScore; }; if (score2 > scoreSeedBest[iS1] ) { scoreSeedBest[iS1]=score2; scoreSeedBestInd[iS1]=iS1; // scoreSeedBestMM[iS1]=trA1.nMM; }; }; }; }; intScore scoreBest=0; uint scoreBestInd=0; for (uint iS1=0;iS1<nWA[iW];iS1++) {//find the best alignment trA1=*trInit;//initialize trA1 uint tR2=WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]; uint tG2=WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]; if ( tR2 < Lread-1 \ && extendAlign(R, Q, G, tR2, tG2, +1, +1, min(Lread-tR2,P->chrStart[WC[iW][WC_Str]+1]-tG2-1), 100000, scoreSeedBestMM[iS1], 
outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax1, &trA1) ) {//if could extend scoreSeedBest[iS1]+=trA1.maxScore; }; if (scoreSeedBest[iS1]>scoreBest) {//record new best transcript scoreBest=scoreSeedBest[iS1]; scoreBestInd=iS1; }; }; uint seedN=0; while (true) {//construct the sequence of seeds seedChain[seedN++]=scoreBestInd; if (scoreBestInd>scoreSeedBestInd[scoreBestInd]){//keep going scoreBestInd=scoreSeedBestInd[scoreBestInd]; } else {//this seed is hte first one break; }; }; int Score=0; {//build final transcript form seedChain {//initiate transcript uint iS1=seedChain[seedN-1]; Score= WA[iW][iS1][WA_Length]; trA.maxScore = Score; trA.nMatch = WA[iW][iS1][WA_Length]; //# of matches trA.nMM = 0; trA.exons[0][EX_R] = trA.rStart = WA[iW][iS1][WA_rStart]; trA.exons[0][EX_G] = trA.gStart = WA[iW][iS1][WA_gStart]; trA.exons[0][EX_L] = WA[iW][iS1][WA_Length]; trA.exons[0][EX_iFrag]=WA[iW][iS1][WA_iFrag]; trA.exons[0][EX_sjA]=WA[iW][iS1][WA_sjA]; trA.nExons=1; }; for (uint iSc=seedN-1; iSc>0; iSc--) {//stitch seeds from the chain uint iS1=seedChain[iSc], iS2=seedChain[iSc-1]; int scoreStitch= stitchAlignToTranscript(WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]-1, WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]-1,\ WA[iW][iS2][WA_rStart], WA[iW][iS2][WA_gStart], WA[iW][iS2][WA_Length], WA[iW][iS2][WA_iFrag], WA[iW][iS2][WA_sjA], \ P, R, Q, G, &trA, outFilterMismatchNmaxTotal); // if (scoreStitch>0) { Score+=scoreStitch; // } else { // cout <<"BUG"<<endl; // return;//this should not happen // }; }; trA.maxScore=Score; {//extend to the left uint iS1=seedChain[seedN-1]; trA1=*trInit; if ( trA.exons[0][EX_R]>0 \ && extendAlign(R, Q, G, trA.exons[0][EX_R]-1, trA.exons[0][EX_G]-1, -1, -1, min(trA.exons[0][EX_R], trA.exons[0][EX_G] - P->chrStart[WC[iW][WC_Str]]), 100000, 0, outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax1, &trA1) ) {//if could extend trA.add(&trA1); // trA.maxScore += trA1.maxScore + WA[iW][iS1][WA_Length]; // trA.nMatch += trA1.nMatch + 
WA[iW][iS1][WA_Length]; //# of matches // trA.nMM += trA1.nMM; trA.exons[0][EX_R] -= trA1.extendL; trA.exons[0][EX_G] -= trA1.extendL; trA.exons[0][EX_L] += trA1.extendL; trA.rStart = trA.exons[0][EX_R]; trA.gStart = trA.exons[0][EX_G]; }; }; {//extend to the right uint iS1=seedChain[0]; trA1=*trInit;//initialize trA1 uint tR2=WA[iW][iS1][WA_rStart]+WA[iW][iS1][WA_Length]; uint tG2=WA[iW][iS1][WA_gStart]+WA[iW][iS1][WA_Length]; if ( tR2 < Lread-1 \ && extendAlign(R, Q, G, tR2, tG2, +1, +1, min(Lread-tR2,P->chrStart[WC[iW][WC_Str]+1]-tG2-1), 100000, scoreSeedBestMM[iS1], outFilterMismatchNmaxTotal, P->outFilterMismatchNoverLmax1, &trA1) ) {//if could extend trA.add(&trA1); trA.exons[trA.nExons-1][EX_L] += trA1.extendL;//extend the length of the last exon }; }; }; //debug: recalculate the number of MM // { // uint nMM1=0; // for (uint iex=0;iex<trA.nExons;iex++) { // for (uint ii=0;ii<trA.exons[iex][EX_L];ii++) { // if ( R[ii+trA.exons[iex][EX_R]]!=G[ii+trA.exons[iex][EX_G]] && G[ii+trA.exons[iex][EX_G]]<4 && R[ii+trA.exons[iex][EX_R]]<4) { // nMM1++; // }; // }; // }; // if (nMM1!=trA.nMM) { // cout <<nMM1<<" "<<trA.nMM<<" "<<readName<<" "<<iRead<<endl; // }; // }; {//calculate some final values for the transcript trA.rLength = 0; for (uint isj=0;isj<trA.nExons;isj++) { trA.rLength += trA.exons[isj][EX_L]; }; trA.gLength = trA.exons[trA.nExons-1][EX_G]+1-trA.gStart; //calculate some final values for the transcript trA.roStart = (trA.roStr == 0) ? 
trA.rStart : Lread - trA.rStart - trA.rLength; if (trA.exons[0][EX_iFrag]==trA.exons[trA.nExons-1][EX_iFrag]) {//mark single fragment transcripts trA.iFrag=trA.exons[0][EX_iFrag]; maxScoreMate[trA.iFrag] = max (maxScoreMate[trA.iFrag] , trA.maxScore); } else { trA.iFrag=-1; }; trA.intronMotifs[0]=0;trA.intronMotifs[1]=0;trA.intronMotifs[2]=0; for (uint iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]==0) { ++trA.intronMotifs[0]; } else if (trA.canonSJ[iex]>0) { ++trA.intronMotifs[2-trA.canonSJ[iex]%2]; }; }; if (P->scoreGenomicLengthLog2scale!=0) {//add gap length score trA.maxScore += int(ceil( log2( (double) ( trA.exons[trA.nExons-1][EX_G]+trA.exons[trA.nExons-1][EX_L] - trA.exons[0][EX_G]) ) \ * P->scoreGenomicLengthLog2scale - 0.5)); trA.maxScore = max(0,trA.maxScore); }; //filter strand consistency trA.sjMotifStrand=0; uint sjN=0; for (uint iex=0;iex<trA.nExons-1;iex++) { if (trA.canonSJ[iex]>=0) sjN++; if (trA.sjStr[iex]>0) {//only these sjs have defined strand if (trA.sjMotifStrand==0) { trA.sjMotifStrand=trA.sjStr[iex]; } else if (trA.sjMotifStrand != trA.sjStr[iex]) {//inconsistent strand return; //kill this transcript }; }; }; // if (P->outFilterIntronMotifs=="KeepCanonical" && (trA.intronMotifs[0]>0 || (trA.intronMotifs[1]>0 && trA.intronMotifs[2]>0) ) ) {//keep only conistent canonical introns // return; // }; //check exons lenghts including repeats, do not report a transcript with short exons // for (uint isj=0;isj<trA.nExons-1;isj++) {//check exons for min length, if they precede a junction // if ( trA.canonSJ[isj]>=0 && \ // ( trA.exons[isj][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][0] \ // || trA.exons[isj+1][EX_L] < P->alignSJoverhangMin + trA.shiftSJ[isj][1]) ) { // return;//do not record this transcript in wTr // }; // }; }; {//record the transcript TODO: allow for multiple transcripts in one window *(trAll[iWrec][0])=trA; nWinTr[iWrec]=1; }; };