/*
 * Get_Location
 *
 * Returns the location of the Offset-th entry of the range described by Tag.
 *
 *   Offset == 0          -> Tag.First
 *   Offset == Tag.Gap-1  -> Tag.Last
 *   anything in between  -> looked up in R.SA_Blocks, where entry k is kept
 *                           in slot k-1 counted from Tag.Block_Start.
 *
 * When R.COMPRESS is set, the slot is a bit field of Tag.Field_Length bits
 * holding a displacement from Tag.SA_Start; the resulting SA index is resolved
 * with BWTSaValue() and subtracted from Conversion_Factor. Otherwise the slot
 * holds the location itself.
 *
 * No range check is made on Offset; callers must keep it below Tag.Gap.
 */
unsigned Get_Location(BWT* revfmi,RQINDEX & R,Tag_Info & Tag, unsigned Offset,unsigned Conversion_Factor)
{
	// Both end points of the range are cached in the tag itself.
	if (Offset == 0)
	{
		return Tag.First;
	}
	if (Offset == Tag.Gap-1)
	{
		return Tag.Last;
	}

	// Interior entries start at slot 0, so entry k lives in slot k-1.
	const unsigned Slot = Offset-1;

	if (!R.COMPRESS)
	{
		return (unsigned)R.SA_Blocks[Tag.Block_Start+Slot];
	}

	const unsigned SA_Index = Tag.SA_Start + bfx((unsigned char*)R.SA_Blocks,Tag.Block_Start+(Slot*Tag.Field_Length),Tag.Field_Length);
	return Conversion_Factor-BWTSaValue(revfmi,SA_Index);
}
//根据saIndex还原出seq在ref中的位置 void BWTRetrievePositionFromSAIndex(Idx2BWT * idx2BWT, unsigned int saIndex, unsigned int * sequenceId, unsigned int * offset) { BWT * bwt = idx2BWT->bwt; BWT * rev_bwt = idx2BWT->rev_bwt; HSP * hsp = idx2BWT->hsp; unsigned short * ambiguityMap = hsp->ambiguityMap; Translate * translate = hsp->translate; unsigned int ambPosition = BWTSaValue(bwt,saIndex); unsigned int approxIndex = ambPosition>>GRID_SAMPLING_FACTOR_2_POWER; unsigned int approxValue = ambiguityMap[approxIndex]; while (translate[approxValue].startPos>ambPosition) { approxValue--; } ambPosition-=translate[approxValue].correction; (*sequenceId) = translate[approxValue].chrID; (*offset) = ambPosition; }
char Read_Pair(BWT* fwfmi,BWT* revfmi,SARange* Head_Hits_Pos,SARange* Head_Hits_Neg, SARange* Tail_Hits_Pos,SARange* Tail_Hits_Neg,FILE* Data_File,In_File & IN,Record_Info & Hit_Details,char & MAX_HIT_FAULT,unsigned Conversion_Factor) { #undef READ_ASSERT #define READ_ASSERT(X,Y) {if ((X)<(Y)) {if(LOG_SUCCESS_FILE)fprintf(Log_SFile,"Read_Pair(): Read error...\n"); printf("Read_Pair(): Read error...\n");exit(-1);}} MAX_HIT_FAULT=FALSE; int Head_Hit_Pos_Count=0; int Head_Hit_Neg_Count=0; int Tail_Hit_Pos_Count=0; int Tail_Hit_Neg_Count=0; unsigned i, Bitsread; unsigned Start,End; unsigned Location; int Desc_End; unsigned Previous_Tag,Hits; char letter; unsigned pos; int StringLength; unsigned Progress=0,Average_Length; unsigned Number_of_Tags=100; char Mismatches_Desc[1000]; char Quality[MAXTAG+1]; char* Mismatch_Desc_Ptr; char Last_Orientation,Last_Half; char* Ins_Format; char* Del_Format; char* Mis_Format; static char First_Pass=TRUE; static int Break=0; char Tag_Type; char* Seek_Tail; char N[500]; unsigned short Stats1[7]; unsigned short Stats2[7]; char New_Record[4]; Mismatches_Record_GIS MismatchesGIS; Output_Record Record; if (First_Pass) {READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1);} if(fgets(Hit_Details.Description,MAXDES,Data_File)==NULL) {if(LOG_SUCCESS_FILE) fprintf(Log_SFile,"Read_Pair(): error reading file...\n");printf("Read_Pair(): error reading file...\n");exit(-1);}; Seek_Tail=Hit_Details.Description; while(*(Seek_Tail)!='\n'&&*(Seek_Tail)!='\t'&&*(Seek_Tail)!=' ') Seek_Tail++;*Seek_Tail=0;//make description to a string... 
READ_ASSERT(fread(Stats1,IN.Stat_Size,1,Data_File),1); READ_ASSERT(fread(Stats2,IN.Stat_Size,1,Data_File),1); READ_ASSERT(fread(IN.Tag_Copy,IN.TAG_COPY_LEN,1,Data_File),1);//if(NORMAL_TAGS && FILETYPE==FQ) gzread(Data_File,Quality,TAG_COPY_LEN); if(IN.FILETYPE==FQ) READ_ASSERT(fread(Hit_Details.Quality,1,IN.TAG_COPY_LEN,Data_File),IN.TAG_COPY_LEN); READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1); if (First_Pass) { IN.Positive_Tail=IN.Tag_Copy;First_Pass=FALSE; while (*(IN.Positive_Tail++)!='\t'){Break++;}; } IN.Tag_Copy[Break]=0; //-------------------------------------------------------------------------------------------- for(;;) { //fread(&Tag_Type,1,1,Data_File); if('&' == Tag_Type) return FALSE; if('@' == Tag_Type)//start of new tag { Head_Hits_Pos[Head_Hit_Pos_Count].Start=0; Head_Hits_Neg[Head_Hit_Neg_Count].Start=0; Tail_Hits_Pos[Tail_Hit_Pos_Count].Start=0; Tail_Hits_Neg[Tail_Hit_Neg_Count].Start=0; if((Head_Hit_Pos_Count + Head_Hit_Neg_Count ==1) && (Tail_Hit_Pos_Count + Tail_Hit_Neg_Count==1)) //unique pair.. { if (Head_Hit_Pos_Count ==1 && !Head_Hit_Neg_Count)// H + uniq { if (Tail_Hit_Pos_Count ==1 && !Tail_Hit_Neg_Count)//T + uniq { return PP; } else if (Tail_Hit_Pos_Count ==0 && Tail_Hit_Neg_Count==1)//T - uniq { return PM; } } else if (Head_Hit_Pos_Count ==0 && Head_Hit_Neg_Count ==1)// H - uniq { if (Tail_Hit_Pos_Count ==1 && !Tail_Hit_Neg_Count)//T + uniq { return MP; } else if (Tail_Hit_Pos_Count ==0 && Tail_Hit_Neg_Count==1)//T - uniq { return MM; } } } return REPEAT; } else//Process hit.... 
{ if ('&' == Tag_Type) return FALSE; //-------------------------------------------------------------------------------------------- //gzread(Data_File,New_Record,3); READ_ASSERT(fread(New_Record,1,3,Data_File),3); //if (New_Record[2]) gzread(Data_File,N,New_Record[2]*2); if (New_Record[2]) READ_ASSERT(fread(N,1,New_Record[2]*2,Data_File),New_Record[2]*2); //gzread(Data_File,&Record,sizeof(Output_Record)); READ_ASSERT(fread(&Record,1,sizeof(Output_Record),Data_File),sizeof(Output_Record)); //gzread(Data_File,&MismatchesGIS,Record.Mismatches+sizeof(unsigned)); READ_ASSERT(fread(&MismatchesGIS,1,Record.Mismatches+sizeof(unsigned),Data_File),Record.Mismatches+sizeof(unsigned)); StringLength=IN.Length_Array[New_Record[0]]; if (!Record.Gap) { if (Record.Skip) Record.Start = Conversion_Factor-revfmi->saValue[Record.Start/revfmi->saInterval]+Record.Skip-1; else Record.Start=Conversion_Factor-BWTSaValue(revfmi,Record.Start); } if(New_Record[0]==1) { if (New_Record[1] == '-') { Head_Hits_Neg[Head_Hit_Neg_Count].Start=Record.Start; Head_Hits_Neg[Head_Hit_Neg_Count].End=Record.Start+Record.Gap; Head_Hits_Neg[Head_Hit_Neg_Count].Mismatches=Record.Mismatches; Head_Hit_Neg_Count++; } else { Head_Hits_Pos[Head_Hit_Pos_Count].Start=Record.Start; Head_Hits_Pos[Head_Hit_Pos_Count].End=Record.Start+Record.Gap; Head_Hits_Pos[Head_Hit_Pos_Count].Mismatches=Record.Mismatches; Head_Hit_Pos_Count++; } #ifdef DEBUG if(Head_Hit_Neg_Count >= MAX_HITS_TO_STORE) {Head_Hit_Neg_Count--;MAX_HIT_FAULT=TRUE;}else if( Head_Hit_Pos_Count >= MAX_HITS_TO_STORE) {Head_Hit_Pos_Count--;MAX_HIT_FAULT=TRUE;}//{printf("Read_Pair: Too many hits !..\n");exit(1);} #endif } else { if (New_Record[1] == '-') { Tail_Hits_Neg[Tail_Hit_Neg_Count].Start=Record.Start; Tail_Hits_Neg[Tail_Hit_Neg_Count].End=Record.Start+Record.Gap; Tail_Hits_Neg[Tail_Hit_Neg_Count].Mismatches=Record.Mismatches; Tail_Hit_Neg_Count++; } else { Tail_Hits_Pos[Tail_Hit_Pos_Count].Start=Record.Start; 
Tail_Hits_Pos[Tail_Hit_Pos_Count].End=Record.Start+Record.Gap; Tail_Hits_Pos[Tail_Hit_Pos_Count].Mismatches=Record.Mismatches; Tail_Hit_Pos_Count++; } #ifdef DEBUG if(Tail_Hit_Neg_Count >= MAX_HITS_TO_STORE){Tail_Hit_Neg_Count--;MAX_HIT_FAULT=TRUE;} else if( Tail_Hit_Pos_Count >= MAX_HITS_TO_STORE) {Tail_Hit_Pos_Count--;MAX_HIT_FAULT=TRUE;}//printf("Read_Pair: Too many hits !..\n");exit(1);} #endif } } READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1); } }
/* * === FUNCTION ====================================================================== * Name: Search_Small_Gap * Description: Do the searching when at least one of the pairs have small SA range... * Store the result in Pairs, starting from Pairs[Pairs_Index] * Terminates hit list when encountering Pairs[x].Head=0... * ===================================================================================== */ void Search_Small_Gap(BWT* revfmi,RQINDEX & R, SARange & Head, SARange & Tail, int d,Pair* Pairs,int & Pairs_Index,unsigned MAXCOUNT,int & HITS,unsigned Entries,unsigned Conversion_Factor) { unsigned H1,H2,T1,T2,L,H,M; Tag_Info Head_Info,Tail_Info; if(Head.Start==Head.End)//Head is unique... { H1=Head.Start;//Conversion_Factor-BWTSaValue(revfmi,Head.Start); if (Tail.Start==Tail.End)// both hits are unique... { T2=Tail.Start;//Conversion_Factor-BWTSaValue(revfmi,Tail.Start); if (T2>H1 && T2 < H1+d)//modify for multiple hits... { Pairs[Pairs_Index].Head=H1; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=T2; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(H1>T2) { printf("Search_Small_Gap(6):Enum error...\n"); exit(0); } #endif } Pairs[Pairs_Index].Head=0; return; } else //tail has multiples... { if (Tail.End-Tail.Start > SAGAP_CUTOFF) { Load_Info(R,Tail_Info,Tail,Entries); T1=Tail_Info.First;T2=Tail_Info.Last; if(H1<T1 )//Possible case for T1>H1 { if(H1+d>T1) { M=0; while (H1<T1 && H1+d>=T1)//enumerate hits... { Pairs[Pairs_Index].Head=H1; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=T1; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(H1>T1) { printf("Search_Small_Gap(5):Enum error...\n"); exit(0); } #endif if (HITS>=MAXCOUNT) { Pairs[Pairs_Index].Head=0; return; } M++; T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor); if(M>=Tail_Info.Gap) break; } } } else if(T2>H1) //H1 inside tail gaps.. 
{ L=0; H=Tail_Info.Gap; while (L < H) { M=(L+H)/2; if (Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor) > H1) { H=M; } else { L=M+1; } } if (L==H) M=H;//find tail position closest to unique head... T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor); while (H1<T1 && H1+d>=T1)//enumerate hits... { Pairs[Pairs_Index].Head=H1; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=T1; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(H1>T1) { printf("Search_Small_Gap(4):Enum error...\n"); exit(0); } #endif if (HITS>=MAXCOUNT) { Pairs[Pairs_Index].Head=0; return; } M++; T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor); if(M>=Tail_Info.Gap) break; } } Pairs[Pairs_Index].Head=0; return; } else//Unique head and tail with gap below cutoff... { for (unsigned i=Tail.Start;i<=Tail.End;i++) { unsigned Hit=(Conversion_Factor-BWTSaValue(revfmi,i)); if(Hit > H1 && Hit < H1+d) { Pairs[Pairs_Index].Head=H1; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=Hit; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(H1>Hit) { printf("Search_Small_Gap(3):Enum error...\n"); exit(0); } #endif if(HITS >=MAXCOUNT) break; } } Pairs[Pairs_Index].Head=0; return; } } } else //if(Tail.End==Tail.Start)//Tail is unique... { T1=Tail.Start;//Conversion_Factor-BWTSaValue(revfmi,Tail.Start); if(Head.End-Head.Start>SAGAP_CUTOFF)//Unique tail, but with multiple possible heads... { Load_Info(R,Head_Info,Head,Entries); H1=Head_Info.First;H2=Head_Info.Last; if(T1 > H1) //Head should not be after T1... { if(H2+d >T1)//Tail not too far away from heads... { //T1-d is between H1 and H2, search for the closest hit... 
L=0; H=Head_Info.Gap-1; unsigned T1_Temp=T1; if (T1>d) T1-=d; else T1=0; while (L < H) { M=(L+H)/2; if (Get_Location(revfmi,R,Head_Info,M,Conversion_Factor) < T1) { L=M+1; } else { H=M; } } if (L==H) M=L; H1=Get_Location(revfmi,R,Head_Info,M,Conversion_Factor); //T1+=d; T1=T1_Temp; while (H1<T1 && H1+d>=T1)//enumerate hits... { Pairs[Pairs_Index].Head=H1; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=T1; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(H1>T1) { printf("Search_Small_Gap(2):Enum error...\n"); exit(0); } #endif if (HITS>=MAXCOUNT) { break; } M++; H1=Get_Location(revfmi,R,Head_Info,M,Conversion_Factor); if(M>=Head_Info.Gap) break; } } } Pairs[Pairs_Index].Head=0; return; } else//Unique tail and head with gap below cutoff... { for (unsigned i=Head.Start;i<=Head.End;i++)//try all hits... { unsigned Hit=(Conversion_Factor-BWTSaValue(revfmi,i)); if(Hit < T1 && T1 <= Hit+d) { Pairs[Pairs_Index].Head=Hit; Pairs[Pairs_Index].MismatchesH=Head.Mismatches; Pairs[Pairs_Index].Tail=T1; Pairs[Pairs_Index].MismatchesT=Tail.Mismatches; Pairs_Index++;HITS++; #ifdef DEBUG if(Hit>T1) { printf("Search_Small_Gap(1):Enum error...\n"); exit(0); } #endif if(HITS >=MAXCOUNT) break; } } Pairs[Pairs_Index].Head=0; return; } } }