Ejemplo n.º 1
0
/*
 * Get_Location
 *
 * Returns the location of entry number Offset in the hit list described by Tag.
 *
 *   revfmi            index handed to BWTSaValue() when the list is compressed
 *   R                 supplies the SA_Blocks storage and the COMPRESS flag
 *   Tag               describes one hit list (First, Last, Gap, Block_Start, ...)
 *   Offset            entry to fetch; 0 yields Tag.First and Tag.Gap-1 yields Tag.Last
 *   Conversion_Factor value the SA value is subtracted from in the compressed case
 *
 * No range check is made on Offset.
 * NOTE(review): the valid range looks like 0 .. Tag.Gap-1 - confirm against Load_Info().
 */
unsigned Get_Location(BWT* revfmi,RQINDEX & R,Tag_Info & Tag, unsigned Offset,unsigned Conversion_Factor)
{
	// Both end points are kept in the tag itself.
	if (Offset == 0)
	{
		return Tag.First;
	}
	if (Offset==Tag.Gap-1)
	{
		return Tag.Last;
	}

	// Offset 0 is served from Tag.First above, so stored entries are addressed from Offset-1.
	const unsigned Interior = Offset - 1;

	if (!R.COMPRESS)
	{
		// Uncompressed: the stored entry is returned as is.
		return (unsigned)R.SA_Blocks[Tag.Block_Start+Interior];
	}

	// Compressed: the stored field is added to Tag.SA_Start to form an SA index,
	// which is then converted with BWTSaValue().
	// NOTE(review): bfx() presumably extracts a Field_Length-wide bit field - confirm.
	const unsigned SA_Index = Tag.SA_Start + bfx((unsigned char*)R.SA_Blocks,Tag.Block_Start+(Interior*Tag.Field_Length),Tag.Field_Length);
	return Conversion_Factor-BWTSaValue(revfmi,SA_Index);
}
Ejemplo n.º 2
0
// Recover, from saIndex, the position of the sequence in the reference.
//
//   idx2BWT     index bundle; its bwt and hsp members are read
//   saIndex     suffix-array index to resolve
//   sequenceId  out: chrID of the translate entry covering the position
//   offset      out: position after subtracting that entry's correction
//
// Both output pointers are written unconditionally and must be non-NULL.
void BWTRetrievePositionFromSAIndex(Idx2BWT * idx2BWT, unsigned int saIndex,
        unsigned int * sequenceId, unsigned int * offset)
{
    BWT * bwt = idx2BWT->bwt;
    HSP * hsp = idx2BWT->hsp;
    unsigned short * ambiguityMap = hsp->ambiguityMap;
    Translate * translate = hsp->translate;

    // Position as stored in the suffix array, before the correction below.
    unsigned int ambPosition = BWTSaValue(bwt,saIndex);

    // ambiguityMap gives a starting translate entry for each block of
    // 2^GRID_SAMPLING_FACTOR_2_POWER positions.
    unsigned int approxIndex = ambPosition>>GRID_SAMPLING_FACTOR_2_POWER;
    unsigned int approxValue = ambiguityMap[approxIndex];

    // Step back to the last entry whose startPos is not beyond the position.
    // NOTE(review): relies on some entry at or below approxValue having
    // startPos <= ambPosition; otherwise approxValue wraps - confirm.
    while (translate[approxValue].startPos>ambPosition) {
        approxValue--;
    }
    ambPosition-=translate[approxValue].correction;

    (*sequenceId) = translate[approxValue].chrID;
    (*offset) = ambPosition;
}
Ejemplo n.º 3
0
/*
 * Read_Pair
 *
 * Reads the hits of one read pair from Data_File and sorts them into the four
 * hit lists (head/tail, '+'/'-' strand).
 *
 *   fwfmi              not used by this function
 *   revfmi             index used to turn SA indices into locations
 *   Head_Hits_Pos/Neg  out: hits of records whose first byte is 1
 *   Tail_Hits_Pos/Neg  out: hits of all other records
 *   Data_File          input stream
 *   IN                 input description (Stat_Size, TAG_COPY_LEN, FILETYPE, Tag_Copy, ...)
 *   Hit_Details        receives the description and, for FQ input, the quality string
 *   MAX_HIT_FAULT      out: reset to FALSE; only DEBUG builds ever set it to TRUE
 *   Conversion_Factor  value SA values are subtracted from to obtain locations
 *
 * Returns FALSE when the '&' marker is read, PP / PM / MP / MM when head and
 * tail each have exactly one hit (letters give head and tail strand), and
 * REPEAT otherwise. Each list is terminated by an entry with Start==0.
 *
 * Any short read prints "Read error", logs it if LOG_SUCCESS_FILE is set, and
 * calls exit(-1).
 *
 * The function keeps static state (First_Pass, Break) across calls, so it is
 * not reentrant.
 *
 * NOTE(review): the sizes read into Stats1/Stats2, N and MismatchesGIS come
 * from IN / the file and are not checked against the buffers - confirm the
 * writer guarantees them.
 */
char Read_Pair(BWT* fwfmi,BWT* revfmi,SARange* Head_Hits_Pos,SARange* Head_Hits_Neg, SARange* Tail_Hits_Pos,SARange* Tail_Hits_Neg,FILE* Data_File,In_File & IN,Record_Info & Hit_Details,char & MAX_HIT_FAULT,unsigned Conversion_Factor)
{
#undef READ_ASSERT
#define READ_ASSERT(X,Y) {if ((X)<(Y)) {if(LOG_SUCCESS_FILE)fprintf(Log_SFile,"Read_Pair(): Read error...\n"); printf("Read_Pair(): Read error...\n");exit(-1);}}
	MAX_HIT_FAULT=FALSE;
	int Head_Hit_Pos_Count=0;
	int Head_Hit_Neg_Count=0;
	int Tail_Hit_Pos_Count=0;
	int Tail_Hit_Neg_Count=0;
	static char First_Pass=TRUE;//TRUE until the first call has completed its header handling
	static int Break=0;//index of the first tab in IN.Tag_Copy, found on the first call
	char Tag_Type;
	char* Seek_Tail;
	char N[500];
	unsigned short Stats1[7];
	unsigned short Stats2[7];
	char New_Record[4];
	Mismatches_Record_GIS MismatchesGIS;
	Output_Record Record;

	//Only the very first call has to consume a leading tag-type byte; later
	//calls start after the marker that ended the previous call.
	if (First_Pass) {READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1);} 
	if(fgets(Hit_Details.Description,MAXDES,Data_File)==NULL) {if(LOG_SUCCESS_FILE) fprintf(Log_SFile,"Read_Pair(): error reading file...\n");printf("Read_Pair(): error reading file...\n");exit(-1);};

	//Cut the description at the first newline, tab or space. fgets() may return
	//a line containing none of these (line longer than the buffer, or end of
	//file), so the scan must also stop at the terminating NUL.
	Seek_Tail=Hit_Details.Description;
	while(*Seek_Tail!='\0'&&*Seek_Tail!='\n'&&*Seek_Tail!='\t'&&*Seek_Tail!=' ') Seek_Tail++;
	*Seek_Tail=0;//make description to a string...

	READ_ASSERT(fread(Stats1,IN.Stat_Size,1,Data_File),1);
	READ_ASSERT(fread(Stats2,IN.Stat_Size,1,Data_File),1);
	READ_ASSERT(fread(IN.Tag_Copy,IN.TAG_COPY_LEN,1,Data_File),1);
	if(IN.FILETYPE==FQ) {READ_ASSERT(fread(Hit_Details.Quality,1,IN.TAG_COPY_LEN,Data_File),IN.TAG_COPY_LEN);}
	READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1); 
	if (First_Pass)
	{
		//Find the first tab once; Positive_Tail ends up one past it.
		//NOTE(review): the scan is unbounded if Tag_Copy holds no tab - confirm the format guarantees one.
		IN.Positive_Tail=IN.Tag_Copy;First_Pass=FALSE;
		while (*(IN.Positive_Tail++)!='\t'){Break++;};
	}
	IN.Tag_Copy[Break]=0;//terminate the first tab-separated field


//--------------------------------------------------------------------------------------------
	for(;;)
	{
		if('&' == Tag_Type) return FALSE;
		if('@' == Tag_Type)//start of new tag
		{
			//Terminate all four hit lists.
			Head_Hits_Pos[Head_Hit_Pos_Count].Start=0;
			Head_Hits_Neg[Head_Hit_Neg_Count].Start=0;
			Tail_Hits_Pos[Tail_Hit_Pos_Count].Start=0;
			Tail_Hits_Neg[Tail_Hit_Neg_Count].Start=0;
			if((Head_Hit_Pos_Count + Head_Hit_Neg_Count ==1) && (Tail_Hit_Pos_Count + Tail_Hit_Neg_Count==1)) //unique pair..
			{
				if (Head_Hit_Pos_Count ==1 && !Head_Hit_Neg_Count)// H + uniq
				{
					if (Tail_Hit_Pos_Count ==1 && !Tail_Hit_Neg_Count)//T + uniq
					{
						return PP;
					}
					else if (Tail_Hit_Pos_Count ==0 && Tail_Hit_Neg_Count==1)//T - uniq
					{
						return PM;
					}
				}
				else if (Head_Hit_Pos_Count ==0 && Head_Hit_Neg_Count ==1)// H - uniq
				{
					if (Tail_Hit_Pos_Count ==1 && !Tail_Hit_Neg_Count)//T + uniq
					{
						return MP;
					}
					else if (Tail_Hit_Pos_Count ==0 && Tail_Hit_Neg_Count==1)//T - uniq
					{
						return MM;
					}
				}
			}
			return REPEAT;
		}		
		else//Process hit....
		{
//--------------------------------------------------------------------------------------------
			//Record layout as read here: 3 header bytes, an optional block of
			//New_Record[2]*2 bytes, the Output_Record, then Record.Mismatches
			//plus sizeof(unsigned) bytes of mismatch data.
			READ_ASSERT(fread(New_Record,1,3,Data_File),3);
			if (New_Record[2]) {READ_ASSERT(fread(N,1,New_Record[2]*2,Data_File),New_Record[2]*2);}
			READ_ASSERT(fread(&Record,1,sizeof(Output_Record),Data_File),sizeof(Output_Record));
			READ_ASSERT(fread(&MismatchesGIS,1,Record.Mismatches+sizeof(unsigned),Data_File),Record.Mismatches+sizeof(unsigned));
			
			//With no gap, Start holds an SA index and is converted to a location
			//here; with a gap it is stored unchanged and End becomes Start+Gap.
			if (!Record.Gap)
			{
				if (Record.Skip) Record.Start = Conversion_Factor-revfmi->saValue[Record.Start/revfmi->saInterval]+Record.Skip-1;
				else Record.Start=Conversion_Factor-BWTSaValue(revfmi,Record.Start);
			}
			if(New_Record[0]==1)//head hit
			{
				if (New_Record[1] == '-')
				{
					Head_Hits_Neg[Head_Hit_Neg_Count].Start=Record.Start;
					Head_Hits_Neg[Head_Hit_Neg_Count].End=Record.Start+Record.Gap;
					Head_Hits_Neg[Head_Hit_Neg_Count].Mismatches=Record.Mismatches;
					Head_Hit_Neg_Count++;
				}
				else
				{
					Head_Hits_Pos[Head_Hit_Pos_Count].Start=Record.Start;
					Head_Hits_Pos[Head_Hit_Pos_Count].End=Record.Start+Record.Gap;
					Head_Hits_Pos[Head_Hit_Pos_Count].Mismatches=Record.Mismatches;
					Head_Hit_Pos_Count++;
				}
				//NOTE(review): this capacity guard exists only in DEBUG builds;
				//other builds do not limit the counts - confirm that is intended.
#ifdef DEBUG
				if(Head_Hit_Neg_Count >= MAX_HITS_TO_STORE) {Head_Hit_Neg_Count--;MAX_HIT_FAULT=TRUE;}else if( Head_Hit_Pos_Count >= MAX_HITS_TO_STORE) {Head_Hit_Pos_Count--;MAX_HIT_FAULT=TRUE;}
#endif
			}
			else//tail hit
			{
				if (New_Record[1] == '-')
				{
					Tail_Hits_Neg[Tail_Hit_Neg_Count].Start=Record.Start;
					Tail_Hits_Neg[Tail_Hit_Neg_Count].End=Record.Start+Record.Gap;
					Tail_Hits_Neg[Tail_Hit_Neg_Count].Mismatches=Record.Mismatches;
					Tail_Hit_Neg_Count++;
				}
				else
				{
					Tail_Hits_Pos[Tail_Hit_Pos_Count].Start=Record.Start;
					Tail_Hits_Pos[Tail_Hit_Pos_Count].End=Record.Start+Record.Gap;
					Tail_Hits_Pos[Tail_Hit_Pos_Count].Mismatches=Record.Mismatches;
					Tail_Hit_Pos_Count++;
				}
				//NOTE(review): DEBUG-only capacity guard, as for the head lists.
#ifdef DEBUG
				if(Tail_Hit_Neg_Count >= MAX_HITS_TO_STORE){Tail_Hit_Neg_Count--;MAX_HIT_FAULT=TRUE;} else if( Tail_Hit_Pos_Count >= MAX_HITS_TO_STORE) {Tail_Hit_Pos_Count--;MAX_HIT_FAULT=TRUE;}
#endif
			}
		}
		//Marker of the next record: another hit, '@' or '&'.
		READ_ASSERT(fread(&Tag_Type,1,1,Data_File),1);

	}
}
Ejemplo n.º 4
0
/* 
 * ===  FUNCTION  ======================================================================
 *         Name:  Search_Small_Gap
 *  Description:  Do the searching when at least one of the pairs have small SA range... 
 * 		  Store the result in Pairs, starting from Pairs[Pairs_Index]
 * 		  The hit list is terminated by writing Pairs[x].Head=0 after the
 * 		  last stored pair.
 *
 *   Parameters:  revfmi            index used to convert SA indices to locations
 *                R                 storage consulted by Load_Info() / Get_Location()
 *                Head, Tail        hit ranges of the two ends; a range with
 *                                  Start==End is treated as unique and its Start
 *                                  is used directly as a location
 *                d                 largest accepted distance from head to tail
 *                Pairs             output array
 *                Pairs_Index       in/out: next free slot in Pairs
 *                MAXCOUNT          enumeration stops once HITS reaches this value
 *                HITS              in/out: running count of stored pairs
 *                Entries           passed through to Load_Info()
 *                Conversion_Factor value SA values are subtracted from
 *
 *        Notes:  A pair is stored only when the head lies strictly before the tail.
 *                When Head is not unique, Tail is assumed to be unique; this is
 *                not checked.
 *                NOTE(review): the distance test is "tail < head+d" in the
 *                unique/unique and below-cutoff tail cases but "tail <= head+d"
 *                elsewhere - confirm which bound is intended.
 * =====================================================================================
 */
void Search_Small_Gap(BWT* revfmi,RQINDEX & R, SARange & Head,  SARange & Tail, int d,Pair* Pairs,int & Pairs_Index,unsigned MAXCOUNT,int & HITS,unsigned Entries,unsigned Conversion_Factor)
{
	unsigned H1,H2,T1,T2,L,H,M;
	Tag_Info Head_Info,Tail_Info;


	if(Head.Start==Head.End)//Head is unique...
	{

		H1=Head.Start;//used as a location without conversion
		if (Tail.Start==Tail.End)// both hits are unique...
		{
			T2=Tail.Start;//used as a location without conversion
			if (T2>H1 && T2 < H1+d)//modify for multiple hits...
			{
				Pairs[Pairs_Index].Head=H1;
				Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
				Pairs[Pairs_Index].Tail=T2;
				Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
				Pairs_Index++;HITS++;
#ifdef DEBUG
				if(H1>T2) 
				{
					printf("Search_Small_Gap(6):Enum error...\n");
					exit(0);
				}
#endif
			}
			Pairs[Pairs_Index].Head=0;
			return;
		}
		else //tail has multiples...
		{
			if (Tail.End-Tail.Start > SAGAP_CUTOFF)
			{
				//Large tail range: use the list described by Tail_Info.
				//NOTE(review): the binary search below assumes
				//Get_Location() is ascending in its offset - confirm.
				Load_Info(R,Tail_Info,Tail,Entries);
				T1=Tail_Info.First;T2=Tail_Info.Last;


				if(H1<T1 )//Possible case for T1>H1 
				{
					if(H1+d>T1)
					{
						//Head lies before every tail hit: walk the list from its start.
						M=0;
						while (H1<T1 && H1+d>=T1)//enumerate hits...
						{
							Pairs[Pairs_Index].Head=H1;
							Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
							Pairs[Pairs_Index].Tail=T1;
							Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
							Pairs_Index++;HITS++;
#ifdef DEBUG
						if(H1>T1) 
						{
							printf("Search_Small_Gap(5):Enum error...\n");
							exit(0);
						}
#endif
							if (HITS>=MAXCOUNT) 
							{
								Pairs[Pairs_Index].Head=0;
								return;
							}
							M++;
							//Check the bound before fetching, so the list is never read past its last entry.
							if(M>=Tail_Info.Gap) break;
							T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor);
						}
					}
				}
				else if(T2>H1) //H1 inside tail gaps..
				{

					//Binary search for the first tail hit located after H1.
					L=0;
					H=Tail_Info.Gap;
					while (L < H)
					{
						M=(L+H)/2;
						if (Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor) > H1)
						{
							H=M;
						}
						else
						{
							L=M+1;
						}
					}
					M=H;//the search always ends with L==H; find tail position closest to unique head...

					T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor);
					while (H1<T1 && H1+d>=T1)//enumerate hits...
					{
						Pairs[Pairs_Index].Head=H1;
						Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
						Pairs[Pairs_Index].Tail=T1;
						Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
						Pairs_Index++;HITS++;
#ifdef DEBUG
						if(H1>T1) 
						{
							printf("Search_Small_Gap(4):Enum error...\n");
							exit(0);
						}
#endif
						if (HITS>=MAXCOUNT) 
						{
							Pairs[Pairs_Index].Head=0;
							return;
						}
						M++;
						//Check the bound before fetching, so the list is never read past its last entry.
						if(M>=Tail_Info.Gap) break;
						T1=Get_Location(revfmi,R,Tail_Info,M,Conversion_Factor);
					}
				}

				Pairs[Pairs_Index].Head=0;
				return;
			}
			else//Unique head and tail with gap below cutoff...
			{
				//Small tail range: convert and test every SA index in it.
				for (unsigned i=Tail.Start;i<=Tail.End;i++)
				{
					unsigned Hit=(Conversion_Factor-BWTSaValue(revfmi,i));
					if(Hit > H1 && Hit < H1+d)
					{
						Pairs[Pairs_Index].Head=H1;
						Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
						Pairs[Pairs_Index].Tail=Hit;
						Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
						Pairs_Index++;HITS++;
#ifdef DEBUG
						if(H1>Hit) 
						{
							printf("Search_Small_Gap(3):Enum error...\n");
							exit(0);
						}
#endif

						if(HITS >=MAXCOUNT) break;
					}
				}
				Pairs[Pairs_Index].Head=0;
				return;
			}
		}
	}
	else //Head has multiples; Tail is taken to be unique (not checked)...
	{
		T1=Tail.Start;//used as a location without conversion
		if(Head.End-Head.Start>SAGAP_CUTOFF)//Unique tail, but with multiple possible heads...
		{
			Load_Info(R,Head_Info,Head,Entries);
			H1=Head_Info.First;H2=Head_Info.Last;
			if(T1 > H1) //Head should not be after T1...
			{
				if(H2+d >T1)//Tail not too far away from heads...
				{
					//Binary search for the first head hit at or after T1-d
					//(clamped at 0), i.e. the earliest head within reach.
					L=0;
					H=Head_Info.Gap-1;
					unsigned T1_Temp=T1;
					if (T1>d) T1-=d; else T1=0;
					while (L < H)
					{
						M=(L+H)/2;
						if (Get_Location(revfmi,R,Head_Info,M,Conversion_Factor) < T1)
						{
							L=M+1;
						}
						else
						{
							H=M;
						}
					}
					M=L;//the search always ends with L==H
					H1=Get_Location(revfmi,R,Head_Info,M,Conversion_Factor);

					T1=T1_Temp;//restore the real tail location
					while (H1<T1 && H1+d>=T1)//enumerate hits...
					{
						Pairs[Pairs_Index].Head=H1;
						Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
						Pairs[Pairs_Index].Tail=T1;
						Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
						Pairs_Index++;HITS++;
#ifdef DEBUG
						if(H1>T1) 
						{
							printf("Search_Small_Gap(2):Enum error...\n");
							exit(0);
						}
#endif
						if (HITS>=MAXCOUNT) 
						{
							break;	
						}
						M++;
						//Check the bound before fetching, so the list is never read past its last entry.
						if(M>=Head_Info.Gap) break;
						H1=Get_Location(revfmi,R,Head_Info,M,Conversion_Factor);
					}

				}
			}
			Pairs[Pairs_Index].Head=0;
			return;
		}
		else//Unique tail and head with gap below cutoff...
		{
			for (unsigned i=Head.Start;i<=Head.End;i++)//try all hits...
			{
				unsigned Hit=(Conversion_Factor-BWTSaValue(revfmi,i));
				if(Hit < T1 && T1 <= Hit+d)
				{
					Pairs[Pairs_Index].Head=Hit;
					Pairs[Pairs_Index].MismatchesH=Head.Mismatches;
					Pairs[Pairs_Index].Tail=T1;
					Pairs[Pairs_Index].MismatchesT=Tail.Mismatches;
					Pairs_Index++;HITS++;
#ifdef DEBUG
				if(Hit>T1) 
				{
					printf("Search_Small_Gap(1):Enum error...\n");
					exit(0);
				}
#endif
					if(HITS >=MAXCOUNT) break;
				}
			}
			Pairs[Pairs_Index].Head=0;
			return;
		}
	}
}