예제 #1
0
파일: filter.c 프로젝트: Pency/BSPT
/*
 * Filter a single-end FASTQ file.
 *
 * Reads records from opts->r1, runs check_read() and filter_all() on each
 * one, and writes every record for which filter_all() returns 1 to the file
 * "<opts->output>.flt". A short summary is printed to stdout afterwards.
 *
 * opts->output is only read: the output path is assembled in a local buffer
 * so that the caller's options are not modified and repeated calls do not
 * keep appending ".flt".
 *
 * Returns 0 on success, -1 when the output path does not fit or a file
 * could not be opened.
 */
int filter_se_fastq_bz2(FLT_OPTS *opts){
	char out_path[4096];
	int index = 1;		/* 1-based record number handed to read_fastq() */
	int total = 0;		/* records actually read */
	int stat_left = 0;	/* records that survived filtering */
	int left = 0;
	int n;

	n = snprintf(out_path, sizeof out_path, "%s.flt", opts->output);
	if(n < 0 || (size_t)n >= sizeof out_path){
		fprintf(stderr, "output file name is too long: %s.flt\n", opts->output);
		return -1;
	}

	gzFile fp = gzopen_report(opts->r1, "r");
	if(!fp)
		return -1;

	FILE *fo = fopen_report(out_path, "w+");
	if(!fo){
		gzclose(fp);
		return -1;
	}

	SEQ_QUAL item = init_read();

	while(read_fastq(fp, &item, index++) > 0){
		/* count here: index is also bumped by the final, failing read */
		total++;
		check_read(&item, 2);
		left = filter_all(&item, opts);

		if(left == 1){
			output_fastq(fo, &item);
			stat_left++;
		}
	}

	printf("Totally %d reads were processed\n", total);
	/* "%%" prints a literal percent sign; guard the ratio for empty input */
	printf("  file [ %s ]: %d reads were left (%.2f%%)\n", opts->r1, stat_left,
	       total > 0 ? (float) stat_left*100/total : 0.0f);

	free_read(&item);
	gzclose(fp);	/* fp is a gzFile, so it must not go through fclose() */
	fclose(fo);

	return 0;
}	
예제 #2
0
파일: file_utils.c 프로젝트: Pency/BSPT
/*
 * Guess the type of a sequence file.
 *
 * A file whose first byte is '>' is reported as FILE_FASTA. Otherwise the
 * file is rewound and parsed with read_fastq(): if the first record cannot
 * be read the result is FILE_UNKN, else FILE_FASTQ is set and the quality
 * strings of the first record plus up to `sample` further records are
 * scanned (the last character of each quality string is skipped, as in the
 * original loop bound). The largest and smallest quality characters decide
 * between FILE_PHRED64 (max >= 75) and FILE_PHRED33.
 *
 * The decision is taken after sampling stops, whether that is because the
 * sample budget ran out or because the file ended, so short files are
 * classified too.
 *
 * Returns a bitmask of FILE_* flags.
 */
int detect_datatype(char *file){

	int dataType = 0;
	int max = 0;
	int min = 999;
	int sample = 100;	/* further records examined after the first one */
	int read_no = 0;	/* record index handed to read_fastq() */
	int budget_spent = 0;

	SEQ_QUAL item = init_read();
	gzFile zfp = gzopen_report(file,"r");

	if(!zfp){
		free_read(&item);
		return FILE_UNKN;
	}

	if(gzgetc(zfp) == '>'){
		dataType |= FILE_FASTA;
	}else{
		gzseek(zfp, 0L, SEEK_SET);
		if(read_fastq(zfp,&item,read_no) >= 0){
			dataType |= FILE_FASTQ;
			for(;;){
				/* hoisted: strlen() used to run on every loop test */
				size_t len = strlen(item.qual);
				size_t k;

				for(k = 0; k+1 < len; k++){
					int q = item.qual[k];
					if(q < min) min = q;
					if(q > max) max = q;
				}
				if((sample--) == 0){
					budget_spent = 1;
					break;
				}
				/* stop on end of input or on a read error */
				if(read_fastq(zfp,&item,++read_no) <= 0)
					break;
			}

			/* min <= max holds exactly when at least one quality value was seen */
			if(budget_spent || min <= max){
				if(max >= 75)
					dataType|=FILE_PHRED64;
				else{
					dataType|=FILE_PHRED33;
					if(min > 58)
						warning_msg("Can not identified quality score type in 100 read samples, assume phred+33\n");
				}
			}
		}else
			dataType |= FILE_UNKN;
	}

	gzclose(zfp);
	free_read(&item);
	return dataType;
}
예제 #3
0
// multi sample variant caller: CRISP, PICALL or low coverage method
//
// Streams reads from `options->bamfiles` BAM inputs. On every iteration the
// file index stored at the top of `bheap` is taken, its current read is either
// appended to the shared queue RQ (when it passes BAM_FILTER_MASK) or freed,
// and the next record of that file is fetched. callvariants() is invoked on the
// window [last, position - offset_readlength) once the current read is more
// than `offset_readlength` bases past `last`, and on whatever is left in the
// queue when the chromosome changes or all files are exhausted.
//
// Parameters:
//   options - run options; bamfiles, bamfilelist, regions and the target* fields are read here
//   reflist - reference list; names, lengths, sequences and cinterval are read/updated here
//   fp      - passed to read_chromosome() whenever a read on a new chromosome is seen
//
// Returns 1 on normal completion and also 1 when reads are found out of order
// (NOTE(review): the two outcomes cannot be told apart by the return value).
int multisampleVC(struct OPTIONS* options,REFLIST* reflist,FILE* fp)
{
	// reads flagged as duplicates are filtered out unless USE_DUPLICATES == 1
	if (USE_DUPLICATES ==1) BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL); else BAM_FILTER_MASK = (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP);

	int bamfiles = options->bamfiles;

	int last=0; // last is the current position s.t. all reads have starting position > last
	int i=0; int h=0;
	unsigned long reads=0; int j=0; int prev_tid = -1;   int rf=0;
	int finishedfiles =0; 
	struct alignedread* pread = NULL;
	// NOTE(review): the allocation below holds exactly `bamfiles` entries; the "one extra" mentioned in the comment is not visible here
	struct BAMFILE_data* bamfiles_data = calloc(bamfiles,sizeof(struct BAMFILE_data)); // added one extra to list to store indels for all samples combined

	// RQ is the queue of reads shared by all files (linked through ->next)
	READQUEUE* RQ = (READQUEUE*)malloc(sizeof(READQUEUE));  RQ->first = NULL; RQ->last = NULL; RQ->reads = 0; 
	int* fcigarlist = (int*)malloc(sizeof(int)*4096);

	// data structure for holding potential variants and read counts, etc 
	struct VARIANT variant;  variant.ploidy = calloc(options->bamfiles,sizeof(int)); 
	init_poolsizes(&variant,options,PICALL); 
	init_variant(&variant,options->bamfiles,options->bamfiles);
	variant.options = options;  // pointer to options

	// the heap array stores BAM file indices; it starts out as 0..bamfiles-1
	BAMHEAP bheap; bheap.harray = (int*)malloc(sizeof(int)*bamfiles); bheap.length = bamfiles;
	for (i=0;i<bamfiles;i++) { bheap.harray[i] = i; bamfiles_data[i].finished= 0;}
	
	reflist->cinterval = -1; // first interval to the right of current base

        init_bamfiles(bamfiles_data,options->bamfilelist,bamfiles,options->regions,&options->targettid,&options->targetstart,&options->targetend);

	// error when reading indexed bam files probably due to lack of reads in some files resulting in heap error, fixed oct 17 2012
	// keep only the indices of files that are not already marked finished and shrink the heap accordingly
        j=0; for (i=0;i<bamfiles;i++) 
	{
		finishedfiles += bamfiles_data[i].finished; 
		if (bamfiles_data[i].finished ==0) bheap.harray[j++] = i; else bheap.length--; 	
	}
	buildminheap(&bheap,bamfiles_data); // initial minheap call
	//fprintf(stderr,"finishedfiles %d \n",finishedfiles);
	
	if (INDEL_REALIGNMENT >=1) allocate_mem_heap(bamfiles_data,bamfiles,100);

	
	// HAPLOTYPES becomes the sum of the per-sample ploidies, MIN_COVERAGE_FLANKING twice that sum
	HAPLOTYPES =0,MIN_COVERAGE_FLANKING =0;
	for (i=0;i<variant.samples;i++) 
	{
		MIN_COVERAGE_FLANKING += 2*variant.ploidy[i];  // enforced for regions outside the bedfile target
		HAPLOTYPES += variant.ploidy[i];
	}
	//int min_coverage_target = 1*variant->ploidy*variant->samples;  // enforced for regions outside the bedfile target
	int offset_readlength = 150;  // call variants in window (last,current_read_position-offset_readlength) to allow for indel analysis, set to 0 for original behavior of program
	// the value of offset should not affect the correctness or speed of the code
	int current_position =0;
	
	// main loop: runs until every BAM file has been marked finished
	while (finishedfiles < bamfiles)
	{
		i = bheap.harray[0]; // take the top read off the heap
		if ( !(bamfiles_data[i].read->flag & BAM_FILTER_MASK))
		{
			if (bamfiles_data[i].read->tid != prev_tid) // read's chromosome is different from previousread 
			{
				if (prev_tid >=0)  // finish the processing of previous chromosome and cleanup
				{
					if (RQ->reads >0) 
					{
						fprintf(stderr,"processing %d reads left in queue for chrom %s...",RQ->reads,reflist->names[prev_tid]);
						callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant);
						empty_queue(RQ,bamfiles_data); //clean thequeue
					}
					// NOTE(review): only clean_indel_lists() is guarded by the if; `current_position = 0` always executes
					if (INDEL_REALIGNMENT >=1) clean_indel_lists(bamfiles_data,bamfiles,-1); current_position = 0; 
					// NOTE(review): the for loop body is only the `.last=NULL` assignment; `last =0` runs once after the loop
					for(j=0;j<bamfiles;j++) bamfiles_data[j].last=NULL; last =0; 
					free(reflist->sequences[prev_tid]); 
					fprintf(stderr,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]);
					fprintf(stdout,".....finished processing reads for chrom %s\n",reflist->names[prev_tid]);
					reflist->cinterval = -1; // reset to -1 
				}
				read_chromosome(reflist,bamfiles_data[i].read->tid,fp); 
				prev_tid =bamfiles_data[i].read->tid;
			}

			// a read starting before `last` means the input is not coordinate sorted
			if (bamfiles_data[i].read->position <last)
			{
				fprintf(stderr,"reads out of order i:%d h:%d pos: %d %d\n",i,h,bamfiles_data[i].read->position,last);
				fprintf(stderr,"the program will now exit, please sort the bamfiles\n");
				// NOTE(review): returns without freeing bamfiles_data, RQ, fcigarlist or bheap.harray
				return 1;
			}

			if (INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->position > current_position+offset_readlength) 
			{
				// need to clean up indel lists when we encounter a new chromosome... 
				print_indel_lists(bamfiles_data,bamfiles,current_position+offset_readlength); 
				clean_indel_lists(bamfiles_data,bamfiles,current_position);
				current_position = bamfiles_data[i].read->position;
			}
			// realign reads before calling variants, each read is realigned only once

			// small bug here, only call variants when last is less than current read position
			// bug fixed here, update last only when 'callvariants' is invoked, ???
			if (RQ->reads > 0 && bamfiles_data[i].read->position > last+offset_readlength) 
			{
				callvariants(reflist,bamfiles_data[i].read->tid,last,bamfiles_data[i].read->position-offset_readlength,RQ,bamfiles_data,options,&variant);  
			}
			// NOTE(review): `last` is updated on every passing read, not only when callvariants() ran (compare the comment above)
			last = bamfiles_data[i].read->position-offset_readlength; if (last < 0) last =0;

			bamfiles_data[i].read->cflag = 0; 
			// this function should only be called on reads inside/close_to targeted regions..
			parse_cigar(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,fcigarlist); 

			// reads with gaps and mapping quality >= 20 are also handed to extract_indel_reads()
			if (INDEL_REALIGNMENT >=1 && bamfiles_data[i].read->gaps > 0 && bamfiles_data[i].read->mquality >= 20) extract_indel_reads(bamfiles_data[i].read,reflist,bamfiles_data[i].read->tid,i,bamfiles_data[i].ilist);
			
			//fprintf(stdout,"read s:%d IS:%d %s %d \n",i,bamfiles_data[i].read->IS,bamfiles_data[i].read->readid,bamfiles_data[i].read->position);
			// append the read to the tail of the shared queue (->next chain)
			if (RQ->last == NULL)
			{
				RQ->last = bamfiles_data[i].read; RQ->first = RQ->last; (RQ->last)->next = NULL;
				RQ->reads++;
			}
			else
			{
				(RQ->last)->next = bamfiles_data[i].read; RQ->last = bamfiles_data[i].read; 
				(RQ->last)->next = NULL;
				RQ->reads++;
			}
			// and to the tail of the per-file list of file i (->nextread chain)
			if (bamfiles_data[i].last ==NULL) bamfiles_data[i].first = RQ->last;
			else bamfiles_data[i].last->nextread= RQ->last;
			bamfiles_data[i].last = RQ->last; (RQ->last)->nextread =NULL;
			// read that passes filters from 'i'th bam file is inserted in queue, should also add it to OPE queue 
			//if (bamfiles_data[i].read->position < bamfiles_data[i].read->mateposition && bamfiles_data[i].read->lastpos > bamfiles_data[i].read->mateposition) 
			//fprintf(stdout,"B %d %s %d %d %d \n",i,bamfiles_data[i].read->readid,bamfiles_data[i].read->position,bamfiles_data[i].read->mateposition,bamfiles_data[i].read->IS);
		}
		else free_read(bamfiles_data[i].read);
		//fprintf(stdout,"read from %d %d %s\n",i,bamfiles_data[i].read->position,bamfiles_data[i].read->readid);

		// fetch the next record of file i: plain sequential read when no regions are set, iterator read otherwise
		if (options->regions ==NULL) rf =samread(bamfiles_data[i].fp,bamfiles_data[i].b);
		else rf  = bam_iter_read(bamfiles_data[i].fp->x.bam,bamfiles_data[i].iter,bamfiles_data[i].b);
		if (rf >=0)
		{
			bamfiles_data[i].read = get_read_bamfile(bamfiles_data[i].b,bamfiles_data[i].fp,pread); 
			//if (options->samples ==0) bamfiles_data[i].read->sampleid = i;
			//else bamfiles_data[i].read->sampleid = options->BAM_TO_SAMPLE[i];  
			// bug here june 30 2013 commented out .... in 12 T2D pools 
			bamfiles_data[i].read->sampleid = i;
			// the heap is only re-ordered when the new read passes the filter mask
			if (!(bamfiles_data[i].read->flag & BAM_FILTER_MASK)) minHeapify(&bheap,0,bamfiles_data);
		}
		else // no more reads in file 'i' 
		{ 
			bamfiles_data[i].finished = 1; bamfiles_data[i].read= NULL; 
			bam_destroy1(bamfiles_data[i].b);
			h++; finishedfiles++; 
			//fprintf(stderr,"finished reading bam file %s \n",options->bamfilelist[i]); //return 1;
			// drop file i from the heap: move the last entry to the root and shrink the heap by one
			bheap.harray[0] = bheap.harray[bheap.length-1]; bheap.length--;
			if (bheap.length > 0) minHeapify(&bheap,0,bamfiles_data);
			// call minheapify like function to push sample i off the heap, reduce heap size
		} 
		// progress line every 1,000,000 iterations, printed only while the queue is non-empty
		if ((++reads)%1000000 ==0 && RQ->reads >0) fprintf(stderr,".....processed %ld reads QSIZE:%d %s:%d:%d variants called %d\n",reads,RQ->reads,RQ->first->chrom,RQ->first->position,RQ->first->lastpos,VARIANTS_CALLED);
	}

	if (prev_tid >=0)  // finish the processing of last chromosome 
	{
		if (RQ->reads >0) 
		{
			fprintf(stderr,"processing %d reads left in queue for chrom %s.....",RQ->reads,reflist->names[prev_tid]);
			if (reflist->lengths[prev_tid] > last) callvariants(reflist,prev_tid,last,reflist->lengths[prev_tid],RQ,bamfiles_data,options,&variant);
			empty_queue(RQ,bamfiles_data); //clean thequeue
		}
		else fprintf(stderr,"queue for chrom %s is empty ",reflist->names[prev_tid]);
		free(reflist->sequences[prev_tid]); 
		fprintf(stderr,"finished processing reads for chrom %s \n\n",reflist->names[prev_tid]);
		if (INDEL_REALIGNMENT >=1) 
		{
			print_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]); 
			clean_indel_lists(bamfiles_data,bamfiles,reflist->lengths[prev_tid]);
		}
	}
	fprintf(stderr,"CRISP has finished processing bam files: total reads processed %ld total variants called %d \n\n",reads,VARIANTS_CALLED);

	//for (i=0;i<bamfiles;i++) bam_destroy1(bamfiles_data[i].b);
	// NOTE(review): RQ and variant.ploidy are allocated above but no free() for them appears in this function
	free(bamfiles_data); free(bheap.harray); free(fcigarlist);
	//empty_queue(RQ); //clean thequeue
	//fprintf(stdout,"FILE %d %s %d %s %d %d %d mapped %d \n",i,read->readid,read->flag,read->chrom,read->position,read->mquality,read->IS,(read->flag &4));
	return 1;
}
예제 #4
0
파일: filter.c 프로젝트: Pency/BSPT
/*
 * Open "<dir>/<base><suffix>" for writing through fopen_report().
 * Returns NULL when the path does not fit the local buffer or the open fails.
 */
static FILE *pe_open_output(const char *dir, const char *base, const char *suffix){
	char path[1024];
	int n = snprintf(path, sizeof path, "%s/%s%s", dir, base, suffix);

	if(n < 0 || (size_t)n >= sizeof path){
		fprintf(stderr, "output file name is too long: %s/%s%s\n", dir, base, suffix);
		return NULL;
	}
	return fopen_report(path, "w+");
}

/*
 * Filter a pair of bzip2-compressed FASTQ files in lock step.
 *
 * One record is read from opts->r1 and one from opts->r2 per iteration and
 * both are passed to filter_all(). When both mates survive they are written
 * to "<output>/<name1>.flt" and "<output>/<name2>.flt"; when only one mate
 * survives it is written to "<output>/<name2>.flt.s". <name1>/<name2> are
 * whatever file_name() derives from opts->r1 / opts->r2.
 *
 * Counters: stat_paired counts surviving pairs, stat_single1/2 count reads
 * that survived without their mate, so "reads left" per file is
 * stat_paired + stat_singleN and no read is counted twice.
 *
 * Returns 0 on success, -1 when an input or output file could not be
 * opened; every resource opened so far is released on all paths.
 */
int filter_pe_fastq_bz2(FLT_OPTS *opts){
	int rc = -1;
	int left1=0,left2=0;
	int stat_single1 = 0;
	int stat_single2 = 0;
	int stat_paired = 0;
	int index=1;
	int pairs;
	char outfile[128];
	BZFILE *fp1 = NULL;
	BZFILE *fp2 = NULL;
	FILE *fo1 = NULL;
	FILE *fo2 = NULL;
	FILE *fos = NULL;
	SEQ_QUAL item1=init_read();
	SEQ_QUAL item2=init_read();

	fp1 = bzopen_report(opts->r1,"r");
	fp2 = bzopen_report(opts->r2,"r");
	if(!fp1 || !fp2)	goto cleanup;

	file_name(outfile,opts->r1);
	fo1 = pe_open_output(opts->output, outfile, ".flt");
	if(!fo1)	goto cleanup;

	file_name(outfile,opts->r2);
	fo2 = pe_open_output(opts->output, outfile, ".flt");
	if(!fo2)	goto cleanup;

	/* orphaned mates of both files share one file named after r2 */
	fos = pe_open_output(opts->output, outfile, ".flt.s");
	if(!fos)	goto cleanup;

	while(read_fastq_bz2(fp1,&item1,index) >= 0 && read_fastq_bz2(fp2,&item2,index) >= 0)
	{
		left1=filter_all(&item1, opts);
		left2=filter_all(&item2, opts);

		if(left1 == 1 && left2 == 1){
			output_fastq(fo1, &item1);
			output_fastq(fo2, &item2);
			/* pairs are tracked only here; the single counters stay untouched */
			stat_paired++;
		}else{
			if(left1 == 1){
				output_fastq(fos, &item1);
				stat_single1++;
			}
			if(left2 == 1){
				output_fastq(fos, &item2);
				stat_single2++;
			}
		}
		index++;
	}

	pairs = index-1;
	printf("Totally %d reads were processed\n",pairs*2);
	/* "%%" prints a literal percent sign; the ratios are guarded for empty input */
	printf("  file [ %s ]: %d reads were left (%.2f%%)\n",opts->r1,stat_paired+stat_single1,
	       pairs > 0 ? (float) (stat_paired+stat_single1)*100/pairs : 0.0f);
	printf("  file [ %s ]: %d reads were left (%.2f%%)\n",opts->r2,stat_paired+stat_single2,
	       pairs > 0 ? (float) (stat_paired+stat_single2)*100/pairs : 0.0f);
	printf("After filtering %d reads are paired in each file (%.2f%%)\n",stat_paired,
	       pairs > 0 ? (float) stat_paired*100/pairs : 0.0f);
	printf("  file [ %s ]: %d reads were left as single end\n",opts->r1,stat_single1);
	printf("  file [ %s ]: %d reads were left as single end\n",opts->r2,stat_single2);
	rc = 0;

cleanup:
	free_read(&item1);
	free_read(&item2);
	if(fp1)	BZ2_bzclose(fp1);
	if(fp2)	BZ2_bzclose(fp2);
	/* fclose(NULL) is undefined, so each stream is checked first */
	if(fo1)	fclose(fo1);
	if(fo2)	fclose(fo2);
	if(fos)	fclose(fos);

	return rc;
}