示例#1
0
/*
 * Program entry point.
 *
 * Parses the command line into an OPTIONS structure, reads the reference FASTA
 * header and the optional BED file, opens the VCF / indel output files and then
 * calls multisampleVC() to do the actual work.
 *
 * Returns:
 *    1  after a completed run, and also when optparser() returns 0
 *   -1  when the FASTA header cannot be read, memory cannot be allocated, or a
 *       required file cannot be opened
 *
 * NOTE(review): a completed run returns 1, which a shell treats as failure. The
 * value is kept as-is because scripts may already depend on it -- confirm.
 */
int main(int argc, char* argv[])
{
	int vflag = 0;	// 1 when options->vfile is a regular file opened here (and must be closed here)

	// seed the drand48 family of generators from the wall clock
	unsigned int iseed = (unsigned int)time(NULL);
	srand48(iseed);

	struct OPTIONS* options = (struct OPTIONS*)malloc(sizeof(struct OPTIONS));
	if (options == NULL)
	{
		fprintf(stderr,"unable to allocate memory for program options\n");
		return -1;
	}
	// defaults that are set before optparser() fills in the command line values
	options->POOLSIZE = 2;
	options->targettid = -1; options->targetstart = 0; options->targetend = 0;

	int flag = optparser(argc,argv,options);
	if (flag == 0) return 1;

	REFLIST reflist;
	if (read_fastaheader(options->fastafile,&reflist) == -1) return -1;
	// NOTE(review): the capacity of reflist.fastafile is not visible here; assumes it can hold the path
	strcpy(reflist.fastafile,options->fastafile);

	// the open FASTA stream is handed to multisampleVC() and closed at the end of main
	FILE* fp = fopen(options->fastafile,"r");
	if (fp == NULL)
	{
		fprintf(stderr,"unable to open fasta file %s for reading\n",options->fastafile);
		return -1;
	}

	// 'targeted' records whether a BED file could be read
	if (read_bedfile(options->bedfile,&reflist) != -1) targeted = 1;
	else targeted = 0;

	// VCF output goes to stdout unless a file name was given
	if (strcmp(options->vcffile,"None") == 0) options->vfile = stdout;
	else
	{
		options->vfile = fopen(options->vcffile,"w");
		if (options->vfile == NULL)
		{
			fprintf(stderr,"unable to open VCF file %s for writing\n",options->vcffile);
			fclose(fp);
			return -1;
		}
	}

	// open the file for candidate indels; a NULL pointer means that no indel file is in use
	if (strcmp(options->indelfile,"None") != 0)
	{
		options->fp_indelfile = fopen(options->indelfile,"w");
		if (options->fp_indelfile == NULL) fprintf(stderr,"unable to open indel file %s for writing, continuing without it\n",options->indelfile);
	}
	else options->fp_indelfile = NULL;

	// the header is written here only when the VCF output is a regular file (not stdout)
	if (strcmp(options->vcffile,"None") != 0)
	{
		vflag = 1; print_crispheader(options);
	}

	if (options->bamfiles >= 2) fprintf(stderr,"processing %d bamfiles: %s ..... %s \n\n",options->bamfiles,options->bamfilelist[0],options->bamfilelist[options->bamfiles-1]);
	multisampleVC(options,&reflist,fp);

	if (vflag == 1) fclose(options->vfile);	// never close stdout
	fclose(fp); // close pointer to fasta reference file
	return 1;

}
示例#2
0
/*
 * Allocates a new REFLIST, fills it from the header information of `fastafile`
 * via read_fastaheader() and prints a short summary of the contigs to stderr.
 *
 * fastafile : path that is passed on to read_fastaheader()
 * reflist   : ignored on input; the parameter is overwritten with the newly
 *             allocated object, so callers must use the return value
 *
 * Returns the new REFLIST, or NULL when memory cannot be allocated or
 * read_fastaheader() does not return a positive value. The per-contig
 * sequence pointers are all NULL on return: no sequence memory is allocated
 * here.
 */
REFLIST* init_reflist(char* fastafile, REFLIST* reflist) {
    int i = 0;

    reflist = (REFLIST*) malloc(sizeof (REFLIST));
    if (reflist == NULL) {
        fprintf(stderr, "unable to allocate memory for reference list\n");
        return NULL;
    }
    reflist->ns = 0;
    reflist->names = NULL;
    reflist->lengths = NULL;
    reflist->sequences = NULL;
    reflist->current = -1;

    if (read_fastaheader(fastafile, reflist) <= 0) {
        // the caller only ever sees NULL in this case, so the object must be released here
        free(reflist);
        return NULL;
    }

    // calloc zero-fills the array, so every sequence pointer starts out as NULL
    reflist->sequences = calloc(reflist->ns, sizeof (char*));
    if (reflist->sequences == NULL && reflist->ns > 0) {
        fprintf(stderr, "unable to allocate memory for %d reference sequences\n", reflist->ns);
        free(reflist);
        return NULL;
    }

    // print the first four contigs and, if there are more, the last one
    for (i = 0; i < reflist->ns && i < 4; i++) {
        fprintf(stderr, "chrom %s length %d \n", reflist->names[i], reflist->lengths[i]);
    }
    if (reflist->ns > 4) fprintf(stderr, ".....\n.....\nchrom %s length %d\n\n", reflist->names[reflist->ns - 1], reflist->lengths[reflist->ns - 1]);

    return reflist;
}
示例#3
0
int main (int argc, char** argv)
{
	char samfile[1024]; char bamfile[1024]; char variantfile[1024]; char fastafile[1024]; char maskfile[1024];
	strcpy(samfile,"None"); strcpy(bamfile,"None"); strcpy(variantfile,"None"); strcpy(fastafile,"None"); strcpy(maskfile,"None");
	GROUPNAME = NULL;
	int readsorted = 0;
	char* sampleid = (char*)malloc(1024); sampleid[0] = '-'; sampleid[1] = '\0';
	int samplecol=10; // default if there is a single sample in the VCF file
	int i=0,variants=0,hetvariants=0;
	char** bamfilelist = NULL; int bamfiles =0; 

	logfile = NULL; fragment_file = stdout; // write fragments to this file if it is present
	for (i=1;i<argc;i+=2)
	{
		if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0)        bamfiles++; 
		else if (strcmp(argv[i],"--variants") ==0)        strcpy(variantfile,argv[i+1]);
		else if (strcmp(argv[i],"--reffile") ==0 || strcmp(argv[i],"--ref") ==0)        strcpy(fastafile,argv[i+1]);
		else if (strcmp(argv[i],"--mask") ==0 || strcmp(argv[i],"--mappability") ==0)        strcpy(maskfile,argv[i+1]);
		else if (strcmp(argv[i],"--VCF") ==0 || strcmp(argv[i],"--vcf") ==0)    {     strcpy(variantfile,argv[i+1]); VCFformat =1; }
		else if (strcmp(argv[i],"--sorted") ==0)       readsorted = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--mbq") ==0)       MINQ = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--mmq") ==0)       MIN_MQ = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--maxIS") ==0)       MAX_IS = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--minIS") ==0)       MIN_IS = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--PEonly") ==0)       PEONLY = 1;  // discard single end mapped reads 
		else if (strcmp(argv[i],"--indels") ==0)       PARSEINDELS = atoi(argv[i+1]);  // allow indels in hairs
		else if (strcmp(argv[i],"--pflag") ==0)      IFLAG  = atoi(argv[i+1]);  // allow indels in hairs
		else if (strcmp(argv[i],"--qvoffset") ==0)       QVoffset = atoi(argv[i+1]);
		else if (strcmp(argv[i],"--out") == 0 || strcmp(argv[i],"-o") ==0) fragment_file = fopen(argv[i+1],"w");
		else if (strcmp(argv[i],"--logfile")==0 || strcmp(argv[i],"--log") ==0) logfile = fopen(argv[i+1],"w");  
		else if (strcmp(argv[i],"--singlereads")==0) SINGLEREADS = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--maxfragments")==0) MAXFRAG = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--noquality")==0) MISSING_QV = atoi(argv[i+1]);  
		else if (strcmp(argv[i],"--triallelic")==0) TRI_ALLELIC = atoi(argv[i+1]);  
		//else if (strcmp(argv[i],"--fosmids") == 0 || strcmp(argv[i],"--fosmid") ==0) FOSMIDS = 1;
		//else if (strcmp(argv[i],"--prior") == 0) PRIOR = atoi(argv[i+1]); 
		//else if (strcmp(argv[i],"--comparephase") == 0 || strcmp(argv[i],"--compare") ==0) COMPARE_PHASE = atoi(argv[i+1]); 
		else if (strcmp(argv[i],"--groupname") == 0) 
		{
			GROUPNAME = (char*)malloc(1024); strcpy(GROUPNAME,argv[i+1]); 
		}
	}
	if (bamfiles > 0 && strcmp(variantfile,"None") !=0)
	{
		bamfilelist = (char**)malloc(sizeof(char*)*bamfiles); 
		for (i=0;i<bamfiles;i++) bamfilelist[i] = (char*)malloc(1024);
		bamfiles=0;
		for (i=1;i<argc;i+=2)
		{
			if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0)     strcpy(bamfilelist[bamfiles++],argv[i+1]);
		}
		fprintf(stderr,"\n extracting haplotype informative reads from bamfiles %s minQV %d minMQ %d maxIS %d \n\n",bamfilelist[0],MINQ,MIN_MQ,MAX_IS);
	}
	else
	{
		print_options(); return -1;
	}

	HASHTABLE ht; ht.htsize = 7919;  init_hashtable(&ht);
	VARIANT* varlist;
	int chromosomes=0;

	if (VCFformat ==1)
	{
		variants = count_variants(variantfile,sampleid,&samplecol); 
		if (variants < 0) return -1; 
		varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants);
		chromosomes = read_variantfile(variantfile,varlist,&ht,&hetvariants,samplecol); 
	}
	else
	{
		variants = count_variants_oldformat(variantfile);
		if (variants < 0) return -1; 
		varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants);
		chromosomes = read_variantfile_oldformat(variantfile,varlist,&ht,variants);
	}
	// variants is set to hetvariants only, but this is not correct since 
	VARIANTS = variants;  
	// there are two options, we include all variants in the chromvars datastructure but only use heterozygous variants for outputting HAIRS 
	// variant-id should correspond to line-number in VCF file since that will be used for printing out variants in Hapcut 

	//	fprintf(stderr,"read %d variants from file %s chromosomes %d\n",snps,argv[1],chromosomes);
	CHROMVARS* chromvars  = (CHROMVARS*)malloc(sizeof(CHROMVARS)*chromosomes);
	build_intervalmap(chromvars,chromosomes,varlist,VARIANTS);

	// read reference fasta file for INDELS, currently reads entire genome in one go, need to modify to read chromosome by chromosome 
	REFLIST* reflist = (REFLIST*)malloc(sizeof(REFLIST)); 
	reflist->ns = 0; reflist->names = NULL; reflist->lengths = NULL; reflist->sequences = NULL; reflist->current = -1;
	if (strcmp(fastafile,"None") != 0)
	{
		if (read_fastaheader(fastafile,reflist) > 0) 
		{
			reflist->sequences = calloc(reflist->ns,sizeof(char*)); //(char**)malloc(sizeof(char*)*reflist->ns);
			if (FOSMIDS ==0)
			{
				for (i=0;i<reflist->ns;i++)
				{
					reflist->sequences[i] = calloc(reflist->lengths[i]+1,sizeof(char));
					if (i < 5) fprintf(stderr,"contig %s length %d\n",reflist->names[i],reflist->lengths[i]);
				}
				read_fasta(fastafile,reflist);
			}
			else // 10.27.14 new code to read one chromosome at a time 
			{
				fprintf(stderr,"opening fasta file %s \n",fastafile);
				reflist->fp = fopen(fastafile,"r");
			}
		}
	}
	//return 1;
	if (readsorted ==0 && bamfiles > 0)
	{
		for (i=0;i<bamfiles;i++) 
		{
			if (FOSMIDS ==0) parse_bamfile_sorted(bamfilelist[i],&ht,chromvars,varlist,reflist);
			//else parse_bamfile_fosmid(bamfilelist[i],&ht,chromvars,varlist,reflist,maskfile); // fosmid pool bam file 
		}
	}
	if (logfile != NULL) fclose(logfile);
	if (fragment_file != NULL && fragment_file != stdout) fclose(fragment_file);


	// need to free up all memory before we exit the program 
	/*
	int xor = pow(2,16)-1;
	for (i=0;i<variants;i++)
	{
		//if (varlist[i].type ==0) continue;
		if (varlist[i].genotype[0] == varlist[i].genotype[2]) continue;
		fprintf(stdout,"variant %d %s %d %d %s %s %d:%d %d:%d \n",i+1,varlist[i].genotype,varlist[i].position-1,varlist[i].type,varlist[i].RA,varlist[i].AA,varlist[i].A1>>16,varlist[i].A1 & xor,varlist[i].A2>>16,varlist[i].A2 & xor);
	}
	*/
	return 0;
}