int main(int argc, char* argv[]) { //fprintf(stderr,"size of indel element %d \n",sizeof(struct VCF_ALLELE)); //fprintf(stdout,"%d %d %d\n",sizeof(uint8_t),BAM_PAIRED_READ1,BAM_PAIRED_READ2); return 1; int vflag=0; time_t now; time(&now); unsigned int iseed = (unsigned int)time(NULL); srand48(iseed); struct OPTIONS* options = (struct OPTIONS*)malloc(sizeof(struct OPTIONS)); options->POOLSIZE = 2; options->targettid =-1; options->targetstart=0; options->targetend=0; int flag = optparser(argc,argv,options); if (flag ==0) return 1; REFLIST reflist; if (read_fastaheader(options->fastafile,&reflist) == -1) return -1; strcpy(reflist.fastafile,options->fastafile); FILE* fp = fopen(options->fastafile,"r"); if (read_bedfile(options->bedfile,&reflist) != -1) targeted = 1; else targeted = 0; if (strcmp(options->vcffile,"None") ==0) options->vfile = stdout; else options->vfile = fopen(options->vcffile,"w"); // open indel file with candidate indels if (strcmp(options->indelfile,"None") !=0) options->fp_indelfile = fopen(options->indelfile,"w"); else options->fp_indelfile = NULL; // print VCF to stdout as well if no VCFfile is specified if (options->vfile != NULL && strcmp(options->vcffile,"None") !=0) { vflag =1; print_crispheader(options); } if (options->bamfiles >=2) fprintf(stderr,"processing %d bamfiles: %s ..... %s \n\n",options->bamfiles,options->bamfilelist[0],options->bamfilelist[options->bamfiles-1]); multisampleVC(options,&reflist,fp); if (vflag ==1) fclose(options->vfile); fclose(fp); // close pointer to fasta reference file return 1; }
/*
 * Allocates a new REFLIST and fills it from the header information of
 * `fastafile` via read_fastaheader().
 *
 * Parameters:
 *   fastafile  path passed to read_fastaheader()
 *   reflist    ignored on entry: the pointer is overwritten with a freshly
 *              allocated object, so the caller must use the return value
 *
 * Returns the new REFLIST, whose `sequences` array has `ns` entries that are
 * all NULL (no sequence memory is allocated here). Returns NULL when memory
 * cannot be allocated or when read_fastaheader() does not return a positive
 * value; in that case the REFLIST allocated here is released first.
 */
REFLIST* init_reflist(char* fastafile, REFLIST* reflist)
{
    int i = 0;

    reflist = malloc(sizeof (REFLIST));
    if (reflist == NULL) return NULL;

    reflist->ns = 0;
    reflist->names = NULL;
    reflist->lengths = NULL;
    reflist->sequences = NULL;
    reflist->current = -1;

    if (read_fastaheader(fastafile, reflist) <= 0) {
        free(reflist); /* previously leaked on this path */
        return NULL;
    }

    reflist->sequences = calloc(reflist->ns, sizeof (char*));
    if (reflist->sequences == NULL && reflist->ns > 0) {
        /* NOTE(review): anything read_fastaheader() allocated (names, lengths) is
           not released here because its ownership rules are not visible */
        free(reflist);
        return NULL;
    }

    for (i = 0; i < reflist->ns; i++) {
        reflist->sequences[i] = NULL; /* sequence memory is deliberately not allocated here */
        /* report only the first four entries ... */
        if (i < 4) fprintf(stderr, "chrom %s length %d \n", reflist->names[i], reflist->lengths[i]);
    }
    /* ... and the last one when there are more than four */
    if (reflist->ns > 4)
        fprintf(stderr, ".....\n.....\nchrom %s length %d\n\n", reflist->names[reflist->ns - 1], reflist->lengths[reflist->ns - 1]);

    return reflist;
}
int main (int argc, char** argv) { char samfile[1024]; char bamfile[1024]; char variantfile[1024]; char fastafile[1024]; char maskfile[1024]; strcpy(samfile,"None"); strcpy(bamfile,"None"); strcpy(variantfile,"None"); strcpy(fastafile,"None"); strcpy(maskfile,"None"); GROUPNAME = NULL; int readsorted = 0; char* sampleid = (char*)malloc(1024); sampleid[0] = '-'; sampleid[1] = '\0'; int samplecol=10; // default if there is a single sample in the VCF file int i=0,variants=0,hetvariants=0; char** bamfilelist = NULL; int bamfiles =0; logfile = NULL; fragment_file = stdout; // write fragments to this file if it is present for (i=1;i<argc;i+=2) { if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0) bamfiles++; else if (strcmp(argv[i],"--variants") ==0) strcpy(variantfile,argv[i+1]); else if (strcmp(argv[i],"--reffile") ==0 || strcmp(argv[i],"--ref") ==0) strcpy(fastafile,argv[i+1]); else if (strcmp(argv[i],"--mask") ==0 || strcmp(argv[i],"--mappability") ==0) strcpy(maskfile,argv[i+1]); else if (strcmp(argv[i],"--VCF") ==0 || strcmp(argv[i],"--vcf") ==0) { strcpy(variantfile,argv[i+1]); VCFformat =1; } else if (strcmp(argv[i],"--sorted") ==0) readsorted = atoi(argv[i+1]); else if (strcmp(argv[i],"--mbq") ==0) MINQ = atoi(argv[i+1]); else if (strcmp(argv[i],"--mmq") ==0) MIN_MQ = atoi(argv[i+1]); else if (strcmp(argv[i],"--maxIS") ==0) MAX_IS = atoi(argv[i+1]); else if (strcmp(argv[i],"--minIS") ==0) MIN_IS = atoi(argv[i+1]); else if (strcmp(argv[i],"--PEonly") ==0) PEONLY = 1; // discard single end mapped reads else if (strcmp(argv[i],"--indels") ==0) PARSEINDELS = atoi(argv[i+1]); // allow indels in hairs else if (strcmp(argv[i],"--pflag") ==0) IFLAG = atoi(argv[i+1]); // allow indels in hairs else if (strcmp(argv[i],"--qvoffset") ==0) QVoffset = atoi(argv[i+1]); else if (strcmp(argv[i],"--out") == 0 || strcmp(argv[i],"-o") ==0) fragment_file = fopen(argv[i+1],"w"); else if (strcmp(argv[i],"--logfile")==0 || strcmp(argv[i],"--log") ==0) logfile = 
fopen(argv[i+1],"w"); else if (strcmp(argv[i],"--singlereads")==0) SINGLEREADS = atoi(argv[i+1]); else if (strcmp(argv[i],"--maxfragments")==0) MAXFRAG = atoi(argv[i+1]); else if (strcmp(argv[i],"--noquality")==0) MISSING_QV = atoi(argv[i+1]); else if (strcmp(argv[i],"--triallelic")==0) TRI_ALLELIC = atoi(argv[i+1]); //else if (strcmp(argv[i],"--fosmids") == 0 || strcmp(argv[i],"--fosmid") ==0) FOSMIDS = 1; //else if (strcmp(argv[i],"--prior") == 0) PRIOR = atoi(argv[i+1]); //else if (strcmp(argv[i],"--comparephase") == 0 || strcmp(argv[i],"--compare") ==0) COMPARE_PHASE = atoi(argv[i+1]); else if (strcmp(argv[i],"--groupname") == 0) { GROUPNAME = (char*)malloc(1024); strcpy(GROUPNAME,argv[i+1]); } } if (bamfiles > 0 && strcmp(variantfile,"None") !=0) { bamfilelist = (char**)malloc(sizeof(char*)*bamfiles); for (i=0;i<bamfiles;i++) bamfilelist[i] = (char*)malloc(1024); bamfiles=0; for (i=1;i<argc;i+=2) { if (strcmp(argv[i],"--bam") ==0 || strcmp(argv[i],"--bamfile") ==0) strcpy(bamfilelist[bamfiles++],argv[i+1]); } fprintf(stderr,"\n extracting haplotype informative reads from bamfiles %s minQV %d minMQ %d maxIS %d \n\n",bamfilelist[0],MINQ,MIN_MQ,MAX_IS); } else { print_options(); return -1; } HASHTABLE ht; ht.htsize = 7919; init_hashtable(&ht); VARIANT* varlist; int chromosomes=0; if (VCFformat ==1) { variants = count_variants(variantfile,sampleid,&samplecol); if (variants < 0) return -1; varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants); chromosomes = read_variantfile(variantfile,varlist,&ht,&hetvariants,samplecol); } else { variants = count_variants_oldformat(variantfile); if (variants < 0) return -1; varlist = (VARIANT*)malloc(sizeof(VARIANT)*variants); chromosomes = read_variantfile_oldformat(variantfile,varlist,&ht,variants); } // variants is set to hetvariants only, but this is not correct since VARIANTS = variants; // there are two options, we include all variants in the chromvars datastructure but only use heterozygous variants for outputting HAIRS // 
variant-id should correspond to line-number in VCF file since that will be used for printing out variants in Hapcut // fprintf(stderr,"read %d variants from file %s chromosomes %d\n",snps,argv[1],chromosomes); CHROMVARS* chromvars = (CHROMVARS*)malloc(sizeof(CHROMVARS)*chromosomes); build_intervalmap(chromvars,chromosomes,varlist,VARIANTS); // read reference fasta file for INDELS, currently reads entire genome in one go, need to modify to read chromosome by chromosome REFLIST* reflist = (REFLIST*)malloc(sizeof(REFLIST)); reflist->ns = 0; reflist->names = NULL; reflist->lengths = NULL; reflist->sequences = NULL; reflist->current = -1; if (strcmp(fastafile,"None") != 0) { if (read_fastaheader(fastafile,reflist) > 0) { reflist->sequences = calloc(reflist->ns,sizeof(char*)); //(char**)malloc(sizeof(char*)*reflist->ns); if (FOSMIDS ==0) { for (i=0;i<reflist->ns;i++) { reflist->sequences[i] = calloc(reflist->lengths[i]+1,sizeof(char)); if (i < 5) fprintf(stderr,"contig %s length %d\n",reflist->names[i],reflist->lengths[i]); } read_fasta(fastafile,reflist); } else // 10.27.14 new code to read one chromosome at a time { fprintf(stderr,"opening fasta file %s \n",fastafile); reflist->fp = fopen(fastafile,"r"); } } } //return 1; if (readsorted ==0 && bamfiles > 0) { for (i=0;i<bamfiles;i++) { if (FOSMIDS ==0) parse_bamfile_sorted(bamfilelist[i],&ht,chromvars,varlist,reflist); //else parse_bamfile_fosmid(bamfilelist[i],&ht,chromvars,varlist,reflist,maskfile); // fosmid pool bam file } } if (logfile != NULL) fclose(logfile); if (fragment_file != NULL && fragment_file != stdout) fclose(fragment_file); // need to free up all memory before we exit the program /* int xor = pow(2,16)-1; for (i=0;i<variants;i++) { //if (varlist[i].type ==0) continue; if (varlist[i].genotype[0] == varlist[i].genotype[2]) continue; fprintf(stdout,"variant %d %s %d %d %s %s %d:%d %d:%d 
\n",i+1,varlist[i].genotype,varlist[i].position-1,varlist[i].type,varlist[i].RA,varlist[i].AA,varlist[i].A1>>16,varlist[i].A1 & xor,varlist[i].A2>>16,varlist[i].A2 & xor); } */ return 0; }