/*
 * Emit the "RESTART DATA" section: one print_fragment() record per body in
 * md->bodies.  Each record carries the body's name, six coordinates (scaled
 * position followed by the three values matrix_to_euler() derives from the
 * rotation matrix) and six rates (linear velocity followed by the angular
 * momentum multiplied component-wise by inertia_inv).
 */
static void print_restart(const struct md *md)
{
	msg(" RESTART DATA\n\n");

	size_t idx = 0;

	while (idx < md->n_bodies) {
		struct body *frag = &md->bodies[idx];
		char label[64];
		double coord[6];
		double rates[6];

		/* Look up the fragment name first; check_fail handles any error. */
		check_fail(efp_get_frag_name(md->state->efp, idx, sizeof(label), label));

		/* Position components are scaled by BOHR_RADIUS before printing. */
		coord[0] = frag->pos.x * BOHR_RADIUS;
		coord[1] = frag->pos.y * BOHR_RADIUS;
		coord[2] = frag->pos.z * BOHR_RADIUS;
		coord[3] = 0.0;
		coord[4] = 0.0;
		coord[5] = 0.0;

		/* The last three slots are filled in from the rotation matrix. */
		matrix_to_euler(&frag->rotmat, &coord[3], &coord[4], &coord[5]);

		rates[0] = frag->vel.x;
		rates[1] = frag->vel.y;
		rates[2] = frag->vel.z;
		rates[3] = frag->angmom.x * frag->inertia_inv.x;
		rates[4] = frag->angmom.y * frag->inertia_inv.y;
		rates[5] = frag->angmom.z * frag->inertia_inv.z;

		print_fragment(label, coord, rates);
		idx++;
	}

	msg("\n");
}
// sort the fragment list by 'mate-position or position of 2nd read' so that reads that are from the same DNA fragment are together // also takes care of overlapping paired-end reads to avoid duplicates in fragments void clean_fragmentlist(FRAGMENT* flist,int* fragments,VARIANT* varlist,int currchrom,int currpos,int prevchrom) { int i=0,j=0,k=0,first=0,sl=0; FRAGMENT fragment; fragment.variants =0; fragment.alist = (allele*)malloc(sizeof(allele)*16184); if (*fragments > 1) qsort(flist,*fragments,sizeof(FRAGMENT),compare_fragments); // sort such that mate pairs are together and reverse sorted by starting position of second read in a mate-piar //for (i=0;i<*fragments;i++) fprintf(stdout,"frag %s %d vars %d \n",flist[i].id,flist[i].alist[0].varid,flist[i].variants); if (currchrom== prevchrom) // need to ignore the top of the fragment list { first=0; while (flist[first].matepos >= currpos && first < *fragments) first++; } // fprintf(stdout,"cleaning the fragment list: current chrom %d %d first %d fragments %d\n",currchrom,currpos,first,*fragments); if (*fragments > 1) // bug fixed jan 13 2012, when there is only one fragment, we don't need to check if it is part of mate-pair { // serious bug fixed here: mate-pairs being examined twice April 5 2012 // check this code for corrrectness: mate-pairs will be adjacent to each other. 
i =first; while (i< (*fragments)-1) { if (strcmp(flist[i].id,flist[i+1].id) == 0) // mate pair with both ends having at least one variant { //fprintf(stdout,"mate-pair %s %s %s\n",flist[i].id); if (flist[i].alist[flist[i].variants-1].varid < flist[i+1].alist[0].varid) print_matepair(&flist[i],&flist[i+1],varlist,fragment_file); else if (flist[i+1].alist[flist[i+1].variants-1].varid < flist[i].alist[0].varid) print_matepair(&flist[i+1],&flist[i],varlist,fragment_file); else if (flist[i].variants+flist[i+1].variants > 2) { j=0;k=0; fragment.variants =0; while (j < flist[i].variants || k < flist[i+1].variants) { if (j >= flist[i].variants) { fragment.alist[fragment.variants].varid = flist[i+1].alist[k].varid; fragment.alist[fragment.variants].allele = flist[i+1].alist[k].allele; fragment.alist[fragment.variants].qv = flist[i+1].alist[k].qv; fragment.variants++; k++; continue; } if (k >= flist[i+1].variants) { fragment.alist[fragment.variants].varid = flist[i].alist[j].varid; fragment.alist[fragment.variants].allele = flist[i].alist[j].allele; fragment.alist[fragment.variants].qv = flist[i].alist[j].qv; fragment.variants++; j++; continue; } if (flist[i].alist[j].varid < flist[i+1].alist[k].varid) { fragment.alist[fragment.variants].varid = flist[i].alist[j].varid; fragment.alist[fragment.variants].allele = flist[i].alist[j].allele; fragment.alist[fragment.variants].qv = flist[i].alist[j].qv; fragment.variants++; j++; } else if (flist[i].alist[j].varid > flist[i+1].alist[k].varid) { fragment.alist[fragment.variants].varid = flist[i+1].alist[k].varid; fragment.alist[fragment.variants].allele = flist[i+1].alist[k].allele; fragment.alist[fragment.variants].qv = flist[i+1].alist[k].qv; fragment.variants++; k++; } else if (flist[i].alist[j].allele==flist[i+1].alist[k].allele) // consistent { fragment.alist[fragment.variants].varid = flist[i].alist[j].varid; fragment.alist[fragment.variants].allele = flist[i].alist[j].allele; fragment.alist[fragment.variants].qv = 
flist[i].alist[j].qv; if (flist[i+1].alist[k].qv > flist[i].alist[j].qv) fragment.alist[fragment.variants].qv = flist[i+1].alist[k].qv; fragment.variants++; j++; k++; } else { j++;k++; } } if (fragment.variants >= 2) { sl = strlen(flist[i].id); fragment.id = (char*)malloc(sl+1); for (j=0;j<sl;j++) fragment.id[j] = flist[i].id[j]; fragment.id[j] = '\0'; //for (j=0;j<flist[i].variants;j++) fprintf(stdout,"%d ",flist[i].alist[j].varid); fprintf(stdout,"| "); //for (j=0;j<flist[i+1].variants;j++) fprintf(stdout,"%d ",flist[i+1].alist[j].varid); //fprintf(stdout,"order of variants not correct %s \t",flist[i].id); print_fragment(&fragment,varlist,fragment_file); free(fragment.id); } } else if (flist[i].variants+flist[i+1].variants ==2 && SINGLEREADS ==1)print_fragment(&flist[i],varlist,fragment_file); // added 05/31/2017 for OPE //else if (flist[i].variants ==1 && flist[i+1].variants >1) print_fragment(&flist[i+1],varlist); //else if (flist[i].variants > 1 && flist[i+1].variants ==1) print_fragment(&flist[i],varlist); // april 27 2012 these PE reads were being ignored until now i +=2; // what about overlapping paired-end reads.... reads..... ???? jan 13 2012, } else if ( flist[i].variants >= 2 || SINGLEREADS ==1) { print_fragment(&flist[i],varlist,fragment_file); i++; } else i++; } // last read examined if it is not paired if (i < *fragments) { if (flist[i].variants >= 2 || SINGLEREADS ==1) print_fragment(&flist[i],varlist,fragment_file); } } else // only one fragment in fraglist single end { if (flist[first].variants >= 2 || SINGLEREADS ==1) print_fragment(&flist[first],varlist,fragment_file); } // free the fragments starting from first.... if (*fragments > 0)// check added jan 13 2012 { for (i=first;i<*fragments;i++) { free(flist[i].id); free(flist[i].alist); } } (*fragments) = first; free(fragment.alist); }
// extract haplotype informative reads from sorted bam file // // need to discard reads that are marked as duplicates using flag // int parse_bamfile_sorted(char* bamfile,HASHTABLE* ht,CHROMVARS* chromvars,VARIANT* varlist,REFLIST* reflist) { fprintf(stderr,"reading sorted bamfile %s \n",bamfile); int reads=0; struct alignedread* read = (struct alignedread*)malloc(sizeof(struct alignedread)); int i=0; int sl=0; int chrom=0; int v1,v2; int absIS; int prevchrom=-1; int prevtid = -1; FRAGMENT* flist = (FRAGMENT*)malloc(sizeof(FRAGMENT)*MAXFRAG); int fragments =0; int prevfragments =0; FRAGMENT fragment; fragment.variants =0; fragment.alist = (allele*)malloc(sizeof(allele)*4096); samfile_t *fp; if ((fp = samopen(bamfile, "rb", 0)) == 0) { fprintf(stderr, "Fail to open BAM file %s\n", bamfile); return -1; } bam1_t *b = bam_init1(); while (samread(fp, b) >= 0) { fetch_func(b, fp,read); if ((read->flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) || read->mquality < MIN_MQ) { free_readmemory(read); continue; } // find the chromosome in reflist that matches read->chrom if the previous chromosome is different from current chromosome if (read->tid != prevtid) { chrom = getindex(ht,read->chrom); // doing this for every read, can replace this by string comparison ..april 4 2012 i = read->tid; if (reflist->ns > 0) { reflist->current = i; if (i >= reflist->ns || i < 0 || strcmp(reflist->names[i],read->chrom) !=0) { reflist->current = -1; for (i=0;i<reflist->ns;i++) { if (strcmp(reflist->names[i],read->chrom) ==0) { reflist->current = i; break; } } } } } else chrom = prevchrom; if (read->tid == read->mtid) // use mateposition to calculate insert size, march 12 2013, wrong since we need to consider the readlength/cigar { //read->IS = read->mateposition - read->position; } absIS = (read->IS < 0) ? 
-1*read->IS: read->IS; // add check to see if the mate and its read are on same chromosome, bug for contigs, july 16 2012 if ((read->flag & 8) || absIS > MAX_IS || absIS < MIN_IS || read->IS ==0 || !(read->flag & 1) || read->tid != read->mtid) // single read { fragment.variants =0; v1 =0; v2=0; if (chrom >=0 && PEONLY ==0) { fragment.id = read->readid; v1 = extract_variants_read(read,ht,chromvars,varlist,0,&fragment,chrom,reflist); if (fragment.variants >= 2 || (SINGLEREADS ==1 && fragment.variants >=1)) { // instead of printing fragment, we could change this to update genotype likelihoods print_fragment(&fragment,varlist,fragment_file); } } } else // paired-end read { //fprintf(stdout,"tid %d %d \n",read->tid,read->mtid); fragment.variants =0; v1 =0; v2=0; fragment.id = read->readid; if (chrom >=0) v1 = extract_variants_read(read,ht,chromvars,varlist,1,&fragment,chrom,reflist); //fprintf(stderr,"paired read stats %s %d flag %d IS %d\n",read->chrom,read->cigs,read->flag,read->IS); if (fragment.variants > 0) { //fprintf(stderr,"variants %d read %s %s \n",fragment.variants,read->chrom,read->readid); add_fragment(flist,&fragment,read,fragments); fragments++; if (fragments >= MAXFRAG) { fprintf(stderr,"exceeded max #cached fragments: %d,increase MAXFRAGMENTS using --maxfragments option \n",MAXFRAG); return -1; } } } // BUG here when the fragment list cannot be cleaned due to long mate-pair fragments (accumulated for large IS) // fragments >= 100000 and we will clean it repeatedly... // need to fix this june 4 2012.... even for long mate-pairs this could be a problem... if ( (fragments-prevfragments >= 100000) || fragments >= MAXFRAG -10000 || (chrom != prevchrom && prevchrom != -1 && fragments > 0)) // chrom of current read is not the same as previous read's chromosome... 
{ if (PFLAG ==1) fprintf(stderr,"cleaning buffer: current chrom %s %d fragments %d\n",read->chrom,read->position,fragments); // BUG HERE when trying to clean empty fragment list (fragments ==0) if (fragments > 0) clean_fragmentlist(flist,&fragments,varlist,chrom,read->position,prevchrom); prevfragments = fragments; //fprintf(stderr,"remaining %d\n",fragments); } reads+=1; if (reads%2000000 ==0) fprintf(stderr,"processed %d reads, useful fragments %d\n",reads,fragments); prevchrom = chrom; prevtid = read->tid; free_readmemory(read); } if (fragments > 0) { fprintf(stderr,"final cleanup of fragment list: %d current chrom %s %d \n",fragments,read->chrom,read->position); clean_fragmentlist(flist,&fragments,varlist,-1,read->position,prevchrom); } bam_destroy1(b); }
// Append every variant of src (varid, allele, qv) to dst, growing dst->alist when needed.
// Returns 0 on success, -1 if the buffer could not be grown (dst is left valid and unchanged).
static int gsf_append_alleles(FRAGMENT* dst, size_t* capacity, const FRAGMENT* src)
{
    if (src->variants <= 0) return 0;

    size_t needed = (size_t) dst->variants + (size_t) src->variants;
    if (needed > *capacity) {
        size_t grown_cap = *capacity;
        while (grown_cap < needed) grown_cap *= 2;
        allele* grown = realloc(dst->alist, sizeof *grown * grown_cap);
        if (grown == NULL) return -1;
        dst->alist = grown;
        *capacity = grown_cap;
    }
    for (int j = 0; j < src->variants; j++) {
        dst->alist[dst->variants].varid = src->alist[j].varid;
        dst->alist[dst->variants].allele = src->alist[j].allele;
        dst->alist[dst->variants].qv = src->alist[j].qv;
        dst->variants++;
    }
    return 0;
}

// Pool the variants of reads readlist[s..e-1] (and of their mates) into a single fragment,
// call one consensus allele per variant and print the result to stdout.
//
// readlist      array of read pointers; reads with IS < 0 or flag bit 1024 set are skipped
// s, e          half-open range [s, e) of reads to pool
// length, read_density
//               only used to build the printed fragment id
// flist         per-read fragments, indexed by readlist[i]->findex (negative = none)
// varlist       variant table (chromosome, position, alleles) used for printing
//
// Returns 0 when nothing is printed because the pooled data has fewer than two distinct
// variants or too many conflicting calls (hets >= 2 or hets*3 >= distinct variants), or
// when memory cannot be allocated; returns 1 otherwise.
int generate_single_fragment(struct alignedread** readlist, int s, int e, int length, double read_density, FRAGMENT* flist, VARIANT* varlist) {
    int i = 0, j = 0, k = 0;
    size_t capacity = 4096; // initial size of the pooled allele buffer; grown on demand
    FRAGMENT fragment;
    fragment.variants = 0;
    fragment.alist = malloc(sizeof *fragment.alist * capacity);
    if (fragment.alist == NULL) {
        fprintf(stderr, "generate_single_fragment: out of memory\n");
        return 0;
    }

    // gather the variants of every usable read and of its mate
    for (k = s; k < e; k++) {
        int failed = 0;
        if (readlist[k]->IS < 0 || ((readlist[k]->flag & 1024) == 1024)) continue;
        if (readlist[k]->findex >= 0) {
            failed = gsf_append_alleles(&fragment, &capacity, &flist[readlist[k]->findex]);
        }
        i = readlist[k]->mateindex;
        if (!failed && i >= 0 && readlist[i]->findex >= 0) {
            failed = gsf_append_alleles(&fragment, &capacity, &flist[readlist[i]->findex]);
        }
        if (failed) {
            fprintf(stderr, "generate_single_fragment: out of memory\n");
            free(fragment.alist);
            return 0;
        }
    }

    int unique_variants = 1;
    int hets = 0;
    int counts[4];
    int qv = 0;

    // after sorting, calls for the same variant are adjacent
    qsort(fragment.alist, fragment.variants, sizeof (allele), compare_alleles);
    for (i = 1; i < fragment.variants; i++) {
        if (fragment.alist[i].varid != fragment.alist[i - 1].varid) unique_variants++;
        else if (fragment.alist[i].allele != fragment.alist[i - 1].allele) hets++;
    }
    // fragment only has a single variant (or none), or has too many conflicting calls
    if (hets >= 2 || hets * 3 >= unique_variants || unique_variants < 2) {
        free(fragment.alist);
        return 0;
    }

    FRAGMENT consensus;
    consensus.variants = 0;
    consensus.id = NULL;
    consensus.alist = malloc(sizeof *consensus.alist * unique_variants);
    if (consensus.alist == NULL) {
        fprintf(stderr, "generate_single_fragment: out of memory\n");
        free(fragment.alist);
        return 0;
    }

    // counts[0], counts[1]: number of calls per allele; counts[2], counts[3]: summed (qv - QVoffset) per allele.
    // NOTE(review): indexing by (allele - 48) assumes allele is '0' or '1' - confirm against the producer.
    counts[0] = counts[1] = counts[2] = counts[3] = 0;
    counts[(int) fragment.alist[0].allele - 48]++;
    counts[(int) fragment.alist[0].allele - 48 + 2] += (int) fragment.alist[0].qv - QVoffset;
    j = 0;
    for (i = 1; i <= fragment.variants; i++) {
        if (i == fragment.variants || fragment.alist[i].varid != fragment.alist[i - 1].varid) {
            // end of a run: call a consensus allele when the other allele was seen at most once
            char call = 0;
            int margin = 0;
            if (counts[0] > counts[1] && counts[1] <= 1) {
                call = '0';
                margin = counts[2] - counts[3];
            } else if (counts[1] > counts[0] && counts[0] <= 1) {
                call = '1';
                margin = counts[3] - counts[2];
            }
            if (call != 0) {
                consensus.alist[j].varid = fragment.alist[i - 1].varid;
                consensus.alist[j].allele = call;
                qv = QVoffset + margin;
                if (margin >= 60) qv = 60 + QVoffset; // quality margin is capped at 60
                consensus.alist[j].qv = (char) (qv);
                if (qv - QVoffset >= MINQ) j++; // slot is reused unless the call is confident enough
            }
            counts[0] = counts[1] = counts[2] = counts[3] = 0;
        }
        if (i < fragment.variants) {
            counts[(int) fragment.alist[i].allele - 48]++;
            counts[(int) fragment.alist[i].allele - 48 + 2] += (int) fragment.alist[i].qv - QVoffset;
        }
    }

    fprintf(stdout, "fragment %d %d \n", unique_variants, j);
    consensus.variants = j;
    if (j >= 2) {
        // The id is built only here: with j >= 2 the first consensus entry is known to be
        // initialised, and snprintf keeps a long chromosome name from overrunning the buffer.
        char idbuf[1024];
        snprintf(idbuf, sizeof idbuf, "%s:%d_%d_%d_%0.1f", varlist[consensus.alist[0].varid].chrom, readlist[s]->position, readlist[e - 1]->position, length, read_density);
        consensus.id = idbuf;
        fprintf(stdout, "FRAGMENT ");
        print_fragment(&consensus, varlist, stdout);
        consensus.id = NULL;
    }
    free(consensus.alist);

    // dump every pooled call, grouped by variant, flagging calls that disagree with the previous one
    for (i = 0; i < fragment.variants; i++) {
        j = fragment.alist[i].varid;
        if (i == 0 || j != fragment.alist[i - 1].varid) fprintf(stdout, "\n %d:%d %s/%s %c:%d | ", j, varlist[j].position, varlist[j].allele1, varlist[j].allele2, fragment.alist[i].allele, fragment.alist[i].qv - 33);
        else if (fragment.alist[i].allele != fragment.alist[i - 1].allele) fprintf(stdout, "%c:%d:HET | ", fragment.alist[i].allele, fragment.alist[i].qv - 33);
        else fprintf(stdout, "%c:%d | ", fragment.alist[i].allele, fragment.alist[i].qv - 33);
    }
    fprintf(stdout, "\n");
    free(fragment.alist);
    return 1;
}