// Set up the single-linkage clusterer for `count` sequences and start the
// merge log.
//
// Opens `ofilename` for writing and emits two '#' header lines: the name of
// the sequence file and the column legend. Every index in [0,count) starts as
// its own cluster (scount=1, scluster=index, smerge=-1, incluster={index}).
//
// Each group in `dupslist` is then recorded as a series of merges at distance
// 1.0 between the group's first entry and every later entry. Those merges are
// written to the output file, appended to clusterData.mergearr and counted in
// mergecount; the per-sequence arrays are not modified for them here.
void eseqclustersingle::init(INDTYPE count,const estr& ofilename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist)
{
  ofile.open(ofilename,"w");
  ofile.write("# seqsfile: "+seqsfile+"\n");
  ofile.write("# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2\n");

  mergecount=0;

  scount.reserve(count);
  scluster.reserve(count);
  smerge.reserve(count);
  incluster.reserve(count);

  // Every sequence begins as a singleton cluster containing only itself.
  for (long s=0; s<count; ++s){
    scount.add(1);
    scluster.add(s);
    smerge.add(-1);
    incluster.add(list<INDTYPE>());
    incluster[s].push_back(s);
  }

  // Log duplicates as merges with the first member of their group.
  for (long g=0; g<dupslist.size(); ++g){
    for (long d=1; d<dupslist[g].size(); ++d){
      const INDTYPE rep=dupslist[g][0];
      const INDTYPE dup=dupslist[g][d];
      ++mergecount;
      ofile.write(estr(scluster.size()-mergecount)+" 1.0 "+rep+" "+dup+"\n");
      clusterData.mergearr.add(eseqdist(rep,dup,1.0));
    }
  }

  cout << "# initializing cluster with: "<< count<< " seqs" << endl;
}
// Load a merge log from `f` into mergearr, replacing any previous contents.
//
// `_count` is stored in the `count` member as-is. Empty lines and lines
// starting with '#' are skipped. Every remaining line is split on single
// spaces and must have one of two layouts:
//   3 fields: <distance> <id1> <id2>
//   4 fields: <leading field (ignored)> <distance> <id1> <id2>
// Any other field count aborts through ldie() with the offending line.
void eseqclusterData::load(const efile& f,int _count)
{
  count=_count;
  mergearr.clear();

  estr line;
  estrarray parts;
  while (f.readln(line)){
    if (line.len()==0 || line[0]=='#') continue;

    parts=line.explode(" ");
    if (parts.size()==3){
      mergearr.add(eseqdist(parts[1].i(),parts[2].i(),parts[0].f()));
    }else if (parts.size()==4){
      // The first column is not used; the distance and ids are shifted by one.
      mergearr.add(eseqdist(parts[2].i(),parts[3].i(),parts[1].f()));
    }else
      // Both 3- and 4-field lines are valid, so the message names both.
      ldie("fields in line different than 3 or 4: "+line);
  }
}
void eseqclusteravg::init(INDTYPE count,const estr& filename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist,float _thres,float (_fdist)(const estr&,const estr&,int),estrarray& _seqarr,int _seqlen) { thres=_thres; seqarr=&_seqarr; seqlen=_seqlen; fdist=_fdist; ofile.open(filename,"w"); ofile.write("# seqsfile: "+seqsfile+"\n"); ofile.write("# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2\n"); long i,j; incmaxdist=1.0; incmaxit=smatrix.end(); cf=0; lastdist=0.0; scount.reserve(count); scluster.reserve(count); smerge.reserve(count); inter.reserve(count); incluster.reserve(count); mergecount=0; for (i=0; i<count; ++i){ scount.add(1); scluster.add(i); smerge.add(-1); incluster.add(list<INDTYPE>()); incluster[i].push_back(i); inter.add(list<INDTYPE>()); } for (i=0; i<dupslist.size(); ++i){ for (j=1; j<dupslist[i].size(); ++j){ ++mergecount; ofile.write(estr(scluster.size()-mergecount)+" 1.0 "+dupslist[i][0]+" "+dupslist[i][j]+"\n"); clusterData.mergearr.add(eseqdist(dupslist[i][0],dupslist[i][j],1.0)); } } cout << "# initializing cluster with: "<< count<< " seqs" << endl; cout << "# initializing smatrix with: " << (long)(count)*(long)(count)/20000l/2l<< " elements" << endl; smatrix.reserve((long)(count)*(long)(count)/20000l/2l); // cout << "# smatrix._hashitems = " << smatrix._hashitems << endl; }
// Merge cluster `y` into cluster `x` at distance `dist` and log the merge.
//
// Does nothing when x==y. Aborts through ldieif() if either cluster has a
// zero member count. Otherwise the merge is appended to clusterData.mergearr,
// smerge[x] and smerge[y] are both set to x, y's member count is added to x's
// and zeroed, and every member listed in incluster[y] is re-pointed to x in
// scluster and appended to incluster[x]. incluster[y] itself is left as-is.
// The merge is then written to the output file as
// "<clusters remaining> <dist> <x> <y>".
void eseqclustersingle::merge(INDTYPE x,INDTYPE y,float dist)
{
  if (x==y) return;
  ldieif(scount[x]==0 || scount[y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(x,y,dist));

  smerge[x]=x;
  smerge[y]=x;

  scount[x]+=scount[y];
  scount[y]=0;

  // x!=y here, so the list being read and the list being grown are distinct.
  list<INDTYPE>& absorbed=incluster[y];
  list<INDTYPE>& kept=incluster[x];
  list<INDTYPE>::iterator member=absorbed.begin();
  while (member!=absorbed.end()){
    scluster[*member]=x;
    kept.push_back(*member);
    ++member;
  }

  ++mergecount;
  ofile.write(estr(scluster.size()-mergecount)+" "+dist+" "+x+" "+y+"\n");
}
void eseqclusteravg::merge(const eseqdistCount& sdist) { ldieif(sdist.x==sdist.y,"should not happen!"); ldieif(scount[sdist.x]==0 || scount[sdist.y]==0,"also should not happen"); clusterData.mergearr.add(eseqdist(sdist.x,sdist.y,sdist.dist)); smerge[sdist.x]=sdist.x; smerge[sdist.y]=sdist.x; scount[sdist.x]+=scount[sdist.y]; scount[sdist.y]=0; list<INDTYPE>::iterator it; for (it=incluster[sdist.y].begin(); it!=incluster[sdist.y].end(); ++it){ scluster[*it]=sdist.x; incluster[sdist.x].push_back(*it); } eseqdistCount tmpdist,tmpdist2; // estr tmpstr,tmpstr2; tmpdist.x=sdist.x; tmpdist2.x=sdist.y; INDTYPE i,j; for (it=inter[sdist.y].begin(); it!=inter[sdist.y].end(); ++it){ j=scluster[*it]; if (sdist.x==j || sdist.y==j) continue; tmpdist.y=j; tmpdist2.y=j; // xy2estr(x,j,tmpstr); // xy2estr(y,j,tmpstr2); eseqdistavghash::iter tmpit2=smatrix.get(tmpdist2); if(tmpit2==smatrix.end()) continue; eseqdistavghash::iter tmpit=smatrix.get(tmpdist); if (tmpit!=smatrix.end()){ /* if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count+tmpit2->count){ if((tmpit->dist*tmpit->count+tmpit2->dist*tmpit2->count)/(tmpit->count+tmpit2->count)>sdist.dist){ cout << "sdist.dist: " << sdist.dist << " tmpit: " << *tmpit << " tmpit2: " << *tmpit2 << " sdist: " << sdist << " scount[x]: " << scount[sdist.x] << " scount[y]: " << scount[sdist.y] << " scount[j]: "<< scount[j] << endl;; exit(-1); } } */ tmpit->dist=((double)tmpit->dist*tmpit->count+(double)tmpit2->dist*tmpit2->count)/(double)(tmpit->count+tmpit2->count); tmpit->count+=tmpit2->count; }else{ tmpdist.dist=tmpit2->dist; tmpdist.count=tmpit2->count; smatrix.add(tmpdist,tmpdist); inter[sdist.x].push_back(j); tmpit=smatrix.get(tmpdist); } // make sure to add merged neighbors which are complete to the complete list lassert(scount[tmpit->x]==0 || scount[tmpit->y]==0); if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count){ // ldieif(tmpit->dist>sdist.dist,"sdist.dist: "+estr(sdist.dist)+" tmpit.dist: "+tmpit->dist+" tmpit.count: "+tmpit->count); 
completemerges.insert(*tmpit); if (tmpit == incmaxit) incmaxit=smatrix.end(); } if (tmpit2==incmaxit) incmaxit=smatrix.end(); smatrix.erase(tmpit2); } ++mergecount; }