void eseqclusteravg::add(const eseqdistCount& sdist){ if (sdist.count==0) return; ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size())); eseqdistCount tmpdist; tmpdist.x=scluster[sdist.x]; tmpdist.y=scluster[sdist.y]; tmpdist.dist=sdist.dist; tmpdist.count=sdist.count; ldieif(tmpdist.x<0 || tmpdist.y<0 || tmpdist.x>=scluster.size() || tmpdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(tmpdist.x)+" sdist.y: "+estr(tmpdist.y)+" scluster.size(): "+estr(scluster.size())); INDTYPE tmp; if (tmpdist.x>tmpdist.y) { tmp=tmpdist.x; tmpdist.x=tmpdist.y; tmpdist.y=tmp; } long links; long i; // estr xystr; // cout << x << " " << y << " " << sdist.dist << endl; ldieif(tmpdist.x==tmpdist.y,"should not happen: "+estr(tmpdist.x)+","+estr(tmpdist.y)+" --- "+estr(sdist.x)+","+estr(sdist.y)); // xy2estr(x,y,xystr); eseqdistavghash::iter it; it=smatrix.get(tmpdist); if (it==smatrix.end()){ if (scount[tmpdist.x]*scount[tmpdist.y]==sdist.count){ merge(tmpdist); ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n"); ofile.flush(); // cout << scluster.size()-mergecount << " " << sdist.dist << " " << sdist.x << " " << sdist.y << endl; }else{ smatrix.add(tmpdist,tmpdist); inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x); } return; } it->dist=(it->dist*it->count+tmpdist.dist*tmpdist.count)/(it->count+tmpdist.count); it->count+=tmpdist.count; // ++(*it); // complete linkage if (it->count==scount[tmpdist.x]*scount[tmpdist.y]){ merge(tmpdist); // update(ind-1,x,y); ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n"); ofile.flush(); // cout << scluster.size()-mergecount << " " << tmpdist.dist << " " << tmpdist.x << " " << tmpdist.y << endl; // sleep(1); // cout << sdist.dist << " " << x << " " << y << endl; smatrix.erase(it); } }
void eseqclustersingle::add(const eseqdist& sdist){ // if (sdist.count==0) return; ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size())); INDTYPE x=scluster[sdist.x]; INDTYPE y=scluster[sdist.y]; ldieif(x<0 || y<0 || x>=scluster.size() || y>=scluster.size(),"out of bounds: sdist.x: "+estr(x)+" sdist.y: "+estr(y)+" scluster.size(): "+estr(scluster.size())); INDTYPE tmp; if (x>y) { tmp=x; x=y; y=tmp; } merge(x,y,sdist.dist); }
/**
 * Verifies that `dists` contains no pair twice.
 *
 * Every entry is looked up in a temporary hash; an entry that is already
 * present is reported on stdout. A coarse progress indicator (0..9) is printed
 * while scanning. After the scan `smatrix` is cleared, and the function aborts
 * through ldieif if at least one duplicate was seen.
 *
 * @param dists  distance records to check.
 */
void eseqclusteravg::check(ebasicarray<eseqdistCount>& dists)
{
  bool duplicate=false;
  eseqdisthash checkmatrix;

  // Progress is printed every `step` entries. With fewer than 10 entries
  // size()/10 is 0 and `i % 0` is undefined, so clamp the step to 1.
  long step=dists.size()/10;
  if (step<=0) step=1;

  for (long i=0; i<dists.size(); ++i){
    if (i%step==0) { cout << i*10/dists.size(); flush(cout); }

    eseqdisthash::iter it=checkmatrix.get(dists[i]);
    if (it != checkmatrix.end()) {
      cout << "duplicate found: "+estr(dists[i].x)+","+dists[i].y << endl;
      duplicate=true;
    }else
      checkmatrix.add(dists[i],1);
  }

  // NOTE(review): the member matrix is reset here although only the local
  // checkmatrix was used above - kept as in the original, confirm intent.
  smatrix.clear();

  ldieif(duplicate,"duplicates found");
  cout << "# no duplicates found!" << endl;
}
void actionMakeOtusMothur() { estrarray uarr; eseqclusterData cdata; ldieif(argvc<4,"syntax: "+efile(argv[0]).basename()+" -makeotus_mothur <alignment> <mergelog> <cutoff>"); cout << "# loading seqs file: " << argv[1] << endl; load_seqs(argv[1],uarr); cdata.load(argv[2],uarr.size()); float t=estr(argv[3]).f(); earray<eintarray> otuarr; cdata.getOTU(t,otuarr,uarr.size()); cout << "label\tnumOtus"; for (long i=0; i<otuarr.size(); ++i) cout << "\tOTU" << i; cout << endl; cout << (1.0-t) << "\t" << otuarr.size(); for (long i=0; i<otuarr.size(); ++i){ // cout << ">OTU" << i << " otu_size="<< otuarr[i].size() << endl; cout << "\t" << uarr.keys(otuarr[i][0]); for (long j=1; j<otuarr[i].size(); ++j) cout << "," << uarr.keys(otuarr[i][j]); } cout << endl; exit(0); }
void actionMakePart() { ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makepart <alignment> <cutoff>"); cout << "# loading seqs file: " << argv[1] << endl; load_seqs_compressed(argv[1],arr,seqlen); t=estr(argv[2]).f(); ebasicarray<INDTYPE> uniqind; earray<ebasicarray<INDTYPE> > dupslist; finduniq(uniqind,dupslist); cout << "# unique seqs: " << uniqind.size() << endl; ebasicarray<INDTYPE> otuid; otuid.reserve(uniqind.size()); for (long i=0l; i<uniqind.size(); ++i) otuid.add(i); cout << "# computing partitions. threshold: " << t << endl; if (partsTotal>(arr.size()-1l)*arr.size()/20l) partsTotal=(arr.size()-1l)*arr.size()/20l; // make fewer tasks if to few calculations per task // partsTotal=1; for (long i=0; i<partsTotal; ++i) taskman.addTask(dfuncpart.value().calcfunc,evararray(mutex,uniqind,arr,otuid,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen)); taskman.createThread(nthreads); taskman.wait(); cout << endl; ebasicarray<INDTYPE> newotuid; earray<ebasicarray<INDTYPE> > otus; newotuid.init(otuid.size(),-1l); long otucount=0; for (long i=0; i<otuid.size(); ++i){ if (newotuid[otuid[i]]==-1l){ newotuid[otuid[i]]=otucount; otus.add(ebasicarray<INDTYPE>()); ++otucount; } otuid[i]=newotuid[otuid[i]]; otus[otuid[i]].add(i); } cout << "# partitions: " << otus.size() << endl; for (long i=0; i<otus.size(); ++i){ cout << otus[i].size() << ":"; for (long j=0; j<otus[i].size(); ++j){ // cout << " " << uniqind[otus[i][j]]; for (long k=0; k<dupslist[otus[i][j]].size(); ++k) cout << " " << dupslist[otus[i][j]][k]; } cout << endl; } exit(0); }
/**
 * Merges cluster `y` into cluster `x` at distance `dist`.
 *
 * Does nothing when x==y. Otherwise the merge is recorded in
 * clusterData.mergearr, both clusters are flagged in `smerge`, the member
 * count and member list of `y` are added to `x`, every member of `y` is
 * re-pointed to `x` in `scluster`, and one line
 * "<scluster.size()-mergecount> <dist> <x> <y>" is written to `ofile`.
 *
 * Aborts through ldieif if either cluster has a member count of 0.
 */
void eseqclustersingle::merge(INDTYPE x,INDTYPE y,float dist)
{
  if (x==y) return;

  ldieif(scount[x]==0 || scount[y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(x,y,dist));

  smerge[x]=x;
  smerge[y]=x;

  scount[x]+=scount[y];
  scount[y]=0;

  // Hand every member of y over to x.
  list<INDTYPE>::iterator member=incluster[y].begin();
  while (member!=incluster[y].end()){
    scluster[*member]=x;
    incluster[x].push_back(*member);
    ++member;
  }

  ++mergecount;
  ofile.write(estr(scluster.size()-mergecount)+" "+dist+" "+x+" "+y+"\n");
}
long eseqclusteravg::update(eblockarray<eseqdistCount>& dists,long s) { long count=0; long i; long smergepos=0; ebasicarray<long> tmpsmerge; long updcount; long updind[smerge.size()]; long updcount2; long updind2[smerge.size()]; for (i=0; i<smerge.size(); ++i) tmpsmerge.add(-1); for (i=0; i<smerge.size(); ++i) updind2[i]=-1; updcount2=0; for (i=0; i<scluster.size(); ++i){ if (updind2[scluster[i]]==-1){ updind2[scluster[i]]=updcount2; ++updcount2; } } // make sure to only update 100 entries at a time, this will force more passes but use less memory do { updcount=0; for (i=0; i<smerge.size(); ++i) updind[i]=-1; for (i=0; i<tmpsmerge.size(); ++i) tmpsmerge[i]=-1; for (; smergepos<smerge.size(); ++smergepos){ if (smerge[smergepos]>=0 && scluster[smergepos]!=smergepos){ if (updind[scluster[smergepos]]==-1) { updind[scluster[smergepos]]=updcount; ++updcount; } updind[smergepos]=updind[scluster[smergepos]]; tmpsmerge[smergepos]=scluster[smergepos]; tmpsmerge[scluster[smergepos]]=scluster[smergepos]; // ldieif(scluster[scluster[smergepos]]!=scluster[smergepos],"something wrong??"); if (updcount==100) { ++smergepos; break; } } /* if (smerge[smergepos]>=0){ if (updind[smerge[smergepos]]==-1) { updind[smerge[smergepos]]=updcount; ++updcount; } updind[smergepos]=updind[smerge[smergepos]]; tmpsmerge[smergepos]=smerge[smergepos]; // if (updcount==100) break; } */ } if (updcount==0) return(0); cerr << "# updating: " << updcount << " merges smerge.size: "<<tmpsmerge.size()<<endl; long *uarr=new long[updcount*updcount2]; ldieif (uarr==0x00,"not enough memory"); long li,lj; for (i=0; i<updcount*updcount2; ++i) uarr[i]=-1l; for (li=s; li>=0; --li){ if (dists[li].count==0) continue; if (tmpsmerge[dists[li].x]>=0){ lj=uarr[updind[tmpsmerge[dists[li].x]]*updcount2+updind2[scluster[dists[li].y]]]; if (lj>=0){ dists[li].count+=dists[lj].count; dists[lj].count=0; ++count; } uarr[updind[tmpsmerge[dists[li].x]]*updcount2+updind2[scluster[dists[li].y]]]=li; }else if 
(tmpsmerge[dists[li].y]>=0){ lj=uarr[updind[tmpsmerge[dists[li].y]]*updcount2+updind2[scluster[dists[li].x]]]; if (lj>=0){ dists[li].count+=dists[lj].count; dists[lj].count=0; ++count; } uarr[updind[tmpsmerge[dists[li].y]]*updcount2+updind2[scluster[dists[li].x]]]=li; } } delete[] uarr; }while (updcount==100); for (i=0; i<smerge.size(); ++i) smerge[i]=-1; return(count); }
void eseqclusteravg::add(const eseqdist& sdist){ ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size())); if (lastdist != sdist.dist){ if (incmaxit!=smatrix.end()) incmaxdist=((double)incmaxit->count*incmaxit->dist+(double)(scount[incmaxit->x]*scount[incmaxit->y]-incmaxit->count)*sdist.dist)/(double)(scount[incmaxit->x]*scount[incmaxit->y]); else getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit); } if (completemerges.size()>0l && completemerges.begin()->dist>=incmaxdist){ cout << "# trying merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl; cout << "# merging: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl; long tmpmc=mergecount; while (completemerges.size() && completemerges.begin()->dist>=incmaxdist){ mergeComplete(sdist.dist); } if (tmpmc!=mergecount) clearComplete(); cf=completemerges.size()/100000; cout << "# after merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl; ++cf; } lastdist=sdist.dist; eseqdistCount tmpdist; tmpdist.x=scluster[sdist.x]; tmpdist.y=scluster[sdist.y]; tmpdist.dist=sdist.dist; tmpdist.count=1; // cout << tmpdist << " scount[x]: "<<scount[tmpdist.x] << " scount[y]: " << scount[tmpdist.y] << endl; ldieif(tmpdist.x<0 || tmpdist.y<0 || tmpdist.x>=scluster.size() || tmpdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(tmpdist.x)+" sdist.y: 
"+estr(tmpdist.y)+" scluster.size(): "+estr(scluster.size())); // int tmp; // if (tmpdist.x>tmpdist.y) { tmp=tmpdist.x; tmpdist.x=tmpdist.y; tmpdist.y=tmp; } long links; long i; ldieif(tmpdist.x==tmpdist.y,"should not happen: "+estr(tmpdist.x)+","+estr(tmpdist.y)+" --- "+estr(sdist.x)+","+estr(sdist.y)); eseqdistavghash::iter it; it=smatrix.get(tmpdist); if (it==smatrix.end()){ if (scount[tmpdist.x]*scount[tmpdist.y]==tmpdist.count){ if (tmpdist.dist>=incmaxdist){ // cout << "1 " << scluster.size()-mergecount << " " << tmpdist.dist << " ("<<tmpdist.dist<<") " << tmpdist.x << " " << scount[tmpdist.x]<< " " << tmpdist.y << " " << scount[tmpdist.y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl; merge(tmpdist); ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n"); ofile.flush(); }else{ smatrix.add(tmpdist,tmpdist); inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x); completemerges.insert(tmpdist); } }else{ smatrix.add(tmpdist,tmpdist); inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x); if (incmaxit==smatrix.end()) incmaxit=smatrix.get(tmpdist); } return; } it->dist=((double)it->dist*it->count+(double)tmpdist.dist*tmpdist.count)/(double)(it->count+tmpdist.count); it->count+=tmpdist.count; // complete linkage if (it->count==scount[tmpdist.x]*scount[tmpdist.y]){ if (it->dist>=incmaxdist){ // cout << "+ " << scluster.size()-mergecount << " " << it->dist << " ("<<tmpdist.dist<<") " << it->x << " " << scount[it->x]<< " " << it->y << " " << scount[it->y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl; merge(*it); ofile.write(estr(scluster.size()-mergecount)+" "+it->dist+" "+it->x+" "+it->y+"\n"); ofile.flush(); smatrix.erase(it); // 
incmaxdist=getIncompleteMaxDist(sdist.dist); // while (completemerges.size() && completemerges.begin()->dist>=incmaxdist){ // mergeComplete(incmaxdist); // incmaxdist=getIncompleteMaxDist(sdist.dist); // } }else{ completemerges.insert(*it); // cout << "# " << scluster.size()-mergecount << " " << tmpdist.dist << " " << tmpdist.x << " " << tmpdist.y << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << endl; } if (it == incmaxit || incmaxit==smatrix.end()) getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit); } }
void eseqclusteravg::merge(const eseqdistCount& sdist) { ldieif(sdist.x==sdist.y,"should not happen!"); ldieif(scount[sdist.x]==0 || scount[sdist.y]==0,"also should not happen"); clusterData.mergearr.add(eseqdist(sdist.x,sdist.y,sdist.dist)); smerge[sdist.x]=sdist.x; smerge[sdist.y]=sdist.x; scount[sdist.x]+=scount[sdist.y]; scount[sdist.y]=0; list<INDTYPE>::iterator it; for (it=incluster[sdist.y].begin(); it!=incluster[sdist.y].end(); ++it){ scluster[*it]=sdist.x; incluster[sdist.x].push_back(*it); } eseqdistCount tmpdist,tmpdist2; // estr tmpstr,tmpstr2; tmpdist.x=sdist.x; tmpdist2.x=sdist.y; INDTYPE i,j; for (it=inter[sdist.y].begin(); it!=inter[sdist.y].end(); ++it){ j=scluster[*it]; if (sdist.x==j || sdist.y==j) continue; tmpdist.y=j; tmpdist2.y=j; // xy2estr(x,j,tmpstr); // xy2estr(y,j,tmpstr2); eseqdistavghash::iter tmpit2=smatrix.get(tmpdist2); if(tmpit2==smatrix.end()) continue; eseqdistavghash::iter tmpit=smatrix.get(tmpdist); if (tmpit!=smatrix.end()){ /* if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count+tmpit2->count){ if((tmpit->dist*tmpit->count+tmpit2->dist*tmpit2->count)/(tmpit->count+tmpit2->count)>sdist.dist){ cout << "sdist.dist: " << sdist.dist << " tmpit: " << *tmpit << " tmpit2: " << *tmpit2 << " sdist: " << sdist << " scount[x]: " << scount[sdist.x] << " scount[y]: " << scount[sdist.y] << " scount[j]: "<< scount[j] << endl;; exit(-1); } } */ tmpit->dist=((double)tmpit->dist*tmpit->count+(double)tmpit2->dist*tmpit2->count)/(double)(tmpit->count+tmpit2->count); tmpit->count+=tmpit2->count; }else{ tmpdist.dist=tmpit2->dist; tmpdist.count=tmpit2->count; smatrix.add(tmpdist,tmpdist); inter[sdist.x].push_back(j); tmpit=smatrix.get(tmpdist); } // make sure to add merged neighbors which are complete to the complete list lassert(scount[tmpit->x]==0 || scount[tmpit->y]==0); if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count){ // ldieif(tmpit->dist>sdist.dist,"sdist.dist: "+estr(sdist.dist)+" tmpit.dist: "+tmpit->dist+" tmpit.count: "+tmpit->count); 
completemerges.insert(*tmpit); if (tmpit == incmaxit) incmaxit=smatrix.end(); } if (tmpit2==incmaxit) incmaxit=smatrix.end(); smatrix.erase(tmpit2); } ++mergecount; }
int emain() { bool cl=false; bool sl=false; bool al=false; bool cdist=false; epregister(cl); epregister(sl); epregister(al); epregister(cdist); epregisterFunc(help); dfuncpart.choice=0; dfuncpart.add("gap",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_compressed2>,dist_compressed2)); dfuncpart.add("nogap",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogap_compressed2>,dist_nogap_compressed2)); dfuncpart.add("gap2",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_compressed>,dist_compressed)); dfuncpart.add("nogap2",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogap_compressed>,dist_nogap_compressed)); dfuncpart.add("nogapsingle",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogapsingle_compressed>,dist_nogapsingle_compressed)); dfuncpart.add("tamura",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_tamura_compressed>,dist_tamura_compressed)); epregister(dfuncpart); dfunc.choice=0; dfunc.add("gap",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_compressed2>,dist_compressed2)); dfunc.add("nogap",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogap_compressed2>,dist_nogap_compressed2)); dfunc.add("gap2",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_compressed>,dist_compressed)); dfunc.add("nogap2",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogap_compressed>,dist_nogap_compressed)); dfunc.add("nogapsingle",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogapsingle_compressed>,dist_nogapsingle_compressed)); dfunc.add("tamura",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_tamura_compressed>,dist_tamura_compressed)); epregisterClass(eoption<edistfunc>); epregisterClassMethod4(eoption<edistfunc>,operator=,int,(const estr& val),"="); epregister(dfunc); epregister(winlen); estr ofile; estr dfile; estr dupfile; epregister(dupfile); epregister(ignoreUnique); epregister(t); epregister(nthreads); 
epregister(ofile); epregister(dfile); epregister(ignoreMemThres); getParser()->actions.add("makereps",actionMakeReps); getParser()->actions.add("makeotus",actionMakeOtus); getParser()->actions.add("makeotus_mothur",actionMakeOtusMothur); getParser()->actions.add("makepart",actionMakePart); eparseArgs(argvc,argv); // cout << "# initializing identity lookup table" << endl; // initLookupTable(); if(argvc<2) { cout << "syntax: "+efile(argv[0]).basename()+" <-sl true|-cl true|-al true> <seqali>" << endl; cout << "\""+efile(argv[0]).basename()+ " --help\" for more help" << endl; exit(-1); } if(!cl && !sl && !al) { cout << "syntax: "+efile(argv[0]).basename()+" <-sl true|-cl true|-al true> <seqali>" << endl; cout << "please choose at least one clustering method <-sl true|-cl true|-al true>" << endl; cout << "\""+efile(argv[0]).basename()+ " --help\" for more help" << endl; exit(-1); } cout << "# " << date() << endl; cout << "# " << args2str(argvc,argv) << endl; cout << "# system RAM: " << getSystem()->getTotalRam()/1024 << "Mb" << endl; cout << "# free system RAM: " << (getSystem()->getFreeRam()+getSystem()->getBufferRam())/1024 << "Mb" << endl; cout << "# process memory limit: " << ((getSystem()->getMemLimit()&0x3fffffffffffff)==0x3fffffffffffff?estr("unlimited"):estr(getSystem()->getMemLimit()/1024/1024)+"Mb") << endl; warnMemThres=MIN(MIN(getSystem()->getTotalRam(),getSystem()->getMemLimit()/1024),getSystem()->getFreeRam()+getSystem()->getBufferRam())*0.6/1024; exitMemThres=MIN(MIN(getSystem()->getTotalRam(),getSystem()->getMemLimit()/1024),getSystem()->getFreeRam()+getSystem()->getBufferRam())*0.65/1024; cout << "# warning memory threshold: " << warnMemThres << "Mb" << endl; cout << "# exit memory threshold: " << exitMemThres << "Mb" << endl; cout << "# distance function: " << dfunc.key() << endl; if (ofile.len()==0) ofile=argv[1]; epregisterClass(eseqdist); epregisterClassSerializeMethod(eseqdist); epregisterClassProperty(eseqdist,dist); 
epregisterClassProperty(eseqdist,x); epregisterClassProperty(eseqdist,y); epregisterClass(ebasicarray<eseqdist>); epregisterClassInheritance(ebasicarray<eseqdist>,ebasearray); epregisterClassMethod(ebasicarray<eseqdist>,subset); epregisterClassSerializeMethod(ebasicarray<eseqdist>); long i,j; cout << "# loading seqs file: " << argv[1] << endl; load_seqs_compressed(argv[1],arr,seqlen); #ifndef HPC_CLUST_USE_LONGIND ldieif(arr.size() > (2l<<31),"To cluster more than 2 million sequences please recompile hpc-clust with the --enable-longind flag."); #endif ebasicarray<INDTYPE> uniqind; earray<ebasicarray<INDTYPE> > dupslist; finduniq(uniqind,dupslist); cout << "# unique seqs: " << uniqind.size() << endl; if (dupfile.len()){ efile dupf(dupfile,"w"); for (i=0; i<dupslist.size(); ++i){ dupf.write(estr(dupslist[i][0])+" "+estr(dupslist[i].size())); for (j=1; j<dupslist[i].size(); ++j) dupf.write(estr(" ")+dupslist[i][j]); dupf.write("\n"); } dupf.close(); } long maxdists=uniqind.size()*(uniqind.size()-1)/2; long maxmem=maxdists*sizeof(eseqdist)/1024/1024; cout << "# maximum number of distance pairs: " << maxdists << " (" << maxmem << "Mb)" << endl; if (maxmem > warnMemThres){ cout << "# WARNING: Number of sequences provided may require more memory than is currently available on this system." << endl; cout << "# Please monitor the memory usage of this program and check the log at the end. This program will" << endl; cout << "# automatically exit if it reaches the exitMemThres value shown above. You can force the program" << endl; cout << "# to ignore this threshold using the argument: -ignoreMemThres true" << endl; cout << "# Memory requirements can be reduced by increasing the clustering threshold, or reducing the number" << endl; cout << "# of sequences to be clustered. For more information and tips on optimizing hpc-clust memory" << endl; cout << "# usage please refer to the documentation." 
<< endl; } float dtime,stime; etimer t1; t1.reset(); efile df(dfile); cout << "# computing distances" << endl; // if ((arr.size()-1l)*arr.size()/2l/partsTotal > 10000l) partsTotal=(arr.size()-1l)*arr.size()/2l/10000l; // make more tasks if too many calculations per task if (partsTotal>(arr.size()-1l)*arr.size()/20l) partsTotal=(arr.size()-1l)*arr.size()/20l; // make fewer tasks if to few calculations per task // cout << "partsTotal: " << partsTotal << endl; cerr << endl; // needed for keeping track of the progress for (i=0; i<partsTotal; ++i) taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen)); taskman.createThread(nthreads); taskman.wait(); cerr << endl; dtime=t1.lap()*0.001; cout << "# time calculating distances: " << dtime << endl; cout << "# distances within threshold: " << dists.size() << endl; cout << "# number of tasks: " << taskman.tasks.size() << endl; fradix256sort<eblockarray<eseqdist>,radixKey>(dists); cout << "# number of tasks: " << taskman.tasks.size() << endl; stime=t1.lap()*0.001; if (dfile.len()){ cout << "# saving distances to file: "<<dfile << endl; for (i=0; i<dists.size(); ++i) df.write(estr(arr.keys(dists[i].x))+"\t"+arr.keys(dists[i].y)+"\t"+(1.0-dists[i].dist)+"\n"); /* for (i=0; i<dupslist.size(); ++i){ for (j=1; j<dupslist[i].size(); ++j) df.write(estr(dupslist[i][0])+" "+dupslist[i][j]+" 1.0\n"); } */ df.close(); } // }else{ // cout << "# loading distances from file: "<<dfile << endl; /* estr str; df.read(str); ldieif(mindists.unserial(str,0)==-1,"problem loading distance file: "+dfile); df.close(); */ // } totaldists=dists.size(); cout << "# time sorting distances: " << stime << endl; cout << "# initializing cluster"<<endl; if (cl) clcluster.init(arr.size(),ofile+".cl",argv[1],dupslist); if (sl) slcluster.init(arr.size(),ofile+".sl",argv[1],dupslist); if (al) 
alcluster.init(arr.size(),ofile+".al",argv[1],dupslist,t,dfunc.value().calcfunc_single,arr,seqlen); cout << "# starting clustering"<<endl; t1.reset(); for (i=dists.size()-1; i>=0; --i){ if (cl) clcluster.add(dists[i]); if (al) alcluster.add(dists[i]); if (sl) slcluster.add(dists[i]); } if (al) alcluster.finalize(); float clustime=t1.lap()*0.001; cout << "# time calculating distances: " << dtime << endl; cout << "# time sorting distances: " << stime << endl; cout << "# time clustering: " << clustime << endl; cout << "# total time: " << dtime+clustime+stime << endl; cout << "# distances within threshold: " << totaldists << endl; if (cdist){ efile fsl,fcl,fal; if (sl) fsl.open(ofile+".sl.dist","w"); if (cl) fcl.open(ofile+".cl.dist","w"); if (al) fal.open(ofile+".cl.dist","w"); for (i=dists.size()-1; i>=0; --i){ if (sl) fsl.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+slcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n"); if (cl) fcl.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+clcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n"); if (al) fal.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+alcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n"); } } return(0); }
void actionMakeReps() { ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makereps <alignment> <otu>"); estrhashof<INDTYPE> seqind; estrarray uarr; cout << "# loading seqs file: " << argv[1] << endl; load_seqs_compressed(argv[1],arr,seqind,seqlen); load_seqs(argv[1],uarr); earray<ebasicarray<INDTYPE> > otus; efile f; estr line; estrarray parts; f.open(argv[2],"r"); while (!f.eof()){ f.readln(line); if (line.len()==0 || line[0]=='#') continue; if (line[0]=='>'){ otus.add(ebasicarray<INDTYPE>()); continue; } ldieif(otus.size()==0,"first entry not start of OTU or missing '>'"); parts=line.explode("\t"); ldieif(parts.size()==0,"array empty: "+line); ldieif(!seqind.exists(parts[0]),"sequence not found: "+parts[0]); otus[otus.size()-1].add(seqind[parts[0]]); } cerr << endl; ebasicarray<INDTYPE> tuniqind; earray<ebasicarray<INDTYPE> > dupslist; finduniq(tuniqind,dupslist); eintarray uniqmask; uniqmask.init(arr.size(),0); for (long i=0; i<tuniqind.size(); ++i) uniqmask[tuniqind[i]]=dupslist[i].size(); // ebasicarray<INDTYPE> uniqind; taskman.createThread(nthreads); ebasicarray<INDTYPE> uniqind; const float t=0.0; efloatarray avgdist; for (long j=0; j<otus.size(); ++j){ // cout << "# computing distances for otu "<< j << " size: " << otus[j].size() << endl; if (otus[j].size()==1){ cout << ">OTU" << j << " " << arr.keys(otus[j][0]) << " avg_id=1.0 otu_size=1" << endl; cout << uarr.values(otus[j][0]) << endl; continue; } uniqind.clear(); for (long l=0; l<otus[j].size(); ++l){ if (uniqmask[otus[j][l]]!=0) uniqind.add(otus[j][l]); } // uniqind=otus[j]; ldieif(uniqind.size()==0,"empty OTU"); if (uniqind.size()==1){ cout << ">OTU" << j << " " << arr.keys(uniqind[0]) << " avg_id=1.0 otu_size=" << otus[j].size() << endl; cout << uarr.values(uniqind[0]) << endl; continue; } avgdist.clear(); avgdist.init(arr.size(),0.0); dists.clear(); partsTotal=10000; if (partsTotal>(uniqind.size()-1l)*uniqind.size()/20l) partsTotal=(uniqind.size()-1l)*uniqind.size()/20l; // make fewer tasks if 
to few calculations per task if (partsTotal<=0) partsTotal=1; taskman.clear(); for (long i=0; i<partsTotal; ++i) taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen)); taskman.wait(); for (long i=0; i<dists.size(); ++i){ eseqdist& d(dists[i]); avgdist[d.x]+=d.dist*uniqmask[d.y]; avgdist[d.y]+=d.dist*uniqmask[d.x]; // cout << "# "<< arr.keys(d.x) << " " << arr.keys(d.y) << " " << d.dist << " " << uniqmask[d.x] << " " << uniqmask[d.y] << endl; } long k=uniqind[0]; for (long i=0; i<uniqind.size(); ++i){ long ti=uniqind[i]; avgdist[ti]+=uniqmask[ti]-1; if (avgdist[k]<avgdist[ti]) { // cout << "# " << arr.keys(ti) << " " << ti << " " << uniqmask[ti] << " " << avgdist[ti] << " " << counts[ti] << endl; k=ti; } } // cout << "OTU" << j << " " << otus[j].size() << " " << arr.keys(k) << " " << avgdist[k]/(otus[j].size()-1) << " " << dists.size() << endl; cout << ">OTU" << j << " " << arr.keys(k) << " avg_id=" << avgdist[k]/(otus[j].size()-1) << " otu_size=" << otus[j].size() << endl; cout << uarr.values(k) << endl; } cerr << endl; exit(0); }