Example #1
0
void eseqclusteravg::add(const eseqdistCount& sdist){
  if (sdist.count==0) return;
  ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size()));

  eseqdistCount tmpdist;
  tmpdist.x=scluster[sdist.x];
  tmpdist.y=scluster[sdist.y];
  tmpdist.dist=sdist.dist;
  tmpdist.count=sdist.count;

  ldieif(tmpdist.x<0 || tmpdist.y<0 || tmpdist.x>=scluster.size() || tmpdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(tmpdist.x)+" sdist.y: "+estr(tmpdist.y)+" scluster.size(): "+estr(scluster.size()));
  INDTYPE tmp;
  if (tmpdist.x>tmpdist.y) { tmp=tmpdist.x; tmpdist.x=tmpdist.y; tmpdist.y=tmp; }

  long links;
  long i;
//  estr xystr;

//  cout << x << " " << y << " " << sdist.dist << endl;
  ldieif(tmpdist.x==tmpdist.y,"should not happen: "+estr(tmpdist.x)+","+estr(tmpdist.y)+" --- "+estr(sdist.x)+","+estr(sdist.y));

//  xy2estr(x,y,xystr);

  eseqdistavghash::iter it;

  it=smatrix.get(tmpdist);
  if (it==smatrix.end()){
    if (scount[tmpdist.x]*scount[tmpdist.y]==sdist.count){
      merge(tmpdist);
      ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
      ofile.flush();
//      cout << scluster.size()-mergecount << " " << sdist.dist << " " << sdist.x << " " << sdist.y << endl;
    }else{
      smatrix.add(tmpdist,tmpdist);
      inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x);
    }
    return;
  }

  it->dist=(it->dist*it->count+tmpdist.dist*tmpdist.count)/(it->count+tmpdist.count);
  it->count+=tmpdist.count;
//  ++(*it);

  // complete linkage
  if (it->count==scount[tmpdist.x]*scount[tmpdist.y]){
    merge(tmpdist);
//    update(ind-1,x,y);
    ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
    ofile.flush();
//    cout << scluster.size()-mergecount << " " << tmpdist.dist << " " << tmpdist.x << " " << tmpdist.y << endl;
//    sleep(1);
//    cout << sdist.dist << " " << x << " " << y << endl;
    smatrix.erase(it);
  }
}
Example #2
0
// Feeds one pairwise distance into the single-linkage clustering.
//
// Both sequence indices are mapped to the clusters that currently hold them,
// and merge() is called with the smaller cluster index as its first argument.
// Aborts through ldieif when a sequence or cluster index is out of range.
void eseqclustersingle::add(const eseqdist& sdist){
  ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size()));

  INDTYPE x=scluster[sdist.x];
  INDTYPE y=scluster[sdist.y];

  ldieif(x<0 || y<0 || x>=scluster.size() || y>=scluster.size(),"out of bounds: sdist.x: "+estr(x)+" sdist.y: "+estr(y)+" scluster.size(): "+estr(scluster.size()));

  // Hand the pair over in ascending order of cluster index.
  if (x>y)
    merge(y,x,sdist.dist);
  else
    merge(x,y,sdist.dist);
}
Example #3
0
// Verifies that `dists` contains no duplicated entry.
//
// Every entry is looked up in a temporary hash; entries already present are
// reported on stdout. A coarse progress counter (0..9) is printed while
// scanning. smatrix is cleared afterwards, and the function aborts through
// ldieif if at least one duplicate was found.
void eseqclusteravg::check(ebasicarray<eseqdistCount>& dists)
{
  bool duplicate=false;
  eseqdisthash checkmatrix;

  // Progress is reported every tenth of the input. With fewer than 10 entries
  // size()/10 is 0, which would make the modulo below divide by zero, so the
  // step is clamped to 1.
  long step=dists.size()/10;
  if (step<1) step=1;

  for (long i=0; i<dists.size(); ++i){
    if (i%step==0) { cout << i*10/dists.size(); flush(cout); }

    eseqdisthash::iter it=checkmatrix.get(dists[i]);
    if (it != checkmatrix.end()) {
      cout << "duplicate found: "+estr(dists[i].x)+","+dists[i].y << endl;
      duplicate=true;
    }else
      checkmatrix.add(dists[i],1);
  }
  smatrix.clear();

  ldieif(duplicate,"duplicates found");
  cout << "# no duplicates found!" << endl;
}
Example #4
0
void actionMakeOtusMothur()
{
  estrarray uarr;
  eseqclusterData cdata;
  ldieif(argvc<4,"syntax: "+efile(argv[0]).basename()+" -makeotus_mothur <alignment> <mergelog> <cutoff>");

  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs(argv[1],uarr);
  cdata.load(argv[2],uarr.size());

  float t=estr(argv[3]).f();
  earray<eintarray> otuarr;
  cdata.getOTU(t,otuarr,uarr.size());

  cout << "label\tnumOtus";
  for (long i=0; i<otuarr.size(); ++i)
    cout << "\tOTU" << i;
  cout << endl;

  cout << (1.0-t) << "\t" << otuarr.size();
  for (long i=0; i<otuarr.size(); ++i){
//    cout << ">OTU" << i << " otu_size="<< otuarr[i].size() << endl;
    cout << "\t" << uarr.keys(otuarr[i][0]);
    for (long j=1; j<otuarr[i].size(); ++j)
      cout << "," << uarr.keys(otuarr[i][j]);
  }
  cout << endl;

  exit(0);
}
Example #5
0
// Command line action "-makepart <alignment> <cutoff>".
//
// Loads the compressed alignment, collapses identical sequences, lets the
// selected partition function (dfuncpart) assign an id to every unique
// sequence using nthreads worker threads, renumbers the ids densely in order
// of first appearance and prints one line per partition:
//   "<number of unique seqs>: <seq index> <seq index> ..."
// where the indices come from dupslist. Terminates the process with exit(0).
void actionMakePart()
{
  ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makepart <alignment> <cutoff>");

  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs_compressed(argv[1],arr,seqlen);

  t=estr(argv[2]).f();

  ebasicarray<INDTYPE> uniqind;
  earray<ebasicarray<INDTYPE> > dupslist;
  finduniq(uniqind,dupslist);
  cout << "# unique seqs: " << uniqind.size() << endl;

  // Every unique sequence starts out in a partition of its own.
  ebasicarray<INDTYPE> otuid;
  otuid.reserve(uniqind.size());
  for (long i=0l; i<uniqind.size(); ++i)
    otuid.add(i);

  cout << "# computing partitions. threshold: " << t << endl;
  if (partsTotal>(arr.size()-1l)*arr.size()/20l) partsTotal=(arr.size()-1l)*arr.size()/20l; // make fewer tasks if to few calculations per task
  // For very small inputs the clamp above yields 0, which would schedule no
  // task at all; always run at least one (same guard as in actionMakeReps).
  if (partsTotal<=0) partsTotal=1;

  for (long i=0; i<partsTotal; ++i)
    taskman.addTask(dfuncpart.value().calcfunc,evararray(mutex,uniqind,arr,otuid,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen));

  taskman.createThread(nthreads);
  taskman.wait();

  cout << endl;

  // Renumber the partition ids densely, in order of first appearance, and
  // collect the members of every partition.
  ebasicarray<INDTYPE> newotuid;
  earray<ebasicarray<INDTYPE> > otus;
  newotuid.init(otuid.size(),-1l);
  long otucount=0;
  for (long i=0; i<otuid.size(); ++i){
    if (newotuid[otuid[i]]==-1l){
      newotuid[otuid[i]]=otucount;
      otus.add(ebasicarray<INDTYPE>());
      ++otucount;
    }
    otuid[i]=newotuid[otuid[i]];
    otus[otuid[i]].add(i);
  }
  cout << "# partitions: " << otus.size() << endl;

  // Print every partition, expanding each unique sequence into the entries
  // stored for it in dupslist.
  for (long i=0; i<otus.size(); ++i){
    cout << otus[i].size() << ":";
    for (long j=0; j<otus[i].size(); ++j){
      for (long k=0; k<dupslist[otus[i][j]].size(); ++k)
        cout << " " << dupslist[otus[i][j]][k];
    }
    cout << endl;
  }

  exit(0);
}
Example #6
0
// Merges cluster y into cluster x at distance `dist`.
//
// Does nothing when x==y and aborts through ldieif when either cluster has a
// zero member count. Otherwise the merge is appended to clusterData.mergearr,
// both clusters are flagged in smerge, the member count and member list of y
// are transferred to x, scluster is redirected for every moved member, and
// the line "<remaining clusters> <dist> <x> <y>" is written to ofile.
void eseqclustersingle::merge(INDTYPE x,INDTYPE y,float dist)
{
  if (x==y) return;
  ldieif(scount[x]==0 || scount[y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(x,y,dist));

  // Move the member count of y over to x.
  scount[x]+=scount[y];
  scount[y]=0;

  // Flag both clusters as touched by a merge, pointing at the survivor.
  smerge[y]=x;
  smerge[x]=x;

  // Re-home every member of y into x.
  list<INDTYPE>::iterator member=incluster[y].begin();
  while (member!=incluster[y].end()){
    scluster[*member]=x;
    incluster[x].push_back(*member);
    ++member;
  }

  ++mergecount;
  ofile.write(estr(scluster.size()-mergecount)+" "+dist+" "+x+" "+y+"\n");
}
Example #7
0
// Compacts dists[0..s] after clusters have been merged: entries that now
// connect the same pair of clusters are collapsed into one entry by adding
// their counts; the absorbed entries get count==0.
//
// Merged clusters are processed in batches of at most 100 so that the lookup
// table (batch size x number of distinct clusters) stays small; every batch
// costs one pass over dists. When all batches are done smerge is reset to -1.
//
// Returns the number of entries that were absorbed into another entry.
long eseqclusteravg::update(eblockarray<eseqdistCount>& dists,long s)
{
  long count=0;
  long i;
  long smergepos=0;
  long updcount;
  long updcount2=0;

  // Heap-backed index tables. Variable-length stack arrays are not standard
  // C++ and can exhaust the stack when there are many sequences.
  ebasicarray<long> tmpsmerge;  // cluster a merged index now belongs to (this batch), else -1
  ebasicarray<long> updind;     // row in uarr for indices of this batch, else -1
  ebasicarray<long> updind2;    // column in uarr for every cluster currently in use, else -1

  for (i=0; i<smerge.size(); ++i){
    tmpsmerge.add(-1);
    updind.add(-1);
    updind2.add(-1);
  }

  // Number the clusters currently referenced by scluster densely.
  for (i=0; i<scluster.size(); ++i){
    if (updind2[scluster[i]]==-1){
      updind2[scluster[i]]=updcount2;
      ++updcount2;
    }
  }

  // make sure to only update 100 entries at a time, this will force more passes but use less memory
  do {
    updcount=0;

    for (i=0; i<smerge.size(); ++i)
      updind[i]=-1;

    for (i=0; i<tmpsmerge.size(); ++i)
      tmpsmerge[i]=-1;

    // Collect the next batch of indices that were merged into another cluster.
    for (; smergepos<smerge.size(); ++smergepos){
      if (smerge[smergepos]>=0 && scluster[smergepos]!=smergepos){
        if (updind[scluster[smergepos]]==-1) {
          updind[scluster[smergepos]]=updcount;
          ++updcount;
        }
        updind[smergepos]=updind[scluster[smergepos]];
        tmpsmerge[smergepos]=scluster[smergepos];
        tmpsmerge[scluster[smergepos]]=scluster[smergepos];
        if (updcount==100) { ++smergepos; break; }
      }
    }

    // Nothing left to process. Leave the loop instead of returning here, so
    // that the count accumulated by earlier batches is still returned and
    // smerge is still reset below.
    if (updcount==0) break;

    cerr << "# updating: " << updcount << " merges smerge.size: "<<tmpsmerge.size()<<endl;

    // uarr[row*updcount2+col] holds the index of the entry of dists that
    // currently represents the cluster pair (row,col), or -1.
    long *uarr=new long[updcount*updcount2];
    ldieif (uarr==0x00,"not enough memory");
    long li,lj;
    for (i=0; i<updcount*updcount2; ++i)
      uarr[i]=-1l;

    for (li=s; li>=0; --li){
      if (dists[li].count==0) continue;

      if (tmpsmerge[dists[li].x]>=0){
        lj=uarr[updind[tmpsmerge[dists[li].x]]*updcount2+updind2[scluster[dists[li].y]]];
        if (lj>=0){
          // Same cluster pair seen before: absorb the earlier entry.
          dists[li].count+=dists[lj].count;
          dists[lj].count=0;
          ++count;
        }
        uarr[updind[tmpsmerge[dists[li].x]]*updcount2+updind2[scluster[dists[li].y]]]=li;
      }else if (tmpsmerge[dists[li].y]>=0){
        lj=uarr[updind[tmpsmerge[dists[li].y]]*updcount2+updind2[scluster[dists[li].x]]];
        if (lj>=0){
          dists[li].count+=dists[lj].count;
          dists[lj].count=0;
          ++count;
        }
        uarr[updind[tmpsmerge[dists[li].y]]*updcount2+updind2[scluster[dists[li].x]]]=li;
      }
    }

    delete[] uarr;
  }while (updcount==100);

  for (i=0; i<smerge.size(); ++i)
    smerge[i]=-1;

  return(count);
}
Example #8
0
// Adds a single pairwise distance to the average-linkage clustering.
//
// Steps, in order:
//  1. When the incoming distance differs from the previous one, incmaxdist is
//     refreshed, either from the entry incmaxit refers to or through
//     getIncompleteMaxDist().
//  2. Completed cluster pairs queued in completemerges are merged for as long
//     as the first queued distance is >= incmaxdist.
//  3. The link is folded into the smatrix entry of the two clusters that
//     currently contain sdist.x and sdist.y (count-weighted mean). Once the
//     entry holds scount[x]*scount[y] links it is merged immediately when its
//     distance is >= incmaxdist, and queued in completemerges otherwise.
//
// Aborts through ldieif on out-of-range indices or when both sequences already
// belong to the same cluster.
void eseqclusteravg::add(const eseqdist& sdist){
  ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size()));

  if (lastdist != sdist.dist){
    if (incmaxit!=smatrix.end())
      incmaxdist=((double)incmaxit->count*incmaxit->dist+(double)(scount[incmaxit->x]*scount[incmaxit->y]-incmaxit->count)*sdist.dist)/(double)(scount[incmaxit->x]*scount[incmaxit->y]);
    else
      getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit);
  }

  if (completemerges.size()>0l && completemerges.begin()->dist>=incmaxdist){
    cout << "# trying merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl;
    cout << "# merging: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl;
    long tmpmc=mergecount;
    while (completemerges.size() && completemerges.begin()->dist>=incmaxdist){
      mergeComplete(sdist.dist);
    }
    if (tmpmc!=mergecount)
      clearComplete();
    cf=completemerges.size()/100000;
    cout << "# after merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: ";
    // The loop above may have emptied completemerges; begin() must not be
    // dereferenced in that case.
    if (completemerges.size())
      cout << completemerges.begin()->dist;
    else
      cout << "n/a";
    cout << " " << mergecount << endl;
    ++cf;
  }
  lastdist=sdist.dist;

  // Translate the sequence indices into their current cluster indices.
  eseqdistCount tmpdist;
  tmpdist.x=scluster[sdist.x];
  tmpdist.y=scluster[sdist.y];
  tmpdist.dist=sdist.dist;
  tmpdist.count=1;

  ldieif(tmpdist.x<0 || tmpdist.y<0 || tmpdist.x>=scluster.size() || tmpdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(tmpdist.x)+" sdist.y: "+estr(tmpdist.y)+" scluster.size(): "+estr(scluster.size()));

  ldieif(tmpdist.x==tmpdist.y,"should not happen: "+estr(tmpdist.x)+","+estr(tmpdist.y)+" --- "+estr(sdist.x)+","+estr(sdist.y));

  eseqdistavghash::iter it=smatrix.get(tmpdist);
  if (it==smatrix.end()){
    // First link seen between these two clusters.
    if (scount[tmpdist.x]*scount[tmpdist.y]==tmpdist.count){
      if (tmpdist.dist>=incmaxdist){
        merge(tmpdist);
        ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
        ofile.flush();
      }else{
        smatrix.add(tmpdist,tmpdist);
        inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x);
        completemerges.insert(tmpdist);
      }
    }else{
      smatrix.add(tmpdist,tmpdist);
      inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x);
      if (incmaxit==smatrix.end())
        incmaxit=smatrix.get(tmpdist);
    }
    return;
  }

  // Fold the new link into the running count-weighted mean of the pair.
  it->dist=((double)it->dist*it->count+(double)tmpdist.dist*tmpdist.count)/(double)(it->count+tmpdist.count);
  it->count+=tmpdist.count;

  // All links between the two clusters have been seen.
  if (it->count==scount[tmpdist.x]*scount[tmpdist.y]){
    bool wasincmax;
    if (it->dist>=incmaxdist){
      merge(*it);
      ofile.write(estr(scluster.size()-mergecount)+" "+it->dist+" "+it->x+" "+it->y+"\n");
      ofile.flush();
      // Record whether this entry was the tracked incomplete maximum while
      // the iterator is still valid, i.e. before it is erased.
      wasincmax=(it == incmaxit);
      smatrix.erase(it);
    }else{
      completemerges.insert(*it);
      wasincmax=(it == incmaxit);
    }
    if (wasincmax || incmaxit==smatrix.end())
      getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit);
  }
}
Example #9
0
// Merges cluster sdist.y into cluster sdist.x.
//
// The merge is appended to clusterData.mergearr, both clusters are flagged in
// smerge, and the member count and member list of y are transferred to x
// (scluster is redirected for every moved member). Afterwards every smatrix
// entry (y,j) reachable through inter[y] is folded into the entry (x,j) using
// a count-weighted mean distance and then erased. Entries (x,j) whose link
// count reaches scount[x]*scount[j] are inserted into completemerges.
// incmaxit is reset to smatrix.end() whenever the entry it refers to is
// completed or erased. Finally mergecount is incremented.
//
// Aborts through ldieif when x==y or when either cluster has a zero count.
void eseqclusteravg::merge(const eseqdistCount& sdist)
{
  ldieif(sdist.x==sdist.y,"should not happen!");
  ldieif(scount[sdist.x]==0 || scount[sdist.y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(sdist.x,sdist.y,sdist.dist));

  // Flag both clusters as touched by a merge, pointing at the survivor x.
  smerge[sdist.x]=sdist.x;
  smerge[sdist.y]=sdist.x;

  // Move the member count of y over to x.
  scount[sdist.x]+=scount[sdist.y];
  scount[sdist.y]=0;

  // Re-home every member of y into x.
  list<INDTYPE>::iterator it;
  for (it=incluster[sdist.y].begin(); it!=incluster[sdist.y].end(); ++it){
    scluster[*it]=sdist.x;
    incluster[sdist.x].push_back(*it);
  }

  // Lookup keys: tmpdist addresses the pair (x,j), tmpdist2 the pair (y,j).
  eseqdistCount tmpdist,tmpdist2;
//  estr tmpstr,tmpstr2;
  tmpdist.x=sdist.x;
  tmpdist2.x=sdist.y;

  INDTYPE i,j;
  // Walk over everything y had links to and fold those links into x.
  for (it=inter[sdist.y].begin(); it!=inter[sdist.y].end(); ++it){
    // Resolve the neighbour to the cluster it currently belongs to.
    j=scluster[*it];
    if (sdist.x==j || sdist.y==j) continue;
    tmpdist.y=j;
    tmpdist2.y=j;
//    xy2estr(x,j,tmpstr);
//    xy2estr(y,j,tmpstr2);

    // No entry for (y,j): nothing to fold for this neighbour.
    eseqdistavghash::iter tmpit2=smatrix.get(tmpdist2);
    if(tmpit2==smatrix.end()) continue;

    eseqdistavghash::iter tmpit=smatrix.get(tmpdist);
    if (tmpit!=smatrix.end()){
/*
      if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count+tmpit2->count){
        if((tmpit->dist*tmpit->count+tmpit2->dist*tmpit2->count)/(tmpit->count+tmpit2->count)>sdist.dist){
          cout << "sdist.dist: " << sdist.dist << " tmpit: " << *tmpit << " tmpit2: " << *tmpit2 << " sdist: " << sdist << " scount[x]: " << scount[sdist.x] << " scount[y]: " << scount[sdist.y] << " scount[j]: "<< scount[j] << endl;;
          exit(-1);
        }
      }
*/
      // (x,j) already exists: combine both entries as a count-weighted mean.
      tmpit->dist=((double)tmpit->dist*tmpit->count+(double)tmpit2->dist*tmpit2->count)/(double)(tmpit->count+tmpit2->count);
      tmpit->count+=tmpit2->count;
    }else{
      // (x,j) does not exist yet: it inherits distance and count from (y,j).
      tmpdist.dist=tmpit2->dist;
      tmpdist.count=tmpit2->count;
      smatrix.add(tmpdist,tmpdist);
      inter[sdist.x].push_back(j);
      tmpit=smatrix.get(tmpdist);
    }
    // make sure to add merged neighbors which are complete to the complete list
    // NOTE(review): lassert is defined outside this view; presumably it flags
    // the case where one side of the entry has a zero member count - confirm.
    lassert(scount[tmpit->x]==0 || scount[tmpit->y]==0);
    if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count){
//      ldieif(tmpit->dist>sdist.dist,"sdist.dist: "+estr(sdist.dist)+" tmpit.dist: "+tmpit->dist+" tmpit.count: "+tmpit->count);
      completemerges.insert(*tmpit);
      // A completed entry no longer qualifies as the tracked incomplete one.
      if (tmpit == incmaxit)
        incmaxit=smatrix.end();
    }
    // The (y,j) entry is about to be erased; drop the reference to it first.
    if (tmpit2==incmaxit)
      incmaxit=smatrix.end();
    smatrix.erase(tmpit2);
  }
  ++mergecount;
}
Example #10
0
// Program entry point.
//
// Registers the command line options and actions, loads the alignment named
// by the first positional argument, collapses identical sequences, computes
// all pairwise distances within threshold t using nthreads worker threads,
// sorts them and feeds them, starting from the end of the sorted array, into
// the enabled clusterings: -sl writes <ofile>.sl, -cl writes <ofile>.cl and
// -al writes <ofile>.al.
//
// Optional outputs: -dupfile (list of duplicate sequences), -dfile (pairwise
// distances) and -cdist (per pair: index x, index y, distance and merge
// distance, written to <ofile>.<method>.dist).
//
// Returns 0 on success. Calls exit(-1) when the alignment argument is missing
// or no clustering method was selected.
int emain()
{ 
  bool cl=false;
  bool sl=false;
  bool al=false;
  bool cdist=false;
  epregister(cl);
  epregister(sl);
  epregister(al);
  epregister(cdist);
  epregisterFunc(help);

  // Distance functions available to the -makepart action.
  dfuncpart.choice=0;
  dfuncpart.add("gap",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_compressed2>,dist_compressed2));
  dfuncpart.add("nogap",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogap_compressed2>,dist_nogap_compressed2));
  dfuncpart.add("gap2",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_compressed>,dist_compressed));
  dfuncpart.add("nogap2",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogap_compressed>,dist_nogap_compressed));
  dfuncpart.add("nogapsingle",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_nogapsingle_compressed>,dist_nogapsingle_compressed));
  dfuncpart.add("tamura",edistfunc(part_calc_dists_u<estrarray,eseqdist,dist_tamura_compressed>,dist_tamura_compressed));
  epregister(dfuncpart);

  // Distance functions available to the clustering itself.
  dfunc.choice=0;
  dfunc.add("gap",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_compressed2>,dist_compressed2));
  dfunc.add("nogap",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogap_compressed2>,dist_nogap_compressed2));
  dfunc.add("gap2",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_compressed>,dist_compressed));
  dfunc.add("nogap2",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogap_compressed>,dist_nogap_compressed));
  dfunc.add("nogapsingle",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_nogapsingle_compressed>,dist_nogapsingle_compressed));
  dfunc.add("tamura",edistfunc(t_calc_dists_u<estrarray,eseqdist,eblockarray<eseqdist>,dist_tamura_compressed>,dist_tamura_compressed));

  epregisterClass(eoption<edistfunc>);
  epregisterClassMethod4(eoption<edistfunc>,operator=,int,(const estr& val),"=");

  epregister(dfunc);

  epregister(winlen);

  estr ofile;
  estr dfile;
  estr dupfile;

  epregister(dupfile);
  epregister(ignoreUnique);
  epregister(t);
  epregister(nthreads);
  epregister(ofile);
  epregister(dfile);
  epregister(ignoreMemThres);

  getParser()->actions.add("makereps",actionMakeReps);
  getParser()->actions.add("makeotus",actionMakeOtus);
  getParser()->actions.add("makeotus_mothur",actionMakeOtusMothur);
  getParser()->actions.add("makepart",actionMakePart);
  eparseArgs(argvc,argv);

  if(argvc<2) {
    cout << "syntax: "+efile(argv[0]).basename()+" <-sl true|-cl true|-al true> <seqali>" << endl;
    cout << "\""+efile(argv[0]).basename()+ " --help\" for more help" << endl;
    exit(-1);
  }
  if(!cl && !sl && !al) {
    cout << "syntax: "+efile(argv[0]).basename()+" <-sl true|-cl true|-al true> <seqali>" << endl;
    cout << "please choose at least one clustering method <-sl true|-cl true|-al true>" << endl;
    cout << "\""+efile(argv[0]).basename()+ " --help\" for more help" << endl;
    exit(-1);
  }

  cout << "# " << date() << endl;
  cout << "# " << args2str(argvc,argv) << endl;
  cout << "# system RAM: " << getSystem()->getTotalRam()/1024 << "Mb" << endl;
  cout << "# free system RAM: " << (getSystem()->getFreeRam()+getSystem()->getBufferRam())/1024 << "Mb" << endl;
  cout << "# process memory limit: " << ((getSystem()->getMemLimit()&0x3fffffffffffff)==0x3fffffffffffff?estr("unlimited"):estr(getSystem()->getMemLimit()/1024/1024)+"Mb") << endl;

  // Thresholds are 60% / 65% of the smallest of: total RAM, process memory
  // limit, and free+buffer RAM.
  warnMemThres=MIN(MIN(getSystem()->getTotalRam(),getSystem()->getMemLimit()/1024),getSystem()->getFreeRam()+getSystem()->getBufferRam())*0.6/1024;
  exitMemThres=MIN(MIN(getSystem()->getTotalRam(),getSystem()->getMemLimit()/1024),getSystem()->getFreeRam()+getSystem()->getBufferRam())*0.65/1024;

  cout << "# warning memory threshold: " << warnMemThres << "Mb" << endl;
  cout << "# exit memory threshold: " << exitMemThres << "Mb" << endl;

  cout << "# distance function: " << dfunc.key() << endl;

  // Output files are named after the alignment unless -ofile was given.
  if (ofile.len()==0)
    ofile=argv[1];

  epregisterClass(eseqdist);
  epregisterClassSerializeMethod(eseqdist);
  epregisterClassProperty(eseqdist,dist);
  epregisterClassProperty(eseqdist,x);
  epregisterClassProperty(eseqdist,y);

  epregisterClass(ebasicarray<eseqdist>);
  epregisterClassInheritance(ebasicarray<eseqdist>,ebasearray);
  epregisterClassMethod(ebasicarray<eseqdist>,subset);
  epregisterClassSerializeMethod(ebasicarray<eseqdist>);

  long i,j;
  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs_compressed(argv[1],arr,seqlen);
#ifndef HPC_CLUST_USE_LONGIND
  // NOTE(review): assumes INDTYPE is a signed 32-bit integer when
  // HPC_CLUST_USE_LONGIND is not defined (indices are range-checked with <0
  // throughout this file) - confirm against the INDTYPE definition.
  ldieif(arr.size() > 2147483647l,"To cluster more than 2 billion sequences please recompile hpc-clust with the --enable-longind flag.");
#endif

  ebasicarray<INDTYPE> uniqind;
  earray<ebasicarray<INDTYPE> > dupslist;
  finduniq(uniqind,dupslist);
  cout << "# unique seqs: " << uniqind.size() << endl;


  // One line per dupslist entry: first index, entry size, remaining indices.
  if (dupfile.len()){
    efile dupf(dupfile,"w");
    for (i=0; i<dupslist.size(); ++i){
      dupf.write(estr(dupslist[i][0])+" "+estr(dupslist[i].size()));
      for (j=1; j<dupslist[i].size(); ++j)
        dupf.write(estr(" ")+dupslist[i][j]);
      dupf.write("\n");
    }
    dupf.close();
  }

  long maxdists=uniqind.size()*(uniqind.size()-1)/2;
  long maxmem=maxdists*sizeof(eseqdist)/1024/1024;
  cout << "# maximum number of distance pairs: " << maxdists << " (" << maxmem << "Mb)" << endl;

  if (maxmem > warnMemThres){
    cout << "# WARNING: Number of sequences provided may require more memory than is currently available on this system." << endl;
    cout << "#           Please monitor the memory usage of this program and check the log at the end. This program will" << endl;
    cout << "#           automatically exit if it reaches the exitMemThres value shown above. You can force the program" << endl;
    cout << "#           to ignore this threshold using the argument: -ignoreMemThres true" << endl;
    cout << "#           Memory requirements can be reduced by increasing the clustering threshold, or reducing the number" << endl;
    cout << "#           of sequences to be clustered. For more information and tips on optimizing hpc-clust memory" << endl;
    cout << "#           usage please refer to the documentation." << endl;
  }

  
  float dtime,stime;
  etimer t1;
  t1.reset();

  efile df(dfile);
  cout << "# computing distances" << endl;
  if (partsTotal>(arr.size()-1l)*arr.size()/20l) partsTotal=(arr.size()-1l)*arr.size()/20l; // make fewer tasks if to few calculations per task
  // For very small inputs the clamp above yields 0, which would schedule no
  // task at all; always run at least one (same guard as in actionMakeReps).
  if (partsTotal<=0) partsTotal=1;

  cerr << endl; // needed for keeping track of the progress

  for (i=0; i<partsTotal; ++i)
    taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen));

  taskman.createThread(nthreads);
  taskman.wait();
  cerr << endl;

  dtime=t1.lap()*0.001;
  cout << "# time calculating distances: " << dtime << endl;
  cout << "# distances within threshold: " << dists.size() << endl;

  cout << "# number of tasks: " << taskman.tasks.size() << endl;
  fradix256sort<eblockarray<eseqdist>,radixKey>(dists);
  cout << "# number of tasks: " << taskman.tasks.size() << endl;
  stime=t1.lap()*0.001;

  if (dfile.len()){
    cout << "# saving distances to file: "<<dfile << endl;
    for (i=0; i<dists.size(); ++i)
      df.write(estr(arr.keys(dists[i].x))+"\t"+arr.keys(dists[i].y)+"\t"+(1.0-dists[i].dist)+"\n");
    df.close();
  }

  totaldists=dists.size();
  cout << "# time sorting distances: " << stime << endl;

  cout << "# initializing cluster"<<endl;
  if (cl)
    clcluster.init(arr.size(),ofile+".cl",argv[1],dupslist);
  if (sl)
    slcluster.init(arr.size(),ofile+".sl",argv[1],dupslist);
  if (al)
    alcluster.init(arr.size(),ofile+".al",argv[1],dupslist,t,dfunc.value().calcfunc_single,arr,seqlen);

  cout << "# starting clustering"<<endl;
  t1.reset();
  // The sorted distances are consumed from the end of the array.
  for (i=dists.size()-1; i>=0; --i){
    if (cl)
      clcluster.add(dists[i]);
    if (al)
      alcluster.add(dists[i]);
    if (sl)
      slcluster.add(dists[i]);
  }
  if (al)
    alcluster.finalize();

  float clustime=t1.lap()*0.001;
  cout << "# time calculating distances: " << dtime << endl;
  cout << "# time sorting distances: " << stime << endl;
  cout << "# time clustering: " << clustime << endl;
  cout << "# total time: " << dtime+clustime+stime << endl;
  cout << "# distances within threshold: " << totaldists << endl;

  if (cdist){
    efile fsl,fcl,fal;
    if (sl) fsl.open(ofile+".sl.dist","w");
    if (cl) fcl.open(ofile+".cl.dist","w");
    // Average linkage gets its own file; it used to be opened as ".cl.dist",
    // which collided with the complete-linkage output.
    if (al) fal.open(ofile+".al.dist","w");

    for (i=dists.size()-1; i>=0; --i){
      if (sl) fsl.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+slcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n");
      if (cl) fcl.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+clcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n");
      if (al) fal.write(estr(dists[i].x)+" "+dists[i].y+" "+dists[i].dist+" "+alcluster.clusterData.getMergeDistance(dists[i].x,dists[i].y)+"\n");
    }
  }
  return(0);
}
Example #11
0
// Command line action "-makereps <alignment> <otu>".
//
// Reads an OTU file in which a line starting with '>' opens a new OTU and
// every following non-empty, non-'#' line names one member sequence in its
// first tab separated field. For every OTU one representative sequence is
// written to stdout in FASTA form: the member with the largest accumulated
// score, where the score sums the pairwise values returned by the selected
// distance function weighted by the number of duplicates of the partner
// sequence. The header line also carries avg_id and otu_size.
// Terminates the process with exit(0).
void actionMakeReps()
{
  ldieif(argvc<3,"syntax: "+efile(argv[0]).basename()+" -makereps <alignment> <otu>");
  estrhashof<INDTYPE> seqind;

  estrarray uarr;

  cout << "# loading seqs file: " << argv[1] << endl;
  load_seqs_compressed(argv[1],arr,seqind,seqlen);
  load_seqs(argv[1],uarr);

  earray<ebasicarray<INDTYPE> > otus;

  // Parse the OTU file.
  efile f;
  estr line;
  estrarray parts;
  f.open(argv[2],"r");
  while (!f.eof()){
    f.readln(line);
    if (line.len()==0) continue;
    if (line[0]=='#') continue;
    if (line[0]=='>'){
      // Start of a new OTU.
      otus.add(ebasicarray<INDTYPE>());
    }else{
      ldieif(otus.size()==0,"first entry not start of OTU or missing '>'");
      parts=line.explode("\t");
      ldieif(parts.size()==0,"array empty: "+line);
      ldieif(!seqind.exists(parts[0]),"sequence not found: "+parts[0]);
      otus[otus.size()-1].add(seqind[parts[0]]);
    }
  }

  cerr << endl;

  ebasicarray<INDTYPE> tuniqind;
  earray<ebasicarray<INDTYPE> > dupslist;
  finduniq(tuniqind,dupslist);

  // uniqmask[s] is the size of the dupslist entry of s when s is a unique
  // representative, and 0 for every other sequence.
  eintarray uniqmask;
  uniqmask.init(arr.size(),0);
  for (long u=0; u<tuniqind.size(); ++u)
    uniqmask[tuniqind[u]]=dupslist[u].size();


  taskman.createThread(nthreads);

  ebasicarray<INDTYPE> uniqind;
  const float t=0.0;
  efloatarray avgdist;
  for (long j=0; j<otus.size(); ++j){
    // An OTU with a single member is its own representative.
    if (otus[j].size()==1){
      cout << ">OTU" << j << " " << arr.keys(otus[j][0]) << " avg_id=1.0 otu_size=1" << endl;
      cout << uarr.values(otus[j][0]) << endl;
      continue;
    }

    // Keep only the members that are unique representatives.
    uniqind.clear();
    for (long m=0; m<otus[j].size(); ++m){
      if (uniqmask[otus[j][m]]==0) continue;
      uniqind.add(otus[j][m]);
    }
    ldieif(uniqind.size()==0,"empty OTU");

    if (uniqind.size()==1){
      cout << ">OTU" << j << " " << arr.keys(uniqind[0]) << " avg_id=1.0 otu_size=" << otus[j].size() << endl;
      cout << uarr.values(uniqind[0]) << endl;
      continue;
    }

    avgdist.clear();
    avgdist.init(arr.size(),0.0);
    dists.clear();
  
    partsTotal=10000;
    if (partsTotal>(uniqind.size()-1l)*uniqind.size()/20l) partsTotal=(uniqind.size()-1l)*uniqind.size()/20l; // make fewer tasks if to few calculations per task
    if (partsTotal<=0) partsTotal=1;
    
    // Compute all pairwise values among the unique members of this OTU.
    taskman.clear();
    for (long i=0; i<partsTotal; ++i)
      taskman.addTask(dfunc.value().calcfunc,evararray(mutex,uniqind,arr,dists,(const int&)seqlen,(const long int&)i,(const long int&)partsTotal,(const float&)t,(const int&)winlen));
    taskman.wait();

    // Accumulate the scores, weighting each pair by the partner's duplicates.
    for (long p=0; p<dists.size(); ++p){
      eseqdist& pairdist(dists[p]);
      avgdist[pairdist.x]+=pairdist.dist*uniqmask[pairdist.y];
      avgdist[pairdist.y]+=pairdist.dist*uniqmask[pairdist.x];
    }

    // Every member additionally gets one point per duplicate of itself; the
    // member with the highest total wins (the earliest one on ties).
    long best=uniqind[0];
    for (long c=0; c<uniqind.size(); ++c){
      long cand=uniqind[c];
      avgdist[cand]+=uniqmask[cand]-1;
      if (avgdist[best]<avgdist[cand])
        best=cand;
    }
    cout << ">OTU" << j << " " << arr.keys(best) << " avg_id=" << avgdist[best]/(otus[j].size()-1) << " otu_size=" << otus[j].size() << endl;
    cout << uarr.values(best) << endl;
  }
  cerr << endl;

  exit(0);
}