예제 #1
0
// Prepare the single-linkage clusterer for `count` sequences and start the
// merge log.
//
//  - count:     number of sequences; each one starts as its own cluster.
//  - ofilename: path of the merge log, opened in "w" mode.
//  - seqsfile:  name echoed into the first header line of the log.
//  - dupslist:  groups of indices; within each group every element after the
//               first is logged as merged with the first at distance 1.0.
//
// Only the log file, clusterData.mergearr and mergecount are updated for the
// duplicate groups; scount/scluster/smerge/incluster are left in their
// one-cluster-per-sequence state.
void eseqclustersingle::init(INDTYPE count,const estr& ofilename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist) {
  // Merge log with its two header comment lines.
  ofile.open(ofilename,"w");
  ofile.write("# seqsfile: "+seqsfile+"\n");
  ofile.write("# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2\n");

  mergecount=0;

  // Size the per-sequence tables up front.
  incluster.reserve(count);
  smerge.reserve(count);
  scluster.reserve(count);
  scount.reserve(count);

  // Sequence `seq` starts alone in cluster `seq`, with no merge recorded.
  for (long seq=0; seq<count; ++seq){
    incluster.add(list<INDTYPE>());
    incluster[seq].push_back(seq);
    smerge.add(-1);
    scluster.add(seq);
    scount.add(1);
  }

  // Log every duplicate as a merge with the first element of its group.
  for (long grp=0; grp<dupslist.size(); ++grp){
    long member=1;
    while (member<dupslist[grp].size()){
      ++mergecount;
      ofile.write(estr(scluster.size()-mergecount)+" 1.0 "+dupslist[grp][0]+" "+dupslist[grp][member]+"\n");
      clusterData.mergearr.add(eseqdist(dupslist[grp][0],dupslist[grp][member],1.0));
      ++member;
    }
  }

  cout << "# initializing cluster with: "<< count<< " seqs" << endl; 
}
예제 #2
0
// Read a merge log from `f` into mergearr, discarding whatever mergearr held
// before, and store `_count` in the `count` member.
//
// Empty lines and lines whose first character is '#' are skipped. Every other
// line is split on " " and must have one of two layouts:
//   3 fields: <distance> <id1> <id2>
//   4 fields: <ignored> <distance> <id1> <id2>
// (the 4-field layout lines up with the
// "# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2" header written
// by the init routines). Any other field count is reported through ldie.
void eseqclusterData::load(const efile& f,int _count)
{
  count=_count;
  mergearr.clear();
  estr line;
  estrarray parts;
  while (f.readln(line)){
    if (line.len()==0 || line[0]=='#') continue;
    parts=line.explode(" ");
    if (parts.size()==3){
      // <distance> <id1> <id2>
      mergearr.add(eseqdist(parts[1].i(),parts[2].i(),parts[0].f()));
    }else if (parts.size()==4){
      // <ignored> <distance> <id1> <id2>; the leading field is not read.
      mergearr.add(eseqdist(parts[2].i(),parts[3].i(),parts[1].f()));
    }else{
      // Both 3- and 4-field lines are accepted, so say so in the diagnostic.
      ldie("fields in line different than 3 or 4: "+line);
    }
  }
}
예제 #3
0
// Prepare the average-linkage clusterer for `count` sequences and start the
// merge log.
//
//  - count:     number of sequences; each one starts as its own cluster.
//  - filename:  path of the merge log, opened in "w" mode.
//  - seqsfile:  name echoed into the first header line of the log.
//  - dupslist:  groups of indices; within each group every element after the
//               first is logged as merged with the first at distance 1.0.
//  - _thres, _fdist, _seqarr, _seqlen: stored in thres, fdist, seqarr (as a
//               pointer to the caller's array) and seqlen respectively.
//
// Finally reserves room in smatrix for count*count/20000/2 entries.
void eseqclusteravg::init(INDTYPE count,const estr& filename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist,float _thres,float (_fdist)(const estr&,const estr&,int),estrarray& _seqarr,int _seqlen)
{
  // Caller-supplied settings.
  fdist=_fdist;
  seqlen=_seqlen;
  seqarr=&_seqarr;
  thres=_thres;

  // Merge log with its two header comment lines.
  ofile.open(filename,"w");
  ofile.write("# seqsfile: "+seqsfile+"\n");
  ofile.write("# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2\n");

  // Reset the running state.
  incmaxdist=1.0;
  incmaxit=smatrix.end();
  cf=0;
  lastdist=0.0;
  mergecount=0;

  // Size the per-sequence tables up front.
  incluster.reserve(count);
  inter.reserve(count);
  smerge.reserve(count);
  scluster.reserve(count);
  scount.reserve(count);

  // Sequence `seq` starts alone in cluster `seq`, with no merge recorded and
  // an empty `inter` list.
  for (long seq=0; seq<count; ++seq){
    inter.add(list<INDTYPE>());
    incluster.add(list<INDTYPE>());
    incluster[seq].push_back(seq);
    smerge.add(-1);
    scluster.add(seq);
    scount.add(1);
  }

  // Log every duplicate as a merge with the first element of its group.
  for (long grp=0; grp<dupslist.size(); ++grp){
    long member=1;
    while (member<dupslist[grp].size()){
      ++mergecount;
      ofile.write(estr(scluster.size()-mergecount)+" 1.0 "+dupslist[grp][0]+" "+dupslist[grp][member]+"\n");
      clusterData.mergearr.add(eseqdist(dupslist[grp][0],dupslist[grp][member],1.0));
      ++member;
    }
  }

  // Number of smatrix entries to reserve; computed once, reported, then used.
  const long smatrixHint=static_cast<long>(count)*static_cast<long>(count)/20000L/2L;
  cout << "# initializing cluster with: "<< count<< " seqs" << endl; 
  cout << "# initializing smatrix with: " << smatrixHint<< " elements" << endl; 
  smatrix.reserve(smatrixHint);
}
예제 #4
0
// Merge cluster `y` into cluster `x` at distance `dist`.
//
// A call with x==y is a no-op. Otherwise the merge is appended to
// clusterData.mergearr, `x` absorbs the member count and member list of `y`,
// every member of `y` is relabelled to `x` in scluster, and one line
// "<remaining> <dist> <x> <y>" is written to the merge log, where <remaining>
// is scluster.size() minus the updated mergecount.
void eseqclustersingle::merge(INDTYPE x,INDTYPE y,float dist)
{
  if (x==y) return;
  ldieif(scount[x]==0 || scount[y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(x,y,dist));

  // Both clusters now point at `x` as their merge target.
  smerge[y]=x;
  smerge[x]=x;

  // Hand the member count of `y` over to `x`.
  scount[x]+=scount[y];
  scount[y]=0;

  // Relabel the members of `y` and append them to the member list of `x`.
  // The two lists are distinct because x!=y here.
  list<INDTYPE>& absorbed=incluster[y];
  list<INDTYPE>& survivor=incluster[x];
  list<INDTYPE>::iterator member=absorbed.begin();
  while (member!=absorbed.end()){
    scluster[*member]=x;
    survivor.push_back(*member);
    ++member;
  }

  ++mergecount;
  ofile.write(estr(scluster.size()-mergecount)+" "+dist+" "+x+" "+y+"\n");
}
예제 #5
0
// Merge cluster sdist.y into cluster sdist.x and fold the pairwise entries
// that smatrix holds for sdist.y into the corresponding entries for sdist.x.
//
// Steps, in order:
//   1. Record the merge in clusterData.mergearr.
//   2. Point smerge[x] and smerge[y] at x; move the member count of y to x.
//   3. Relabel every member of y to x in scluster and append it to
//      incluster[x].
//   4. For each entry of inter[y], combine the (y,j) record of smatrix into
//      the (x,j) record (see the loop below), then erase the (y,j) record.
//   5. Increment mergecount.
//
// Unlike eseqclustersingle::merge, nothing is written to ofile here.
void eseqclusteravg::merge(const eseqdistCount& sdist)
{
  // Sanity checks: the two clusters must differ and both must be non-empty.
  ldieif(sdist.x==sdist.y,"should not happen!");
  ldieif(scount[sdist.x]==0 || scount[sdist.y]==0,"also should not happen");

  clusterData.mergearr.add(eseqdist(sdist.x,sdist.y,sdist.dist));

  smerge[sdist.x]=sdist.x;
  smerge[sdist.y]=sdist.x;

  // x takes over the member count of y; from here on scount[y] is 0.
  scount[sdist.x]+=scount[sdist.y];
  scount[sdist.y]=0;

  // Relabel the members of y and append them to the member list of x.
  list<INDTYPE>::iterator it;
  for (it=incluster[sdist.y].begin(); it!=incluster[sdist.y].end(); ++it){
    scluster[*it]=sdist.x;
    incluster[sdist.x].push_back(*it);
  }

  // Lookup keys: tmpdist addresses the (x,j) record, tmpdist2 the (y,j) record.
  eseqdistCount tmpdist,tmpdist2;
//  estr tmpstr,tmpstr2;
  tmpdist.x=sdist.x;
  tmpdist2.x=sdist.y;

  INDTYPE i,j;
  for (it=inter[sdist.y].begin(); it!=inter[sdist.y].end(); ++it){
    // j is the cluster that the listed entry currently belongs to.
    j=scluster[*it];
    // Skip entries that now resolve to one of the two clusters being merged.
    if (sdist.x==j || sdist.y==j) continue;
    tmpdist.y=j;
    tmpdist2.y=j;
//    xy2estr(x,j,tmpstr);
//    xy2estr(y,j,tmpstr2);

    // Nothing to fold in if smatrix has no (y,j) record.
    eseqdistavghash::iter tmpit2=smatrix.get(tmpdist2);
    if(tmpit2==smatrix.end()) continue;

    eseqdistavghash::iter tmpit=smatrix.get(tmpdist);
    if (tmpit!=smatrix.end()){
/*
      if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count+tmpit2->count){
        if((tmpit->dist*tmpit->count+tmpit2->dist*tmpit2->count)/(tmpit->count+tmpit2->count)>sdist.dist){
          cout << "sdist.dist: " << sdist.dist << " tmpit: " << *tmpit << " tmpit2: " << *tmpit2 << " sdist: " << sdist << " scount[x]: " << scount[sdist.x] << " scount[y]: " << scount[sdist.y] << " scount[j]: "<< scount[j] << endl;;
          exit(-1);
        }
      }
*/
      // Both records exist: the new distance is the count-weighted mean of
      // the two distances (computed in double), and the counts are summed.
      tmpit->dist=((double)tmpit->dist*tmpit->count+(double)tmpit2->dist*tmpit2->count)/(double)(tmpit->count+tmpit2->count);
      tmpit->count+=tmpit2->count;
    }else{
      // No (x,j) record yet: insert a copy of the (y,j) values under the
      // (x,j) key, and remember j in inter[x].
      // NOTE(review): tmpit2 was obtained before this add(); whether it stays
      // valid depends on the hash implementation — confirm.
      tmpdist.dist=tmpit2->dist;
      tmpdist.count=tmpit2->count;
      smatrix.add(tmpdist,tmpdist);
      inter[sdist.x].push_back(j);
      tmpit=smatrix.get(tmpdist);
    }
    // make sure to add merged neighbors which are complete to the complete list
    // NOTE(review): both counts are expected to be non-zero at this point, so
    // lassert presumably fires when its condition is TRUE (like ldieif) —
    // confirm against the macro definition.
    lassert(scount[tmpit->x]==0 || scount[tmpit->y]==0);
    // The record is treated as complete when its count equals the product of
    // the two cluster sizes; it is then copied into completemerges.
    if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count){
//      ldieif(tmpit->dist>sdist.dist,"sdist.dist: "+estr(sdist.dist)+" tmpit.dist: "+tmpit->dist+" tmpit.count: "+tmpit->count);
      completemerges.insert(*tmpit);
      // Reset incmaxit to end() if it referred to the record just completed.
      if (tmpit == incmaxit)
        incmaxit=smatrix.end();
    }
    // Reset incmaxit before the (y,j) record it may refer to is erased.
    if (tmpit2==incmaxit)
      incmaxit=smatrix.end();
    smatrix.erase(tmpit2);
  }
  ++mergecount;
}