Esempio n. 1
0
/** Use modern version of the Fisher-Yates shuffle to randomly reorder the
  * given points.
  */
void Cluster_Kmeans::ShufflePoints( Iarray& PointIndices ) {
  for (unsigned int i = PointIndices.size() - 1; i != 1; i--)
  { // 0 <= j <= i
    unsigned int j = (unsigned int)(RN_.rn_gen() * (double)i);
    int temp = PointIndices[j];
    PointIndices[j] = PointIndices[i];
    PointIndices[i] = temp;
  }
  if (debug_ > 0) { 
    mprintf("DEBUG: Shuffled points:");
    for (Iarray::const_iterator it = PointIndices.begin();
                                it != PointIndices.end(); ++it)
      mprintf(" %i", *it);
    mprintf("\n");
  }
}
Esempio n. 2
0
int SequenceAlign(CpptrajState& State, ArgList& argIn) {
  std::string blastfile = argIn.GetStringKey("blastfile");
  if (blastfile.empty()) {
    mprinterr("Error: 'blastfile' must be specified.\n");
    return 1;
  }
  ReferenceFrame qref = State.DSL()->GetReferenceFrame(argIn);
  if (qref.error() || qref.empty()) {
    mprinterr("Error: Must specify reference structure for query.\n");
    return 1;
  }
  std::string outfilename = argIn.GetStringKey("out");
  if (outfilename.empty()) {
    mprinterr("Error: Must specify output file.\n");
    return 1;
  }
  TrajectoryFile::TrajFormatType fmt = TrajectoryFile::GetFormatFromArg(argIn);
  if (fmt != TrajectoryFile::PDBFILE && fmt != TrajectoryFile::MOL2FILE)
    fmt = TrajectoryFile::PDBFILE; // Default to PDB
  int smaskoffset = argIn.getKeyInt("smaskoffset", 0) + 1;
  int qmaskoffset = argIn.getKeyInt("qmaskoffset", 0) + 1;

  // Load blast file
  mprintf("\tReading BLAST alignment from '%s'\n", blastfile.c_str());
  BufferedLine infile;
  if (infile.OpenFileRead( blastfile )) return 1;
  // Seek down to first Query line.
  const char* ptr = infile.Line();
  bool atFirstQuery = false;
  while (ptr != 0) {
    if (*ptr == 'Q') {
      if ( strncmp(ptr, "Query", 5) == 0 ) {
        atFirstQuery = true;
        break;
      }
    }
    ptr = infile.Line();
  }
  if (!atFirstQuery) {
    mprinterr("Error: 'Query' not found.\n");
    return 1;
  }

  // Read alignment. Replacing query with subject.
  typedef std::vector<char> Carray;
  typedef std::vector<int> Iarray;
  Carray Query; // Query residues
  Carray Sbjct; // Sbjct residues
  Iarray Smap;  // Smap[Sbjct index] = Query index
  while (ptr != 0) {
    const char* qline = ptr;           // query line
    const char* aline = infile.Line(); // alignment line
    const char* sline = infile.Line(); // subject line
    if (aline == 0 || sline == 0) {
      mprinterr("Error: Missing alignment line or subject line after Query:\n");
      mprinterr("Error:  %s", qline);
      return 1;
    }
    for (int idx = 12; qline[idx] != ' '; idx++) {
      if (qline[idx] == '-') {
        // Sbjct does not have corresponding res in Query
        Smap.push_back(-1);
        Sbjct.push_back( sline[idx] );
      } else if (sline[idx] == '-') {
        // Query does not have a corresponding res in Sbjct
        Query.push_back( qline[idx] );
      } else {
        // Direct Query to Sbjct map
        Smap.push_back( Query.size() );
        Sbjct.push_back( sline[idx] );
        Query.push_back( qline[idx] );
      }
    }
    // Scan to next Query 
    ptr = infile.Line();
    while (ptr != 0) {
      if (*ptr == 'Q') {
        if ( strncmp(ptr, "Query", 5) == 0 ) break;
      }
      ptr = infile.Line();
    }
  }
  // DEBUG
  std::string SmaskExp, QmaskExp;
  if (State.Debug() > 0) mprintf("  Map of Sbjct to Query:\n");
  for (int sres = 0; sres != (int)Sbjct.size(); sres++) {
    if (State.Debug() > 0)
      mprintf("%-i %3s %i", sres+smaskoffset, Residue::ConvertResName(Sbjct[sres]),
              Smap[sres]+qmaskoffset);
    const char* qres = "";
    if (Smap[sres] != -1) {
      qres = Residue::ConvertResName(Query[Smap[sres]]);
      if (SmaskExp.empty())
        SmaskExp.assign( integerToString(sres+smaskoffset) );
      else
        SmaskExp.append( "," + integerToString(sres+smaskoffset) );
      if (QmaskExp.empty())
        QmaskExp.assign( integerToString(Smap[sres]+qmaskoffset) );
      else
        QmaskExp.append( "," + integerToString(Smap[sres]+qmaskoffset) );

    }
    if (State.Debug() > 0) mprintf(" %3s\n", qres);
  }
  mprintf("Smask: %s\n", SmaskExp.c_str());
  mprintf("Qmask: %s\n", QmaskExp.c_str());
  // Check that query residues match reference.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    if (qres != -1) {
      if (Query[qres] != qref.Parm().Res(qres).SingleCharName()) {
        mprintf("Warning: Potential residue mismatch: Query %s reference %s\n",
                Residue::ConvertResName(Query[qres]), qref.Parm().Res(qres).c_str());
      }
    }
  }
  // Build subject using coordinate from reference.
  //AtomMask sMask; // Contain atoms that should be in sTop
  Topology sTop;
  Frame sFrame;
  Iarray placeHolder; // Atom indices of placeholder residues.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    NameType SresName( Residue::ConvertResName(Sbjct[sres]) );
    if (qres != -1) {
      Residue const& QR = qref.Parm().Res(qres);
      Residue SR(SresName, sres+1, ' ', QR.ChainID());
      if (Query[qres] == Sbjct[sres]) { // Exact match. All non-H atoms.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          if (qref.Parm()[qat].Element() != Atom::HYDROGEN)
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
            //sMask.AddAtom(qat);
        }
      } else { // Partial match. Copy only backbone and CB.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          if ( qref.Parm()[qat].Name().Match("N" ) ||
               qref.Parm()[qat].Name().Match("CA") ||
               qref.Parm()[qat].Name().Match("CB") ||
               qref.Parm()[qat].Name().Match("C" ) ||
               qref.Parm()[qat].Name().Match("O" ) )
          {
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
          }
        }
      }
    } else {
      // Residue in query does not exist for subject. Just put placeholder CA for now.
      Vec3 Zero(0.0);
      placeHolder.push_back( sTop.Natom() );
      sTop.AddTopAtom( Atom("CA", "C "), Residue(SresName, sres+1, ' ', ' ') );
      sFrame.AddXYZ( Zero.Dptr() );
    }
  }
  //sTop.PrintAtomInfo("*");
  mprintf("\tPlaceholder residue indices:");
  for (Iarray::const_iterator p = placeHolder.begin(); p != placeHolder.end(); ++p)
    mprintf(" %i", *p + 1);
  mprintf("\n");
  // Try to give placeholders more reasonable coordinates.
  if (!placeHolder.empty()) {
    Iarray current_indices;
    unsigned int pidx = 0;
    while (pidx < placeHolder.size()) {
      if (current_indices.empty()) {
        current_indices.push_back( placeHolder[pidx++] );
        // Search for the end of this segment
        for (; pidx != placeHolder.size(); pidx++) {
          if (placeHolder[pidx] - current_indices.back() > 1) break;
          current_indices.push_back( placeHolder[pidx] );
        }
        // DEBUG
        mprintf("\tSegment:");
        for (Iarray::const_iterator it = current_indices.begin();
                                    it != current_indices.end(); ++it)
          mprintf(" %i", *it + 1);
        // Get coordinates of residues bordering segment.
        int prev_res = sTop[current_indices.front()].ResNum() - 1;
        int next_res = sTop[current_indices.back() ].ResNum() + 1;
        mprintf(" (prev_res=%i, next_res=%i)\n", prev_res+1, next_res+1);
        Vec3 prev_crd(sFrame.XYZ(current_indices.front() - 1));
        Vec3 next_crd(sFrame.XYZ(current_indices.back()  + 1));
        prev_crd.Print("prev_crd");
        next_crd.Print("next_crd");
        Vec3 crd_step = (next_crd - prev_crd) / (double)(current_indices.size()+1);
        crd_step.Print("crd_step");
        double* xyz = sFrame.xAddress() + (current_indices.front() * 3);
        for (unsigned int i = 0; i != current_indices.size(); i++, xyz += 3) {
          prev_crd += crd_step;
          xyz[0] = prev_crd[0];
          xyz[1] = prev_crd[1];
          xyz[2] = prev_crd[2];
        }
        current_indices.clear();
      }
    }
  }
  //Topology* sTop = qref.Parm().partialModifyStateByMask( sMask );
  //if (sTop == 0) return 1;
  //Frame sFrame(qref.Coord(), sMask);
  // Write output traj
  Trajout_Single trajout;
  if (trajout.PrepareTrajWrite(outfilename, argIn, &sTop, CoordinateInfo(), 1, fmt)) return 1;
  if (trajout.WriteSingle(0, sFrame)) return 1;
  trajout.EndTraj();
  return 0;
}
Esempio n. 3
0
// Analysis_TI::Calc_Increment()
int Analysis_TI::Calc_Increment() {
  // Determine max points if not given.
  int maxpts = avg_max_;
  if (maxpts == -1) {
    for (unsigned int idx = 0; idx != input_dsets_.size(); idx++) {
      DataSet_1D const& ds = static_cast<DataSet_1D const&>( *(input_dsets_[idx]) );
      if (maxpts == -1)
        maxpts = (int)ds.Size();
      else if (maxpts != (int)ds.Size()) {
        mprintf("Warning: # points in '%s' (%zu) is different than %i.\n",
                ds.legend(), ds.Size(), maxpts);
        maxpts = std::min( maxpts, (int)ds.Size() );
        mprintf("Warning:   Will only use %i points.\n", maxpts);
      }
    }
  }
  if (maxpts < 1) {
    mprinterr("Error: Max points to use is < 1.\n");
    return 1;
  }
  if (avg_skip_ >= maxpts) {
    mprinterr("Error: 'avgskip' (%i) > max (%i).\n", avg_skip_, maxpts);
    return 1;
  }
  // sum: Hold the results of integration for each curve (increment)
  Darray sum;
  // points: Hold point values at which each avg is being calculated
  Iarray points;
  // Loop over input data sets. 
  for (unsigned int idx = 0; idx != input_dsets_.size(); idx++) {
    DataSet_1D const& ds = static_cast<DataSet_1D const&>( *(input_dsets_[idx]) );
    if (CheckSet(ds)) return 1; 
    // Calculate averages for each increment
    Darray avg;
    Iarray increments;
    int count = 0;
    int endpt = maxpts -1;
    double currentSum = 0.0;
    if (debug_ > 0) mprintf("DEBUG: Lambda %g\n", xval_[idx]);
    for (int pt = avg_skip_; pt != maxpts; pt++)
    {
      currentSum += ds.Dval(pt);
      count++;
      if (count == avg_increment_ || pt == endpt) {
        avg.push_back( currentSum / ((double)(pt - avg_skip_ + 1)) );
        increments.push_back(pt+1);
        if (debug_ > 0)
          mprintf("DEBUG:\t\tAvg from %i to %i: %g\n", avg_skip_+1, pt+1, avg.back());
        count = 0;
      }
    }
    if (sum.empty()) {
      sum.resize(avg.size());
      points = increments;
    } else if (sum.size() != avg.size()) {
      mprinterr("Error: Different # of increments for set '%s'; got %zu, expected %zu.\n",
                ds.legend(), avg.size(), sum.size());
      return 1;
    }
    // Create increment curve data sets
    if (curve_.empty()) {
      MetaData md(dAout_->Meta().Name(), "TIcurve");
      for (unsigned int j = 0; j != avg.size(); j++) {
        md.SetIdx( increments[j] );
        DataSet* ds = masterDSL_->AddSet(DataSet::XYMESH, md);
        if (ds == 0) return Analysis::ERR;
        ds->ModifyDim(Dimension::X).SetLabel("Lambda");
        ds->SetLegend( md.Name() + "_Skip" + integerToString(increments[j]) );
        if (curveout_ != 0) curveout_->AddDataSet( ds );
        curve_.push_back( ds );
      }
    }
    for (unsigned int j = 0; j != avg.size(); j++) {
      DataSet_Mesh& CR = static_cast<DataSet_Mesh&>( *(curve_[j]) );
      CR.AddXY(xval_[idx], avg[j]);
      if (mode_ == GAUSSIAN_QUAD)
        sum[j] += (wgt_[idx] * avg[j]);
    }
  } // END loop over data sets
  if (mode_ == TRAPEZOID) Integrate_Trapezoid(sum);
  // Store final integration values
  DataSet_Mesh& DA = static_cast<DataSet_Mesh&>( *dAout_ );
  DA.ModifyDim(Dimension::X).SetLabel("Point");
  for (unsigned int j = 0; j != points.size(); j++)
    DA.AddXY(points[j], sum[j]);

  return 0;
}
Esempio n. 4
0
// Cluster_Kmeans::Cluster()
int Cluster_Kmeans::Cluster() {
  // First determine which frames are being clustered.
  Iarray const& FramesToCluster = FrameDistances().FramesToCluster();

  // Determine seeds
  FindKmeansSeeds( FramesToCluster );

  if (mode_ == RANDOM)
    RN_.rn_set( kseed_ );

  int pointCount = (int)FramesToCluster.size();

  // This array will hold the indices of the points to process each iteration.
  // If sequential this is just 0 -> pointCount. If random this will be 
  // reassigned each iteration.
  Iarray PointIndices;
  PointIndices.reserve( pointCount );
  for (int processIdx = 0; processIdx != pointCount; processIdx++)
    PointIndices.push_back( processIdx );

  // Add the seed clusters
  for (Iarray::const_iterator seedIdx = SeedIndices_.begin();
                              seedIdx != SeedIndices_.end(); ++seedIdx)
  {
    int seedFrame = FramesToCluster[ *seedIdx ];
    // A centroid is created for new clusters.
    AddCluster( ClusterDist::Cframes(1, seedFrame) );
    // NOTE: No need to calc best rep frame, only 1 frame.
    if (debug_ > 0)
      mprintf("Put frame %i in cluster %i (seed index=%i).\n", 
              seedFrame, clusters_.back().Num(), *seedIdx);
  }
  // Assign points in 3 passes. If a point looked like it belonged to cluster A
  // at first, but then we added many other points and altered our cluster 
  // shapes, its possible that we will want to reassign it to cluster B.
  for (int iteration = 0; iteration != maxIt_; iteration++)
  {
    if (mode_ == RANDOM)
      ShufflePoints( PointIndices );
    // Add each point to an existing cluster, and recompute centroid
    mprintf("\tRound %i: ", iteration);
    ProgressBar progress( PointIndices.size() );
    int Nchanged = 0;
    int prog = 0;
    for (Iarray::const_iterator pointIdx = PointIndices.begin();
                                pointIdx != PointIndices.end(); ++pointIdx, ++prog)
    {
      if (debug_ < 1) progress.Update( prog );
      int oldClusterIdx = -1;
//      if ( iteration != 0 || mode_ != SEQUENTIAL) // FIXME: Should this really happen for RANDOM
//      {
        int pointFrame = FramesToCluster[ *pointIdx ];
        if (debug_ > 0)
          mprintf("DEBUG: Processing frame %i (index %i)\n", pointFrame, *pointIdx);
        bool pointWasYanked = true;
        if (iteration > 0) {
          // Yank this point out of its cluster, recompute the centroid
          for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1)
          {
            if (C1->HasFrame( pointFrame )) 
            {
              // If this point is alone in its cluster its in the right place
              if (C1->Nframes() == 1) {
                pointWasYanked = false;
                continue; // FIXME: should this be a break?
              }
              //oldBestRep = C1->BestRepFrame(); 
              oldClusterIdx = C1->Num();
              C1->RemoveFrameUpdateCentroid( Cdist_, pointFrame ); // TEST
//              C1->RemoveFrameFromCluster( pointFrame );
              //newBestRep = C1->FindBestRepFrame();
//              C1->CalculateCentroid( Cdist_ );
              if (debug_ > 0)
                mprintf("Remove Frame %i from cluster %i\n", pointFrame, C1->Num());
              //if (clusterToClusterCentroid_) {
              //  if (oldBestRep != NewBestRep)
              //    C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist?
              //  C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior
              //} 
            }
          }
        } else {
          // First iteration. If this point is already in a cluster it is a seed.
          for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1)
          {
            if (C1->HasFrame( pointFrame )) {
              pointWasYanked = false;
              if (debug_ > 0)
                mprintf("Frame %i was already used to seed cluster %i\n", 
                        pointFrame, C1->Num());
              continue; // FIXME break?
            }
          }
        }
        if (pointWasYanked) {
          // Find out what cluster this point is now closest to.
          double closestDist = -1.0;
          cluster_it closestCluster = clusters_.begin();
          for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1)
          {
            double dist = Cdist_->FrameCentroidDist(pointFrame, C1->Cent());
            if (closestDist < 0.0 || dist < closestDist)
            {
              closestDist = dist;
              closestCluster = C1;
            }
          }
          //oldBestRep = closestCluster->BestRepFrame();
          closestCluster->AddFrameUpdateCentroid( Cdist_, pointFrame ); // TEST
//          closestCluster->AddFrameToCluster( pointFrame );
          //newBestRep = closestCluster->FindBestFrameFrame();
//          closestCluster->CalculateCentroid( Cdist_ );
          if (closestCluster->Num() != oldClusterIdx)
          {
            Nchanged++;
            if (debug_ > 0)
              mprintf("Remove Frame %i from cluster %i, but add to cluster %i (dist= %f).\n",
                      pointFrame, oldClusterIdx, closestCluster->Num(), closestDist);
          } else {
            if (debug_ > 0)
              mprintf("Frame %i staying in cluster %i\n", pointFrame, closestCluster->Num());
          }
          if (clusterToClusterCentroid_) {
            //if (oldBestRep != NewBestRep) {
            //    C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist?
            //  C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior
            //}
          }
        }
//      }
    } // END loop over points to cluster
    if (Nchanged == 0) {
      mprintf("\tK-means round %i: No change. Skipping the rest of the iterations.\n", iteration);
      break;
    } else
      mprintf("\tK-means round %i: %i points changed cluster assignment.\n", iteration, Nchanged);
  } // END k-means iterations
  // Remove any empty clusters
  // FIXME: Will there ever be empty clusters?
  RemoveEmptyClusters();
  // NOTE in PTRAJ here align all frames to best rep 
  return 0;
}
Esempio n. 5
0
/** Find some seed-points for K-means clustering. Take the first point as an 
  * arbitrary first choice.  Then, at each iteration, add the point whose total
  * distance from our set of seeds is as large as possible.
  */
int Cluster_Kmeans::FindKmeansSeeds(Iarray const& FramesToCluster) {
  // SeedIndices will hold indices into FramesToCluster
  SeedIndices_.resize( nclusters_, 1 ); // 1 used to be consistent with ptraj

  double bestDistance = 0.0;
  int frameCount = (int)FramesToCluster.size();
  for (int frameIdx = 0; frameIdx != frameCount; frameIdx++)
  {
    int seedFrame = FramesToCluster[ frameIdx ];
    for (int candidateIdx = frameIdx; candidateIdx < frameCount; candidateIdx++)
    {
      int candidateFrame = FramesToCluster[ candidateIdx ];
      double dist = FrameDistances().GetFdist( seedFrame, candidateFrame );
      if (dist > bestDistance)
      {
        bestDistance = dist;
        SeedIndices_[0] = frameIdx;
        SeedIndices_[1] = candidateIdx;
      }
    }
  }

  for (int seedIdx = 2; seedIdx != nclusters_; seedIdx++)
  {
    bestDistance = 0.0;
    int bestIdx = 0;
    for (int candidateIdx = 0; candidateIdx < frameCount; candidateIdx++)
    {
      // Make sure this candidate isnt already a seed
      bool skipCandidate = false;
      for (int checkIdx = 0; checkIdx != seedIdx; checkIdx++)
      {
        if (SeedIndices_[checkIdx] == candidateIdx) {
          skipCandidate = true;
          break;
        }
      }
      if (!skipCandidate) {
        // Get the closest distance from this candidate to a current seed
        int candidateFrame = FramesToCluster[ candidateIdx ];
        double nearestDist = -1.0;
        for (int checkIdx = 0; checkIdx != seedIdx; checkIdx++)
        {
          int seedFrame = FramesToCluster[ SeedIndices_[checkIdx] ];
          double dist = FrameDistances().GetFdist( candidateFrame, seedFrame );
          if (dist < nearestDist || nearestDist < 0.0)
            nearestDist = dist;
        }
        // Is this the best so far?
        if (nearestDist > bestDistance)
        {
          bestDistance = nearestDist;
          bestIdx = candidateIdx;
        }
      }
    }
    SeedIndices_[seedIdx] = bestIdx;
  }
  if (debug_ > 0)
    for (unsigned int si = 0; si != SeedIndices_.size(); si++)
      mprintf("DEBUG:\t\tSeedIndices[%u]= %i\n", si, SeedIndices_[si]);
  return 0;
}