/** Randomly reorder the given points in place with a Fisher-Yates style
  * shuffle driven by the class random number generator (RN_).
  * \param PointIndices Array of point indices; reordered in place.
  *
  * Arrays with fewer than 3 elements are left untouched. The loop below stops
  * before i == 1, so it never did anything for 2 elements; for 0 or 1
  * elements the unsigned start index (size - 1) would wrap around or run past
  * zero and index far outside the array.
  */
void Cluster_Kmeans::ShufflePoints( Iarray& PointIndices ) {
  if (PointIndices.size() > 2) {
    // NOTE(review): a textbook Fisher-Yates shuffle would also process i == 1
    // and draw j from 0 <= j <= i. The bounds are kept exactly as they were so
    // that runs with a fixed seed keep producing the same order.
    for (unsigned int i = PointIndices.size() - 1; i != 1; i--)
    {
      // Scale the generator output by i to pick the swap partner.
      // Presumably rn_gen() is in [0,1), giving 0 <= j < i - TODO confirm.
      unsigned int j = (unsigned int)(RN_.rn_gen() * (double)i);
      int temp = PointIndices[j];
      PointIndices[j] = PointIndices[i];
      PointIndices[i] = temp;
    }
  }
  if (debug_ > 0) {
    mprintf("DEBUG: Shuffled points:");
    for (Iarray::const_iterator it = PointIndices.begin();
                                it != PointIndices.end(); ++it)
      mprintf(" %i", *it);
    mprintf("\n");
  }
}
/** Build a "subject" structure from a BLAST alignment and a query reference
  * structure, then write it out as a single frame.
  *
  * Keywords read from argIn: 'blastfile' (required), 'out' (required),
  * 'smaskoffset' and 'qmaskoffset' (default 0), plus whatever the reference
  * frame and trajectory format parsers consume. Output format falls back to
  * PDB unless PDB or MOL2 was requested.
  *
  * For every subject residue that maps onto a query residue, atoms are copied
  * from the reference: all non-hydrogen atoms when the residue letters match,
  * otherwise only N, CA, CB, C and O. Subject residues with no query partner
  * get a single placeholder CA whose position is interpolated between the
  * atoms bordering the placeholder segment.
  *
  * \param State Supplies the data set list (for the reference) and debug level.
  * \param argIn Argument list holding the keywords described above.
  * \return 0 on success, 1 on error.
  */
int SequenceAlign(CpptrajState& State, ArgList& argIn) {
  std::string blastfile = argIn.GetStringKey("blastfile");
  if (blastfile.empty()) {
    mprinterr("Error: 'blastfile' must be specified.\n");
    return 1;
  }
  ReferenceFrame qref = State.DSL()->GetReferenceFrame(argIn);
  if (qref.error() || qref.empty()) {
    mprinterr("Error: Must specify reference structure for query.\n");
    return 1;
  }
  std::string outfilename = argIn.GetStringKey("out");
  if (outfilename.empty()) {
    mprinterr("Error: Must specify output file.\n");
    return 1;
  }
  TrajectoryFile::TrajFormatType fmt = TrajectoryFile::GetFormatFromArg(argIn);
  if (fmt != TrajectoryFile::PDBFILE && fmt != TrajectoryFile::MOL2FILE)
    fmt = TrajectoryFile::PDBFILE; // Default to PDB
  // Offsets are stored +1 so that internal 0-based indices print as 1-based.
  int smaskoffset = argIn.getKeyInt("smaskoffset", 0) + 1;
  int qmaskoffset = argIn.getKeyInt("qmaskoffset", 0) + 1;

  // Load blast file
  mprintf("\tReading BLAST alignment from '%s'\n", blastfile.c_str());
  BufferedLine infile;
  if (infile.OpenFileRead( blastfile )) return 1;
  // Seek down to first Query line.
  const char* ptr = infile.Line();
  bool atFirstQuery = false;
  while (ptr != 0) {
    if (*ptr == 'Q') {
      if ( strncmp(ptr, "Query", 5) == 0 ) {
        atFirstQuery = true;
        break;
      }
    }
    ptr = infile.Line();
  }
  if (!atFirstQuery) {
    mprinterr("Error: 'Query' not found.\n");
    return 1;
  }

  // Read alignment. Replacing query with subject.
  typedef std::vector<char> Carray;
  typedef std::vector<int> Iarray;
  Carray Query; // Query residues
  Carray Sbjct; // Sbjct residues
  Iarray Smap;  // Smap[Sbjct index] = Query index
  while (ptr != 0) {
    // NOTE(review): qline is still read after two more Line() calls; this
    // assumes the pointer returned by Line() stays valid - confirm against
    // BufferedLine.
    const char* qline = ptr;           // query line
    const char* aline = infile.Line(); // alignment line
    const char* sline = infile.Line(); // subject line
    if (aline == 0 || sline == 0) {
      mprinterr("Error: Missing alignment line or subject line after Query:\n");
      mprinterr("Error: %s", qline);
      return 1;
    }
    // Residue letters start at column 12 and run until the next space. Also
    // stop at the end of either line so a short or truncated line cannot be
    // read past its terminator.
    const int qlen = (int)strlen(qline);
    const int slen = (int)strlen(sline);
    for (int idx = 12; idx < qlen && idx < slen && qline[idx] != ' '; idx++) {
      if (qline[idx] == '-') {
        // Sbjct does not have corresponding res in Query
        Smap.push_back(-1);
        Sbjct.push_back( sline[idx] );
      } else if (sline[idx] == '-') {
        // Query does not have a corresponding res in Sbjct
        Query.push_back( qline[idx] );
      } else {
        // Direct Query to Sbjct map
        Smap.push_back( Query.size() );
        Sbjct.push_back( sline[idx] );
        Query.push_back( qline[idx] );
      }
    }
    // Scan to next Query
    ptr = infile.Line();
    while (ptr != 0) {
      if (*ptr == 'Q') {
        if ( strncmp(ptr, "Query", 5) == 0 ) break;
      }
      ptr = infile.Line();
    }
  }

  // Build comma-separated residue lists for mapped subject/query residues.
  // With debug on, also print the full subject-to-query map.
  std::string SmaskExp, QmaskExp;
  if (State.Debug() > 0) mprintf(" Map of Sbjct to Query:\n");
  for (int sres = 0; sres != (int)Sbjct.size(); sres++) {
    if (State.Debug() > 0)
      mprintf("%-i %3s %i", sres+smaskoffset, Residue::ConvertResName(Sbjct[sres]),
              Smap[sres]+qmaskoffset);
    const char* qres = "";
    if (Smap[sres] != -1) {
      qres = Residue::ConvertResName(Query[Smap[sres]]);
      if (SmaskExp.empty())
        SmaskExp.assign( integerToString(sres+smaskoffset) );
      else
        SmaskExp.append( "," + integerToString(sres+smaskoffset) );
      if (QmaskExp.empty())
        QmaskExp.assign( integerToString(Smap[sres]+qmaskoffset) );
      else
        QmaskExp.append( "," + integerToString(Smap[sres]+qmaskoffset) );
    }
    if (State.Debug() > 0) mprintf(" %3s\n", qres);
  }
  mprintf("Smask: %s\n", SmaskExp.c_str());
  mprintf("Qmask: %s\n", QmaskExp.c_str());

  // Check that query residues match reference.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    if (qres != -1) {
      if (Query[qres] != qref.Parm().Res(qres).SingleCharName()) {
        mprintf("Warning: Potential residue mismatch: Query %s reference %s\n",
                Residue::ConvertResName(Query[qres]), qref.Parm().Res(qres).c_str());
      }
    }
  }

  // Build subject using coordinate from reference.
  Topology sTop;
  Frame sFrame;
  Iarray placeHolder; // Atom indices of placeholder residues.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    NameType SresName( Residue::ConvertResName(Sbjct[sres]) );
    if (qres != -1) {
      Residue const& QR = qref.Parm().Res(qres);
      Residue SR(SresName, sres+1, ' ', QR.ChainID());
      if (Query[qres] == Sbjct[sres]) {
        // Exact match. All non-H atoms.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          // Atom and coordinates must be added together; adding coordinates
          // for skipped hydrogens would leave the frame with more entries
          // than the topology has atoms.
          if (qref.Parm()[qat].Element() != Atom::HYDROGEN) {
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
          }
        }
      } else {
        // Partial match. Copy only backbone and CB.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          if ( qref.Parm()[qat].Name().Match("N" ) ||
               qref.Parm()[qat].Name().Match("CA") ||
               qref.Parm()[qat].Name().Match("CB") ||
               qref.Parm()[qat].Name().Match("C" ) ||
               qref.Parm()[qat].Name().Match("O" ) )
          {
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
          }
        }
      }
    } else {
      // Residue in query does not exist for subject. Just put placeholder CA for now.
      Vec3 Zero(0.0);
      placeHolder.push_back( sTop.Natom() );
      sTop.AddTopAtom( Atom("CA", "C "), Residue(SresName, sres+1, ' ', ' ') );
      sFrame.AddXYZ( Zero.Dptr() );
    }
  }
  mprintf("\tPlaceholder residue indices:");
  for (Iarray::const_iterator p = placeHolder.begin(); p != placeHolder.end(); ++p)
    mprintf(" %i", *p + 1);
  mprintf("\n");

  // Try to give placeholders more reasonable coordinates.
  if (!placeHolder.empty()) {
    Iarray current_indices;
    unsigned int pidx = 0;
    while (pidx < placeHolder.size()) {
      if (current_indices.empty()) {
        current_indices.push_back( placeHolder[pidx++] );
        // Search for the end of this segment (run of consecutive atom indices)
        for (; pidx != placeHolder.size(); pidx++) {
          if (placeHolder[pidx] - current_indices.back() > 1) break;
          current_indices.push_back( placeHolder[pidx] );
        }
        // DEBUG
        mprintf("\tSegment:");
        for (Iarray::const_iterator it = current_indices.begin();
                                    it != current_indices.end(); ++it)
          mprintf(" %i", *it + 1);
        // Get coordinates of residues bordering segment.
        int prev_res = sTop[current_indices.front()].ResNum() - 1;
        int next_res = sTop[current_indices.back() ].ResNum() + 1;
        mprintf(" (prev_res=%i, next_res=%i)\n", prev_res+1, next_res+1);
        const int prev_atom = current_indices.front() - 1;
        const int next_atom = current_indices.back() + 1;
        if (prev_atom < 0 || next_atom >= (int)sTop.Natom()) {
          // No bordering atom on one side, so there is nothing to interpolate
          // between; reading it would index outside the frame.
          mprintf("Warning: Placeholder segment is at the start or end of the structure;"
                  " leaving its coordinates at the origin.\n");
        } else {
          Vec3 prev_crd(sFrame.XYZ(prev_atom));
          Vec3 next_crd(sFrame.XYZ(next_atom));
          prev_crd.Print("prev_crd");
          next_crd.Print("next_crd");
          // Spread the placeholders evenly along the line between neighbors.
          Vec3 crd_step = (next_crd - prev_crd) / (double)(current_indices.size()+1);
          crd_step.Print("crd_step");
          double* xyz = sFrame.xAddress() + (current_indices.front() * 3);
          for (unsigned int i = 0; i != current_indices.size(); i++, xyz += 3) {
            prev_crd += crd_step;
            xyz[0] = prev_crd[0];
            xyz[1] = prev_crd[1];
            xyz[2] = prev_crd[2];
          }
        }
        current_indices.clear();
      }
    }
  }

  // Write output traj
  Trajout_Single trajout;
  if (trajout.PrepareTrajWrite(outfilename, argIn, &sTop, CoordinateInfo(), 1, fmt))
    return 1;
  if (trajout.WriteSingle(0, sFrame)) return 1;
  trajout.EndTraj();
  return 0;
}
// Analysis_TI::Calc_Increment() int Analysis_TI::Calc_Increment() { // Determine max points if not given. int maxpts = avg_max_; if (maxpts == -1) { for (unsigned int idx = 0; idx != input_dsets_.size(); idx++) { DataSet_1D const& ds = static_cast<DataSet_1D const&>( *(input_dsets_[idx]) ); if (maxpts == -1) maxpts = (int)ds.Size(); else if (maxpts != (int)ds.Size()) { mprintf("Warning: # points in '%s' (%zu) is different than %i.\n", ds.legend(), ds.Size(), maxpts); maxpts = std::min( maxpts, (int)ds.Size() ); mprintf("Warning: Will only use %i points.\n", maxpts); } } } if (maxpts < 1) { mprinterr("Error: Max points to use is < 1.\n"); return 1; } if (avg_skip_ >= maxpts) { mprinterr("Error: 'avgskip' (%i) > max (%i).\n", avg_skip_, maxpts); return 1; } // sum: Hold the results of integration for each curve (increment) Darray sum; // points: Hold point values at which each avg is being calculated Iarray points; // Loop over input data sets. for (unsigned int idx = 0; idx != input_dsets_.size(); idx++) { DataSet_1D const& ds = static_cast<DataSet_1D const&>( *(input_dsets_[idx]) ); if (CheckSet(ds)) return 1; // Calculate averages for each increment Darray avg; Iarray increments; int count = 0; int endpt = maxpts -1; double currentSum = 0.0; if (debug_ > 0) mprintf("DEBUG: Lambda %g\n", xval_[idx]); for (int pt = avg_skip_; pt != maxpts; pt++) { currentSum += ds.Dval(pt); count++; if (count == avg_increment_ || pt == endpt) { avg.push_back( currentSum / ((double)(pt - avg_skip_ + 1)) ); increments.push_back(pt+1); if (debug_ > 0) mprintf("DEBUG:\t\tAvg from %i to %i: %g\n", avg_skip_+1, pt+1, avg.back()); count = 0; } } if (sum.empty()) { sum.resize(avg.size()); points = increments; } else if (sum.size() != avg.size()) { mprinterr("Error: Different # of increments for set '%s'; got %zu, expected %zu.\n", ds.legend(), avg.size(), sum.size()); return 1; } // Create increment curve data sets if (curve_.empty()) { MetaData md(dAout_->Meta().Name(), "TIcurve"); for 
(unsigned int j = 0; j != avg.size(); j++) { md.SetIdx( increments[j] ); DataSet* ds = masterDSL_->AddSet(DataSet::XYMESH, md); if (ds == 0) return Analysis::ERR; ds->ModifyDim(Dimension::X).SetLabel("Lambda"); ds->SetLegend( md.Name() + "_Skip" + integerToString(increments[j]) ); if (curveout_ != 0) curveout_->AddDataSet( ds ); curve_.push_back( ds ); } } for (unsigned int j = 0; j != avg.size(); j++) { DataSet_Mesh& CR = static_cast<DataSet_Mesh&>( *(curve_[j]) ); CR.AddXY(xval_[idx], avg[j]); if (mode_ == GAUSSIAN_QUAD) sum[j] += (wgt_[idx] * avg[j]); } } // END loop over data sets if (mode_ == TRAPEZOID) Integrate_Trapezoid(sum); // Store final integration values DataSet_Mesh& DA = static_cast<DataSet_Mesh&>( *dAout_ ); DA.ModifyDim(Dimension::X).SetLabel("Point"); for (unsigned int j = 0; j != points.size(); j++) DA.AddXY(points[j], sum[j]); return 0; }
// Cluster_Kmeans::Cluster() int Cluster_Kmeans::Cluster() { // First determine which frames are being clustered. Iarray const& FramesToCluster = FrameDistances().FramesToCluster(); // Determine seeds FindKmeansSeeds( FramesToCluster ); if (mode_ == RANDOM) RN_.rn_set( kseed_ ); int pointCount = (int)FramesToCluster.size(); // This array will hold the indices of the points to process each iteration. // If sequential this is just 0 -> pointCount. If random this will be // reassigned each iteration. Iarray PointIndices; PointIndices.reserve( pointCount ); for (int processIdx = 0; processIdx != pointCount; processIdx++) PointIndices.push_back( processIdx ); // Add the seed clusters for (Iarray::const_iterator seedIdx = SeedIndices_.begin(); seedIdx != SeedIndices_.end(); ++seedIdx) { int seedFrame = FramesToCluster[ *seedIdx ]; // A centroid is created for new clusters. AddCluster( ClusterDist::Cframes(1, seedFrame) ); // NOTE: No need to calc best rep frame, only 1 frame. if (debug_ > 0) mprintf("Put frame %i in cluster %i (seed index=%i).\n", seedFrame, clusters_.back().Num(), *seedIdx); } // Assign points in 3 passes. If a point looked like it belonged to cluster A // at first, but then we added many other points and altered our cluster // shapes, its possible that we will want to reassign it to cluster B. 
for (int iteration = 0; iteration != maxIt_; iteration++) { if (mode_ == RANDOM) ShufflePoints( PointIndices ); // Add each point to an existing cluster, and recompute centroid mprintf("\tRound %i: ", iteration); ProgressBar progress( PointIndices.size() ); int Nchanged = 0; int prog = 0; for (Iarray::const_iterator pointIdx = PointIndices.begin(); pointIdx != PointIndices.end(); ++pointIdx, ++prog) { if (debug_ < 1) progress.Update( prog ); int oldClusterIdx = -1; // if ( iteration != 0 || mode_ != SEQUENTIAL) // FIXME: Should this really happen for RANDOM // { int pointFrame = FramesToCluster[ *pointIdx ]; if (debug_ > 0) mprintf("DEBUG: Processing frame %i (index %i)\n", pointFrame, *pointIdx); bool pointWasYanked = true; if (iteration > 0) { // Yank this point out of its cluster, recompute the centroid for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { if (C1->HasFrame( pointFrame )) { // If this point is alone in its cluster its in the right place if (C1->Nframes() == 1) { pointWasYanked = false; continue; // FIXME: should this be a break? } //oldBestRep = C1->BestRepFrame(); oldClusterIdx = C1->Num(); C1->RemoveFrameUpdateCentroid( Cdist_, pointFrame ); // TEST // C1->RemoveFrameFromCluster( pointFrame ); //newBestRep = C1->FindBestRepFrame(); // C1->CalculateCentroid( Cdist_ ); if (debug_ > 0) mprintf("Remove Frame %i from cluster %i\n", pointFrame, C1->Num()); //if (clusterToClusterCentroid_) { // if (oldBestRep != NewBestRep) // C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist? // C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior //} } } } else { // First iteration. If this point is already in a cluster it is a seed. for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { if (C1->HasFrame( pointFrame )) { pointWasYanked = false; if (debug_ > 0) mprintf("Frame %i was already used to seed cluster %i\n", pointFrame, C1->Num()); continue; // FIXME break? 
} } } if (pointWasYanked) { // Find out what cluster this point is now closest to. double closestDist = -1.0; cluster_it closestCluster = clusters_.begin(); for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { double dist = Cdist_->FrameCentroidDist(pointFrame, C1->Cent()); if (closestDist < 0.0 || dist < closestDist) { closestDist = dist; closestCluster = C1; } } //oldBestRep = closestCluster->BestRepFrame(); closestCluster->AddFrameUpdateCentroid( Cdist_, pointFrame ); // TEST // closestCluster->AddFrameToCluster( pointFrame ); //newBestRep = closestCluster->FindBestFrameFrame(); // closestCluster->CalculateCentroid( Cdist_ ); if (closestCluster->Num() != oldClusterIdx) { Nchanged++; if (debug_ > 0) mprintf("Remove Frame %i from cluster %i, but add to cluster %i (dist= %f).\n", pointFrame, oldClusterIdx, closestCluster->Num(), closestDist); } else { if (debug_ > 0) mprintf("Frame %i staying in cluster %i\n", pointFrame, closestCluster->Num()); } if (clusterToClusterCentroid_) { //if (oldBestRep != NewBestRep) { // C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist? // C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior //} } } // } } // END loop over points to cluster if (Nchanged == 0) { mprintf("\tK-means round %i: No change. Skipping the rest of the iterations.\n", iteration); break; } else mprintf("\tK-means round %i: %i points changed cluster assignment.\n", iteration, Nchanged); } // END k-means iterations // Remove any empty clusters // FIXME: Will there ever be empty clusters? RemoveEmptyClusters(); // NOTE in PTRAJ here align all frames to best rep return 0; }
/** Find seed points for K-means clustering and store them in SeedIndices_
  * as indices into FramesToCluster.
  *
  * The first two seeds are the pair of points with the largest distance
  * between them. Each further seed is the point, not already a seed, whose
  * distance to its nearest existing seed is as large as possible.
  * \param FramesToCluster Frames being clustered.
  * \return 0 on success, 1 if there are no frames or fewer than 1 cluster
  *         was requested (SeedIndices_ is emptied in that case).
  */
int Cluster_Kmeans::FindKmeansSeeds(Iarray const& FramesToCluster) {
  int frameCount = (int)FramesToCluster.size();
  // With no frames or no clusters there is no valid seed index at all.
  if (nclusters_ < 1 || frameCount < 1) {
    mprintf("Error: K-means seeding requires at least 1 cluster and 1 frame.\n");
    SeedIndices_.clear();
    return 1;
  }
  // SeedIndices will hold indices into FramesToCluster.
  // Default of 1 used to be consistent with ptraj; fall back to 0 when there
  // is only a single frame so the default is always a valid index.
  SeedIndices_.resize( nclusters_, (frameCount > 1) ? 1 : 0 );

  // Pick the two points that are farthest apart as the first two seeds.
  double bestDistance = 0.0;
  for (int frameIdx = 0; frameIdx != frameCount; frameIdx++)
  {
    int seedFrame = FramesToCluster[ frameIdx ];
    for (int candidateIdx = frameIdx; candidateIdx < frameCount; candidateIdx++)
    {
      int candidateFrame = FramesToCluster[ candidateIdx ];
      double dist = FrameDistances().GetFdist( seedFrame, candidateFrame );
      if (dist > bestDistance)
      {
        bestDistance = dist;
        SeedIndices_[0] = frameIdx;
        // Only one slot exists when a single cluster was requested.
        if (nclusters_ > 1)
          SeedIndices_[1] = candidateIdx;
      }
    }
  }

  // '<' rather than '!=' so the loop is simply skipped for fewer than 3 clusters.
  for (int seedIdx = 2; seedIdx < nclusters_; seedIdx++)
  {
    bestDistance = 0.0;
    int bestIdx = 0;
    for (int candidateIdx = 0; candidateIdx < frameCount; candidateIdx++)
    {
      // Make sure this candidate isnt already a seed
      bool skipCandidate = false;
      for (int checkIdx = 0; checkIdx != seedIdx; checkIdx++)
      {
        if (SeedIndices_[checkIdx] == candidateIdx) {
          skipCandidate = true;
          break;
        }
      }
      if (!skipCandidate) {
        // Get the closest distance from this candidate to a current seed.
        // nearestDist < 0 means no seed has been compared yet.
        int candidateFrame = FramesToCluster[ candidateIdx ];
        double nearestDist = -1.0;
        for (int checkIdx = 0; checkIdx != seedIdx; checkIdx++)
        {
          int seedFrame = FramesToCluster[ SeedIndices_[checkIdx] ];
          double dist = FrameDistances().GetFdist( candidateFrame, seedFrame );
          if (dist < nearestDist || nearestDist < 0.0)
            nearestDist = dist;
        }
        // Is this the best so far?
        if (nearestDist > bestDistance)
        {
          bestDistance = nearestDist;
          bestIdx = candidateIdx;
        }
      }
    }
    // Stays 0 if no non-seed candidate had a distance above zero.
    SeedIndices_[seedIdx] = bestIdx;
  }
  if (debug_ > 0)
    for (unsigned int si = 0; si != SeedIndices_.size(); si++)
      mprintf("DEBUG:\t\tSeedIndices[%u]= %i\n", si, SeedIndices_[si]);
  return 0;
}