/** Randomly reorder the given point indices in place using a Fisher-Yates
  * style pass driven by the RN_ random number generator.
  * \param PointIndices Array of point indices to shuffle; modified in place.
  *        Arrays with fewer than 2 elements are left untouched.
  */
void Cluster_Kmeans::ShufflePoints( Iarray& PointIndices ) {
  // Guard against tiny arrays: the counter below is unsigned, so with 0 or 1
  // elements it would start at UINT_MAX or 0, never meet the 'i != 1' stop
  // value in range, and index far outside the array.
  if (PointIndices.size() > 1) {
    for (unsigned int i = PointIndices.size() - 1; i != 1; i--)
    {
      // NOTE(review): j is the truncation of rn_gen()*i, so j == i is only
      // reachable if rn_gen() can return exactly 1.0, and the pass stops
      // before i == 1. Left as-is so that seeded runs keep producing the same
      // ordering - confirm whether an unbiased shuffle is wanted here.
      unsigned int j = static_cast<unsigned int>(RN_.rn_gen() * static_cast<double>(i));
      int temp = PointIndices[j];
      PointIndices[j] = PointIndices[i];
      PointIndices[i] = temp;
    }
  }
  if (debug_ > 0) {
    mprintf("DEBUG: Shuffled points:");
    for (Iarray::const_iterator it = PointIndices.begin();
                                it != PointIndices.end(); ++it)
      mprintf(" %i", *it);
    mprintf("\n");
  }
}
/** \return Number of characters in line before the first newline or the
  *         terminating NUL, whichever comes first.
  */
static int SeqAlign_LineLength(const char* line) {
  int len = 0;
  while (line[len] != '\0' && line[len] != '\n')
    ++len;
  return len;
}

/** Build a subject structure from a BLAST alignment and a query reference
  * structure, then write it out as a single frame.
  *
  * Arguments read from argIn:
  *   - blastfile <name>  : File containing the alignment (required).
  *   - out <name>        : Output file name (required).
  *   - smaskoffset <#>   : Added (plus 1) to subject indices when printing masks.
  *   - qmaskoffset <#>   : Added (plus 1) to query indices when printing masks.
  *   - A reference frame, obtained via the State data set list.
  *   - An output format; anything other than PDB or MOL2 falls back to PDB.
  *
  * Subject residues aligned to an identical query residue receive all
  * non-hydrogen atoms of that query residue; residues aligned to a different
  * query residue receive only N, CA, CB, C and O; subject residues with no
  * query counterpart receive a single placeholder CA whose coordinates are
  * linearly interpolated between the atoms bordering the placeholder segment.
  *
  * \param State Current state; supplies the reference frame and debug level.
  * \param argIn Argument list (see above).
  * \return 0 on success, 1 on error.
  */
int SequenceAlign(CpptrajState& State, ArgList& argIn) {
  std::string blastfile = argIn.GetStringKey("blastfile");
  if (blastfile.empty()) {
    mprinterr("Error: 'blastfile' must be specified.\n");
    return 1;
  }
  ReferenceFrame qref = State.DSL()->GetReferenceFrame(argIn);
  if (qref.error() || qref.empty()) {
    mprinterr("Error: Must specify reference structure for query.\n");
    return 1;
  }
  std::string outfilename = argIn.GetStringKey("out");
  if (outfilename.empty()) {
    mprinterr("Error: Must specify output file.\n");
    return 1;
  }
  TrajectoryFile::TrajFormatType fmt = TrajectoryFile::GetFormatFromArg(argIn);
  if (fmt != TrajectoryFile::PDBFILE && fmt != TrajectoryFile::MOL2FILE)
    fmt = TrajectoryFile::PDBFILE; // Default to PDB
  int smaskoffset = argIn.getKeyInt("smaskoffset", 0) + 1;
  int qmaskoffset = argIn.getKeyInt("qmaskoffset", 0) + 1;

  // Load blast file
  mprintf("\tReading BLAST alignment from '%s'\n", blastfile.c_str());
  BufferedLine infile;
  if (infile.OpenFileRead( blastfile )) return 1;
  // Seek down to first Query line.
  const char* ptr = infile.Line();
  bool atFirstQuery = false;
  while (ptr != 0) {
    if (*ptr == 'Q') {
      if ( strncmp(ptr, "Query", 5) == 0 ) {
        atFirstQuery = true;
        break;
      }
    }
    ptr = infile.Line();
  }
  if (!atFirstQuery) {
    mprinterr("Error: 'Query' not found.\n");
    return 1;
  }

  // Read alignment. Replacing query with subject.
  typedef std::vector<char> Carray;
  typedef std::vector<int> Iarray;
  Carray Query; // Query residues
  Carray Sbjct; // Sbjct residues
  Iarray Smap;  // Smap[Sbjct index] = Query index
  // Column at which residue characters start on Query/Sbjct lines.
  const int seqStartCol = 12;
  while (ptr != 0) {
    const char* qline = ptr;           // query line
    const char* aline = infile.Line(); // alignment line
    const char* sline = infile.Line(); // subject line
    if (aline == 0 || sline == 0) {
      mprinterr("Error: Missing alignment line or subject line after Query:\n");
      mprinterr("Error: %s", qline);
      return 1;
    }
    // Bound the column scan by the actual line lengths so a short or
    // truncated line cannot cause reads past the end of the text.
    const int qlen = SeqAlign_LineLength(qline);
    const int slen = SeqAlign_LineLength(sline);
    for (int idx = seqStartCol; idx < qlen && qline[idx] != ' '; idx++) {
      if (idx >= slen) {
        mprinterr("Error: Subject line has fewer columns than the Query line it is aligned to.\n");
        return 1;
      }
      if (qline[idx] == '-') {
        // Sbjct does not have corresponding res in Query
        Smap.push_back(-1);
        Sbjct.push_back( sline[idx] );
      } else if (sline[idx] == '-') {
        // Query does not have a corresponding res in Sbjct
        Query.push_back( qline[idx] );
      } else {
        // Direct Query to Sbjct map
        Smap.push_back( Query.size() );
        Sbjct.push_back( sline[idx] );
        Query.push_back( qline[idx] );
      }
    }
    // Scan to next Query
    ptr = infile.Line();
    while (ptr != 0) {
      if (*ptr == 'Q') {
        if ( strncmp(ptr, "Query", 5) == 0 ) break;
      }
      ptr = infile.Line();
    }
  }

  // DEBUG: print the map and build the comma-separated mask expressions.
  std::string SmaskExp, QmaskExp;
  if (State.Debug() > 0) mprintf(" Map of Sbjct to Query:\n");
  for (int sres = 0; sres != (int)Sbjct.size(); sres++) {
    if (State.Debug() > 0)
      mprintf("%-i %3s %i", sres+smaskoffset, Residue::ConvertResName(Sbjct[sres]),
              Smap[sres]+qmaskoffset);
    const char* qres = "";
    if (Smap[sres] != -1) {
      qres = Residue::ConvertResName(Query[Smap[sres]]);
      if (SmaskExp.empty())
        SmaskExp.assign( integerToString(sres+smaskoffset) );
      else
        SmaskExp.append( "," + integerToString(sres+smaskoffset) );
      if (QmaskExp.empty())
        QmaskExp.assign( integerToString(Smap[sres]+qmaskoffset) );
      else
        QmaskExp.append( "," + integerToString(Smap[sres]+qmaskoffset) );
    }
    if (State.Debug() > 0) mprintf(" %3s\n", qres);
  }
  mprintf("Smask: %s\n", SmaskExp.c_str());
  mprintf("Qmask: %s\n", QmaskExp.c_str());

  // Check that query residues match reference.
  // NOTE(review): qres comes from the alignment and is not checked against
  // the number of residues in the reference - confirm inputs guarantee this.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    if (qres != -1) {
      if (Query[qres] != qref.Parm().Res(qres).SingleCharName()) {
        mprintf("Warning: Potential residue mismatch: Query %s reference %s\n",
                Residue::ConvertResName(Query[qres]), qref.Parm().Res(qres).c_str());
      }
    }
  }

  // Build subject using coordinate from reference.
  Topology sTop;
  Frame sFrame;
  Iarray placeHolder; // Atom indices of placeholder residues.
  for (unsigned int sres = 0; sres != Sbjct.size(); sres++) {
    int qres = Smap[sres];
    NameType SresName( Residue::ConvertResName(Sbjct[sres]) );
    if (qres != -1) {
      Residue const& QR = qref.Parm().Res(qres);
      Residue SR(SresName, sres+1, ' ', QR.ChainID());
      if (Query[qres] == Sbjct[sres]) {
        // Exact match. All non-H atoms.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          // The topology atom and its coordinates must be added together;
          // otherwise hydrogens contribute coordinates without an atom and
          // frame indices no longer line up with topology indices.
          if (qref.Parm()[qat].Element() != Atom::HYDROGEN) {
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
          }
        }
      } else {
        // Partial match. Copy only backbone and CB.
        for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++)
        {
          if ( qref.Parm()[qat].Name().Match("N" ) ||
               qref.Parm()[qat].Name().Match("CA") ||
               qref.Parm()[qat].Name().Match("CB") ||
               qref.Parm()[qat].Name().Match("C" ) ||
               qref.Parm()[qat].Name().Match("O" ) )
          {
            sTop.AddTopAtom( qref.Parm()[qat], SR );
            sFrame.AddXYZ( qref.Coord().XYZ(qat) );
          }
        }
      }
    } else {
      // Residue in query does not exist for subject. Just put placeholder CA for now.
      Vec3 Zero(0.0);
      placeHolder.push_back( sTop.Natom() );
      sTop.AddTopAtom( Atom("CA", "C "), Residue(SresName, sres+1, ' ', ' ') );
      sFrame.AddXYZ( Zero.Dptr() );
    }
  }
  mprintf("\tPlaceholder residue indices:");
  for (Iarray::const_iterator p = placeHolder.begin(); p != placeHolder.end(); ++p)
    mprintf(" %i", *p + 1);
  mprintf("\n");

  // Try to give placeholders more reasonable coordinates.
  if (!placeHolder.empty()) {
    Iarray current_indices;
    unsigned int pidx = 0;
    while (pidx < placeHolder.size()) {
      // Collect one run of consecutive placeholder atom indices.
      current_indices.clear();
      current_indices.push_back( placeHolder[pidx++] );
      // Search for the end of this segment
      for (; pidx != placeHolder.size(); pidx++) {
        if (placeHolder[pidx] - current_indices.back() > 1) break;
        current_indices.push_back( placeHolder[pidx] );
      }
      // DEBUG
      mprintf("\tSegment:");
      for (Iarray::const_iterator it = current_indices.begin();
                                  it != current_indices.end(); ++it)
        mprintf(" %i", *it + 1);
      // Get coordinates of residues bordering segment.
      int prev_res = sTop[current_indices.front()].ResNum() - 1;
      int next_res = sTop[current_indices.back() ].ResNum() + 1;
      mprintf(" (prev_res=%i, next_res=%i)\n", prev_res+1, next_res+1);
      const int prev_atom = current_indices.front() - 1;
      const int next_atom = current_indices.back() + 1;
      // A segment at either end of the structure has no bordering atom on one
      // side; interpolating would read outside the frame, so skip it.
      if (prev_atom < 0 || next_atom >= sTop.Natom()) {
        mprintf("Warning: Placeholder segment is missing a bordering atom; coordinates left unchanged.\n");
        continue;
      }
      Vec3 prev_crd(sFrame.XYZ(prev_atom));
      Vec3 next_crd(sFrame.XYZ(next_atom));
      prev_crd.Print("prev_crd");
      next_crd.Print("next_crd");
      // Evenly space the placeholders along the line between the two borders.
      Vec3 crd_step = (next_crd - prev_crd) / (double)(current_indices.size()+1);
      crd_step.Print("crd_step");
      double* xyz = sFrame.xAddress() + (current_indices.front() * 3);
      for (unsigned int i = 0; i != current_indices.size(); i++, xyz += 3) {
        prev_crd += crd_step;
        xyz[0] = prev_crd[0];
        xyz[1] = prev_crd[1];
        xyz[2] = prev_crd[2];
      }
    }
  }

  // Write output traj
  Trajout_Single trajout;
  if (trajout.PrepareTrajWrite(outfilename, argIn, &sTop, CoordinateInfo(), 1, fmt)) return 1;
  if (trajout.WriteSingle(0, sFrame)) return 1;
  trajout.EndTraj();
  return 0;
}
/** selectedTgt and centeredREF must correspond to each other. */ double SymmetricRmsdCalc::SymmRMSD_CenteredRef(Frame const& selectedTgt, Frame const& centeredREF) { // Create initial 1 to 1 atom map for all atoms; indices in // SymmetricAtomIndices will correspond to positions in AMap. for (int atom = 0; atom < (int)AMap_.size(); atom++) AMap_[atom] = atom; tgtRemap_.SetCoordinates(selectedTgt); // Calculate initial best fit RMSD if necessary if (fit_) { tgtRemap_.RMSD_CenteredRef(centeredREF, rotMatrix_, tgtTrans_, useMass_); // Since tgtRemap is moved to origin during RMSD calc and centeredREF // should already be at the origin, just rotate. tgtRemap_.Rotate( rotMatrix_ ); } // Correct RMSD for symmetry for (AtomIndexArray::const_iterator symmatoms = SymmetricAtomIndices_.begin(); symmatoms != SymmetricAtomIndices_.end(); ++symmatoms) { // For each array of symmetric atoms, determine the lowest distance score # ifdef DEBUGSYMMRMSD mprintf(" Symmetric atoms group %u starting with atom %i\n", symmatoms - SymmetricAtomIndices_.begin(), tgtMask_[symmatoms->front()] + 1); # endif cost_matrix_.Initialize( symmatoms->size() ); for (Iarray::const_iterator ta = symmatoms->begin(); ta != symmatoms->end(); ++ta) { for (Iarray::const_iterator ra = symmatoms->begin(); ra != symmatoms->end(); ++ra) { double dist2 = DIST2_NoImage( centeredREF.XYZ(*ra), tgtRemap_.XYZ(*ta) ); # ifdef DEBUGSYMMRMSD mprintf("\t\t%i to %i: %f\n", tgtMask_[*ta] + 1, tgtMask_[*ra] + 1, dist2); # endif cost_matrix_.AddElement( dist2 ); } } Iarray resMap = cost_matrix_.Optimize(); # ifdef DEBUGSYMMRMSD mprintf("\tMapping from Hungarian Algorithm:\n"); for (Iarray::const_iterator ha = resMap.begin(); ha != resMap.end(); ++ha) mprintf("\t\tMap col=%u row=%i\n", ha - resMap.begin(), *ha); # endif // Fill in overall map Iarray::const_iterator rmap = resMap.begin(); for (Iarray::const_iterator atmidx = symmatoms->begin(); atmidx != symmatoms->end(); ++atmidx, ++rmap) { AMap_[*atmidx] = (*symmatoms)[*rmap]; 
# ifdef DEBUGSYMMRMSD mprintf("\tAssigned atom %i to atom %i\n", tgtMask_[*atmidx] + 1, tgtMask_[(*symmatoms)[*rmap]] + 1); # endif } } # ifdef DEBUGSYMMRMSD mprintf(" Final Atom Mapping:\n"); for (unsigned int ref = 0; ref < AMap_.size(); ++ref) mprintf("\t%u -> %i\n", tgtMask_[ref] + 1, tgtMask_[AMap_[ref]] + 1); mprintf("----------------------------------------\n"); # endif // Remap the target frame for symmetry, then calculate new RMSD. // TODO: Does the topology need to be remapped as well? double rmsdval; tgtRemap_.SetCoordinatesByMap(selectedTgt, AMap_); if (fit_) rmsdval = tgtRemap_.RMSD_CenteredRef( centeredREF, rotMatrix_, tgtTrans_, useMass_ ); else rmsdval = tgtRemap_.RMSD_NoFit( centeredREF, useMass_ ); return rmsdval; }
// Cluster_Kmeans::Cluster() int Cluster_Kmeans::Cluster() { // First determine which frames are being clustered. Iarray const& FramesToCluster = FrameDistances().FramesToCluster(); // Determine seeds FindKmeansSeeds( FramesToCluster ); if (mode_ == RANDOM) RN_.rn_set( kseed_ ); int pointCount = (int)FramesToCluster.size(); // This array will hold the indices of the points to process each iteration. // If sequential this is just 0 -> pointCount. If random this will be // reassigned each iteration. Iarray PointIndices; PointIndices.reserve( pointCount ); for (int processIdx = 0; processIdx != pointCount; processIdx++) PointIndices.push_back( processIdx ); // Add the seed clusters for (Iarray::const_iterator seedIdx = SeedIndices_.begin(); seedIdx != SeedIndices_.end(); ++seedIdx) { int seedFrame = FramesToCluster[ *seedIdx ]; // A centroid is created for new clusters. AddCluster( ClusterDist::Cframes(1, seedFrame) ); // NOTE: No need to calc best rep frame, only 1 frame. if (debug_ > 0) mprintf("Put frame %i in cluster %i (seed index=%i).\n", seedFrame, clusters_.back().Num(), *seedIdx); } // Assign points in 3 passes. If a point looked like it belonged to cluster A // at first, but then we added many other points and altered our cluster // shapes, its possible that we will want to reassign it to cluster B. 
for (int iteration = 0; iteration != maxIt_; iteration++) { if (mode_ == RANDOM) ShufflePoints( PointIndices ); // Add each point to an existing cluster, and recompute centroid mprintf("\tRound %i: ", iteration); ProgressBar progress( PointIndices.size() ); int Nchanged = 0; int prog = 0; for (Iarray::const_iterator pointIdx = PointIndices.begin(); pointIdx != PointIndices.end(); ++pointIdx, ++prog) { if (debug_ < 1) progress.Update( prog ); int oldClusterIdx = -1; // if ( iteration != 0 || mode_ != SEQUENTIAL) // FIXME: Should this really happen for RANDOM // { int pointFrame = FramesToCluster[ *pointIdx ]; if (debug_ > 0) mprintf("DEBUG: Processing frame %i (index %i)\n", pointFrame, *pointIdx); bool pointWasYanked = true; if (iteration > 0) { // Yank this point out of its cluster, recompute the centroid for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { if (C1->HasFrame( pointFrame )) { // If this point is alone in its cluster its in the right place if (C1->Nframes() == 1) { pointWasYanked = false; continue; // FIXME: should this be a break? } //oldBestRep = C1->BestRepFrame(); oldClusterIdx = C1->Num(); C1->RemoveFrameUpdateCentroid( Cdist_, pointFrame ); // TEST // C1->RemoveFrameFromCluster( pointFrame ); //newBestRep = C1->FindBestRepFrame(); // C1->CalculateCentroid( Cdist_ ); if (debug_ > 0) mprintf("Remove Frame %i from cluster %i\n", pointFrame, C1->Num()); //if (clusterToClusterCentroid_) { // if (oldBestRep != NewBestRep) // C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist? // C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior //} } } } else { // First iteration. If this point is already in a cluster it is a seed. for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { if (C1->HasFrame( pointFrame )) { pointWasYanked = false; if (debug_ > 0) mprintf("Frame %i was already used to seed cluster %i\n", pointFrame, C1->Num()); continue; // FIXME break? 
} } } if (pointWasYanked) { // Find out what cluster this point is now closest to. double closestDist = -1.0; cluster_it closestCluster = clusters_.begin(); for (cluster_it C1 = clusters_.begin(); C1 != clusters_.end(); ++C1) { double dist = Cdist_->FrameCentroidDist(pointFrame, C1->Cent()); if (closestDist < 0.0 || dist < closestDist) { closestDist = dist; closestCluster = C1; } } //oldBestRep = closestCluster->BestRepFrame(); closestCluster->AddFrameUpdateCentroid( Cdist_, pointFrame ); // TEST // closestCluster->AddFrameToCluster( pointFrame ); //newBestRep = closestCluster->FindBestFrameFrame(); // closestCluster->CalculateCentroid( Cdist_ ); if (closestCluster->Num() != oldClusterIdx) { Nchanged++; if (debug_ > 0) mprintf("Remove Frame %i from cluster %i, but add to cluster %i (dist= %f).\n", pointFrame, oldClusterIdx, closestCluster->Num(), closestDist); } else { if (debug_ > 0) mprintf("Frame %i staying in cluster %i\n", pointFrame, closestCluster->Num()); } if (clusterToClusterCentroid_) { //if (oldBestRep != NewBestRep) { // C1->AlignToBestRep( Cdist_ ); // FIXME: Only relevant for COORDS dist? // C1->CalculateCentroid( Cdist_ ); // FIXME: Seems unnessecary to align prior //} } } // } } // END loop over points to cluster if (Nchanged == 0) { mprintf("\tK-means round %i: No change. Skipping the rest of the iterations.\n", iteration); break; } else mprintf("\tK-means round %i: %i points changed cluster assignment.\n", iteration, Nchanged); } // END k-means iterations // Remove any empty clusters // FIXME: Will there ever be empty clusters? RemoveEmptyClusters(); // NOTE in PTRAJ here align all frames to best rep return 0; }