int SequenceAlign(CpptrajState& State, ArgList& argIn) { std::string blastfile = argIn.GetStringKey("blastfile"); if (blastfile.empty()) { mprinterr("Error: 'blastfile' must be specified.\n"); return 1; } ReferenceFrame qref = State.DSL()->GetReferenceFrame(argIn); if (qref.error() || qref.empty()) { mprinterr("Error: Must specify reference structure for query.\n"); return 1; } std::string outfilename = argIn.GetStringKey("out"); if (outfilename.empty()) { mprinterr("Error: Must specify output file.\n"); return 1; } TrajectoryFile::TrajFormatType fmt = TrajectoryFile::GetFormatFromArg(argIn); if (fmt != TrajectoryFile::PDBFILE && fmt != TrajectoryFile::MOL2FILE) fmt = TrajectoryFile::PDBFILE; // Default to PDB int smaskoffset = argIn.getKeyInt("smaskoffset", 0) + 1; int qmaskoffset = argIn.getKeyInt("qmaskoffset", 0) + 1; // Load blast file mprintf("\tReading BLAST alignment from '%s'\n", blastfile.c_str()); BufferedLine infile; if (infile.OpenFileRead( blastfile )) return 1; // Seek down to first Query line. const char* ptr = infile.Line(); bool atFirstQuery = false; while (ptr != 0) { if (*ptr == 'Q') { if ( strncmp(ptr, "Query", 5) == 0 ) { atFirstQuery = true; break; } } ptr = infile.Line(); } if (!atFirstQuery) { mprinterr("Error: 'Query' not found.\n"); return 1; } // Read alignment. Replacing query with subject. typedef std::vector<char> Carray; typedef std::vector<int> Iarray; Carray Query; // Query residues Carray Sbjct; // Sbjct residues Iarray Smap; // Smap[Sbjct index] = Query index while (ptr != 0) { const char* qline = ptr; // query line const char* aline = infile.Line(); // alignment line const char* sline = infile.Line(); // subject line if (aline == 0 || sline == 0) { mprinterr("Error: Missing alignment line or subject line after Query:\n"); mprinterr("Error: %s", qline); return 1; } for (int idx = 12; qline[idx] != ' '; idx++) { if (qline[idx] == '-') { // Sbjct does not have corresponding res in Query Smap.push_back(-1); Sbjct.push_back( sline[idx] ); } else if (sline[idx] == '-') { // Query does not have a corresponding res in Sbjct Query.push_back( qline[idx] ); } else { // Direct Query to Sbjct map Smap.push_back( Query.size() ); Sbjct.push_back( sline[idx] ); Query.push_back( qline[idx] ); } } // Scan to next Query ptr = infile.Line(); while (ptr != 0) { if (*ptr == 'Q') { if ( strncmp(ptr, "Query", 5) == 0 ) break; } ptr = infile.Line(); } } // DEBUG std::string SmaskExp, QmaskExp; if (State.Debug() > 0) mprintf(" Map of Sbjct to Query:\n"); for (int sres = 0; sres != (int)Sbjct.size(); sres++) { if (State.Debug() > 0) mprintf("%-i %3s %i", sres+smaskoffset, Residue::ConvertResName(Sbjct[sres]), Smap[sres]+qmaskoffset); const char* qres = ""; if (Smap[sres] != -1) { qres = Residue::ConvertResName(Query[Smap[sres]]); if (SmaskExp.empty()) SmaskExp.assign( integerToString(sres+smaskoffset) ); else SmaskExp.append( "," + integerToString(sres+smaskoffset) ); if (QmaskExp.empty()) QmaskExp.assign( integerToString(Smap[sres]+qmaskoffset) ); else QmaskExp.append( "," + integerToString(Smap[sres]+qmaskoffset) ); } if (State.Debug() > 0) mprintf(" %3s\n", qres); } mprintf("Smask: %s\n", SmaskExp.c_str()); mprintf("Qmask: %s\n", QmaskExp.c_str()); // Check that query residues match reference. for (unsigned int sres = 0; sres != Sbjct.size(); sres++) { int qres = Smap[sres]; if (qres != -1) { if (Query[qres] != qref.Parm().Res(qres).SingleCharName()) { mprintf("Warning: Potential residue mismatch: Query %s reference %s\n", Residue::ConvertResName(Query[qres]), qref.Parm().Res(qres).c_str()); } } } // Build subject using coordinate from reference. //AtomMask sMask; // Contain atoms that should be in sTop Topology sTop; Frame sFrame; Iarray placeHolder; // Atom indices of placeholder residues. for (unsigned int sres = 0; sres != Sbjct.size(); sres++) { int qres = Smap[sres]; NameType SresName( Residue::ConvertResName(Sbjct[sres]) ); if (qres != -1) { Residue const& QR = qref.Parm().Res(qres); Residue SR(SresName, sres+1, ' ', QR.ChainID()); if (Query[qres] == Sbjct[sres]) { // Exact match. All non-H atoms. for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++) { if (qref.Parm()[qat].Element() != Atom::HYDROGEN) sTop.AddTopAtom( qref.Parm()[qat], SR ); sFrame.AddXYZ( qref.Coord().XYZ(qat) ); //sMask.AddAtom(qat); } } else { // Partial match. Copy only backbone and CB. for (int qat = QR.FirstAtom(); qat != QR.LastAtom(); qat++) { if ( qref.Parm()[qat].Name().Match("N" ) || qref.Parm()[qat].Name().Match("CA") || qref.Parm()[qat].Name().Match("CB") || qref.Parm()[qat].Name().Match("C" ) || qref.Parm()[qat].Name().Match("O" ) ) { sTop.AddTopAtom( qref.Parm()[qat], SR ); sFrame.AddXYZ( qref.Coord().XYZ(qat) ); } } } } else { // Residue in query does not exist for subject. Just put placeholder CA for now. Vec3 Zero(0.0); placeHolder.push_back( sTop.Natom() ); sTop.AddTopAtom( Atom("CA", "C "), Residue(SresName, sres+1, ' ', ' ') ); sFrame.AddXYZ( Zero.Dptr() ); } } //sTop.PrintAtomInfo("*"); mprintf("\tPlaceholder residue indices:"); for (Iarray::const_iterator p = placeHolder.begin(); p != placeHolder.end(); ++p) mprintf(" %i", *p + 1); mprintf("\n"); // Try to give placeholders more reasonable coordinates. if (!placeHolder.empty()) { Iarray current_indices; unsigned int pidx = 0; while (pidx < placeHolder.size()) { if (current_indices.empty()) { current_indices.push_back( placeHolder[pidx++] ); // Search for the end of this segment for (; pidx != placeHolder.size(); pidx++) { if (placeHolder[pidx] - current_indices.back() > 1) break; current_indices.push_back( placeHolder[pidx] ); } // DEBUG mprintf("\tSegment:"); for (Iarray::const_iterator it = current_indices.begin(); it != current_indices.end(); ++it) mprintf(" %i", *it + 1); // Get coordinates of residues bordering segment. int prev_res = sTop[current_indices.front()].ResNum() - 1; int next_res = sTop[current_indices.back() ].ResNum() + 1; mprintf(" (prev_res=%i, next_res=%i)\n", prev_res+1, next_res+1); Vec3 prev_crd(sFrame.XYZ(current_indices.front() - 1)); Vec3 next_crd(sFrame.XYZ(current_indices.back() + 1)); prev_crd.Print("prev_crd"); next_crd.Print("next_crd"); Vec3 crd_step = (next_crd - prev_crd) / (double)(current_indices.size()+1); crd_step.Print("crd_step"); double* xyz = sFrame.xAddress() + (current_indices.front() * 3); for (unsigned int i = 0; i != current_indices.size(); i++, xyz += 3) { prev_crd += crd_step; xyz[0] = prev_crd[0]; xyz[1] = prev_crd[1]; xyz[2] = prev_crd[2]; } current_indices.clear(); } } } //Topology* sTop = qref.Parm().partialModifyStateByMask( sMask ); //if (sTop == 0) return 1; //Frame sFrame(qref.Coord(), sMask); // Write output traj Trajout_Single trajout; if (trajout.PrepareTrajWrite(outfilename, argIn, &sTop, CoordinateInfo(), 1, fmt)) return 1; if (trajout.WriteSingle(0, sFrame)) return 1; trajout.EndTraj(); return 0; }
int Cluster_DPeaks::ChoosePointsAutomatically() { // Right now all density values are discrete. Try to choose outliers at each // value for which there is density.; /* // For each point, calculate average distance (X,Y) to points in next and // previous density values. const double dens_cut = 3.0 * 3.0; const double dist_cut = 1.32 * 1.32; for (Carray::const_iterator point0 = Points_.begin(); point0 != Points_.end(); ++point0) { int Npts = 0; for (Carray::const_iterator point1 = Points_.begin(); point1 != Points_.end(); ++point1) { if (point0 != point1) { // Only do this for close densities double dX = (double)(point0->PointsWithinEps() - point1->PointsWithinEps()); double dX2 = dX * dX; double dY = (point0->Dist() - point1->Dist()); double dY2 = dY * dY; if (dX2 < dens_cut && dY2 < dist_cut) { Npts++; } } } mprintf("%i %i %i\n", point0->PointsWithinEps(), point0->Fnum()+1, Npts); } */ /* CpptrajFile tempOut; tempOut.OpenWrite("temp.dat"); int currentDensity = -1; double distAv = 0.0; double distSD = 0.0; double sumWts = 0.0; int nValues = 0; Carray::const_iterator lastPoint = Points_.end() + 1; for (Carray::const_iterator point = Points_.begin(); point != lastPoint; ++point) { if (point == Points_.end() || point->PointsWithinEps() != currentDensity) { if (nValues > 0) { distAv = distAv / sumWts; //(double)nValues; distSD = (distSD / sumWts) - (distAv * distAv); if (distSD > 0.0) distSD = sqrt(distSD); else distSD = 0.0; //mprintf("Density %i: %i values Avg= %g SD= %g SumWts= %g\n", currentDensity, // nValues, distAv, distSD, sumWts); tempOut.Printf("%i %g\n", currentDensity, distAv); } if (point == Points_.end()) break; currentDensity = point->PointsWithinEps(); distAv = 0.0; distSD = 0.0; sumWts = 0.0; nValues = 0; } double wt = exp(point->Dist()); double dval = point->Dist() * wt; sumWts += wt; distAv += dval; distSD += (dval * dval); nValues++; } tempOut.CloseFile(); */ // BEGIN CALCULATING WEIGHTED DISTANCE AVERAGE CpptrajFile tempOut; tempOut.OpenWrite("temp.dat"); DataSet_Mesh weightedAverage; Carray::const_iterator cp = Points_.begin(); // Skip local density of 0. //while (cp->PointsWithinEps() == 0 && cp != Points_.end()) ++cp; while (cp != Points_.end()) { int densityVal = cp->PointsWithinEps(); Carray densityArray; // Add all points of current density. while (cp->PointsWithinEps() == densityVal && cp != Points_.end()) densityArray.push_back( *(cp++) ); mprintf("Density value %i has %zu points.\n", densityVal, densityArray.size()); // Sort array by distance std::sort(densityArray.begin(), densityArray.end(), Cpoint::dist_sort()); // Take the average of the points weighted by their position. double wtDistAv = 0.0; double sumWts = 0.0; //std::vector<double> weights; //weights.reserve( densityArray.size() ); int maxPt = (int)densityArray.size() - 1; for (int ip = 0; ip != (int)densityArray.size(); ++ip) { double wt = exp( (double)(ip - maxPt) ); //mprintf("\t%10i %10u %10u %10g\n", densityVal, ip, maxPt, wt); wtDistAv += (densityArray[ip].Dist() * wt); sumWts += wt; //weights.push_back( wt ); } wtDistAv /= sumWts; // Calculate the weighted sample variance //double distSD = 0.0; //for (unsigned int ip = 0; ip != densityArray.size(); ++ip) { // double diff = densityArray[ip].Dist() - wtDistAv; // distSD += weights[ip] * (diff * diff); //} //distSD /= sumWts; weightedAverage.AddXY(densityVal, wtDistAv); //tempOut.Printf("%i %g %g %g\n", densityVal, wtDistAv, sqrt(distSD), sumWts); tempOut.Printf("%i %g %g\n", densityVal, wtDistAv, sumWts); /* // Find the median. double median, Q1, Q3; if (densityArray.size() == 1) { median = densityArray[0].Dist(); Q1 = median; Q3 = median; } else { unsigned int q3_beg; unsigned int med_idx = densityArray.size() / 2; // Always 0 <= Q1 < med_idx if ((densityArray.size() % 2) == 0) { median = (densityArray[med_idx].Dist() + densityArray[med_idx-1].Dist()) / 2.0; q3_beg = med_idx; } else { median = densityArray[med_idx].Dist(); q3_beg = med_idx + 1; } if (densityArray.size() == 2) { Q1 = densityArray[0].Dist(); Q3 = densityArray[1].Dist(); } else { // Find lower quartile unsigned int q1_idx = med_idx / 2; if ((med_idx % 2) == 0) Q1 = (densityArray[q1_idx].Dist() + densityArray[q1_idx-1].Dist()) / 2.0; else Q1 = densityArray[q1_idx].Dist(); // Find upper quartile unsigned int q3_size = densityArray.size() - q3_beg; unsigned int q3_idx = (q3_size / 2) + q3_beg; if ((q3_size %2) == 0) Q3 = (densityArray[q3_idx].Dist() + densityArray[q3_idx-1].Dist()) / 2.0; else Q3 = densityArray[q3_idx].Dist(); } } mprintf("\tMedian dist value is %g. Q1= %g Q3= %g\n", median, Q1, Q3); */ } tempOut.CloseFile(); // END CALCULATING WEIGHTED DISTANCE AVERAGE /* // TEST tempOut.OpenWrite("temp2.dat"); std::vector<double> Hist( Points_.back().PointsWithinEps()+1, 0.0 ); int gWidth = 3; double cval = 3.0; double two_c_squared = 2.0 * cval * cval; mprintf("DBG: cval= %g, Gaussian denominator is %g\n", cval, two_c_squared); for (int wtIdx = 0; wtIdx != (int)weightedAverage.Size(); wtIdx++) { int bval = weightedAverage.X(wtIdx); for (int xval = std::max(bval - gWidth, 0); xval != std::min(bval + gWidth + 1, (int)Hist.size()); xval++) { // a: height (weighted average) // b: center (density value) // c: width // x: density value in histogram //int xval = weightedAverage.X(idx); //double bval = weightedAverage.X(wtIdx); //double bval = (double)wtIdx; double diff = (double)(xval - bval); //Hist[xval] += (weightedAverage.Y(wtIdx) * exp( -( (diff * diff) / two_c_squared ) )); Hist[xval] = std::max(Hist[xval], weightedAverage.Y(wtIdx) * exp( -( (diff * diff) / two_c_squared ) )); } } for (unsigned int idx = 0; idx != Hist.size(); idx++) tempOut.Printf("%u %g\n", idx, Hist[idx]); tempOut.CloseFile(); // END TEST */ /* // TEST // Construct best-fit line segments tempOut.OpenWrite("temp2.dat"); double slope, intercept, correl; int segment_length = 3; DataSet_Mesh Segment; Segment.Allocate1D( segment_length ); for (int wtIdx = 0; wtIdx != (int)weightedAverage.Size(); wtIdx++) { Segment.Clear(); for (int idx = std::max(wtIdx - 1, 0); // TODO: use segment_length idx != std::min(wtIdx + 2, (int)weightedAverage.Size()); idx++) Segment.AddXY(weightedAverage.X(idx), weightedAverage.Y(idx)); Segment.LinearRegression(slope, intercept, correl, true); for (int idx = std::max(wtIdx - 1, 0); // TODO: use segment_length idx != std::min(wtIdx + 2, (int)weightedAverage.Size()); idx++) { double x = weightedAverage.X(idx); double y = slope * x + intercept; tempOut.Printf("%g %g %i\n", x, y, weightedAverage.X(wtIdx)); } } tempOut.CloseFile(); // END TEST */ // BEGIN WEIGHTED RUNNING AVG/SD OF DISTANCES // For each point, determine if it is greater than the average of the // weighted average distances of the previous, current, and next densities. int width = 2; int currentDensity = 0; int wtIdx = 0; double currentAvg = 0.0; double deltaSD = 0.0; double deltaAv = 0.0; int Ndelta = 0; CpptrajFile raOut; if (!rafile_.empty()) raOut.OpenWrite(rafile_); CpptrajFile raDelta; if (!radelta_.empty()) raDelta.OpenWrite(radelta_); std::vector<unsigned int> candidateIdxs; std::vector<double> candidateDeltas; cp = Points_.begin(); // Skip over points with zero density while (cp != Points_.end() && cp->PointsWithinEps() == 0) ++cp; while (weightedAverage.X(wtIdx) != cp->PointsWithinEps() && wtIdx < (int)Points_.size()) ++wtIdx; for (Carray::const_iterator point = cp; point != Points_.end(); ++point) { if (point->PointsWithinEps() != currentDensity) { //currentAvg = weightedAverage.Y(wtIdx); // New density value. Determine average. currentAvg = 0.0; // unsigned int Npt = 0; double currentWt = 0.0; for (int idx = std::max(wtIdx - width, 0); idx != std::min(wtIdx + width + 1, (int)weightedAverage.Size()); idx++) { //currentAvg += weightedAverage.Y(idx); //Npt++; double wt = weightedAverage.Y(idx); currentAvg += (weightedAverage.Y(idx) * wt); currentWt += wt; } //currentAvg /= (double)Npt; currentAvg /= currentWt; //smoothAv += currentAvg; //smoothSD += (currentAvg * currentAvg); //Nsmooth++; currentDensity = point->PointsWithinEps(); if (raOut.IsOpen()) raOut.Printf("%i %g %g\n", currentDensity, currentAvg, weightedAverage.Y(wtIdx)); wtIdx++; } double delta = (point->Dist() - currentAvg); if (delta > 0.0) { //delta *= log((double)currentDensity); if (raDelta.IsOpen()) raDelta.Printf("%8i %8.3f %8i %8.3f %8.3f\n", currentDensity, delta, point->Fnum()+1, point->Dist(), currentAvg); candidateIdxs.push_back( point - Points_.begin() ); candidateDeltas.push_back( delta ); deltaAv += delta; deltaSD += (delta * delta); Ndelta++; } } raOut.CloseFile(); deltaAv /= (double)Ndelta; deltaSD = (deltaSD / (double)Ndelta) - (deltaAv * deltaAv); if (deltaSD > 0.0) deltaSD = sqrt(deltaSD); else deltaSD = 0.0; if (raDelta.IsOpen()) raDelta.Printf("#DeltaAvg= %g DeltaSD= %g\n", deltaAv, deltaSD); raDelta.CloseFile(); int cnum = 0; for (unsigned int i = 0; i != candidateIdxs.size(); i++) { if (candidateDeltas[i] > (deltaSD)) { Points_[candidateIdxs[i]].SetCluster( cnum++ ); mprintf("\tPoint %u (frame %i, density %i) selected as candidate for cluster %i\n", candidateIdxs[i], Points_[candidateIdxs[i]].Fnum()+1, Points_[candidateIdxs[i]].PointsWithinEps(), cnum-1); } } // END WEIGHTED AVG/SD OF DISTANCES /* // Currently doing this by calculating the running average of density vs // distance, then choosing points with distance > twice the SD of the // running average. // NOTE: Store in a mesh data set for now in case we want to spline etc later. if (avg_factor_ < 1) avg_factor_ = 10; unsigned int window_size = Points_.size() / (unsigned int)avg_factor_; mprintf("\tRunning avg window size is %u\n", window_size); // FIXME: Handle case where window_size < frames DataSet_Mesh runavg; unsigned int ra_size = Points_.size() - window_size + 1; runavg.Allocate1D( ra_size ); double dwindow = (double)window_size; double sumx = 0.0; double sumy = 0.0; for (unsigned int i = 0; i < window_size; i++) { sumx += (double)Points_[i].PointsWithinEps(); sumy += Points_[i].Dist(); } runavg.AddXY( sumx / dwindow, sumy / dwindow ); for (unsigned int i = 1; i < ra_size; i++) { unsigned int nextwin = i + window_size - 1; unsigned int prevwin = i - 1; sumx = (double)Points_[nextwin].PointsWithinEps() - (double)Points_[prevwin].PointsWithinEps() + sumx; sumy = Points_[nextwin].Dist() - Points_[prevwin].Dist() + sumy; runavg.AddXY( sumx / dwindow, sumy / dwindow ); } // Write running average if (!rafile_.empty()) { CpptrajFile raOut; if (raOut.OpenWrite(rafile_)) mprinterr("Error: Could not open running avg file '%s' for write.\n", rafile_.c_str()); else { for (unsigned int i = 0; i != runavg.Size(); i++) raOut.Printf("%g %g\n", runavg.X(i), runavg.Y(i)); raOut.CloseFile(); } } double ra_sd; double ra_avg = runavg.Avg( ra_sd ); // Double stdev to use as cutoff for findning anomalously high peaks. ra_sd *= 2.0; mprintf("\tAvg of running avg set is %g, SD*2.0 (delta cutoff) is %g\n", ra_avg, ra_sd); // For each point in density vs distance plot, determine which running // average point is closest. If the difference between the point and the // running average point is > 2.0 the SD of the running average data, // consider it a 'peak'. CpptrajFile raDelta; if (!radelta_.empty()) raDelta.OpenWrite("radelta.dat"); if (raDelta.IsOpen()) raDelta.Printf("%-10s %10s %10s\n", "#Frame", "RnAvgPos", "Delta"); unsigned int ra_position = 0; // Position in the runavg DataSet unsigned int ra_end = runavg.Size() - 1; int cnum = 0; for (Carray::iterator point = Points_.begin(); point != Points_.end(); ++point) { if (ra_position != ra_end) { // Is the next running avgd point closer to this point? while (ra_position != ra_end) { double dens = (double)point->PointsWithinEps(); double diff0 = fabs( dens - runavg.X(ra_position ) ); double diff1 = fabs( dens - runavg.X(ra_position+1) ); if (diff1 < diff0) ++ra_position; // Next running avg position is closer for this point. else break; // This position is closer. } } double delta = point->Dist() - runavg.Y(ra_position); if (raDelta.IsOpen()) raDelta.Printf("%-10i %10u %10g", point->Fnum()+1, ra_position, delta); if (delta > ra_sd) { if (raDelta.IsOpen()) raDelta.Printf(" POTENTIAL CLUSTER %i", cnum); point->SetCluster(cnum++); } if (raDelta.IsOpen()) raDelta.Printf("\n"); } raDelta.CloseFile(); */ return cnum; }