void makeOutputFFIndex(char *par, const int mpi_rank, void (*print)(HHblits &, std::stringstream &), std::vector<OutputFFIndex> &outDatabases) { if (*par) { OutputFFIndex db; strcpy(db.base, par); db.offset = 0; db.print = print; char data_filename_out_rank[NAMELEN]; char index_filename_out_rank[NAMELEN]; snprintf(data_filename_out_rank, FILENAME_MAX, "%s.ffdata.%d", par, mpi_rank); snprintf(index_filename_out_rank, FILENAME_MAX, "%s.ffindex.%d", par, mpi_rank); db.data_fh = fopen(data_filename_out_rank, "w+"); db.index_fh = fopen(index_filename_out_rank, "w+"); if (db.data_fh == NULL) { HH_LOG(WARNING) << "Could not open datafile " << data_filename_out_rank << std::endl; return; } if (db.index_fh == NULL) { HH_LOG(WARNING) << "Could not open indexfile " << index_filename_out_rank << std::endl; return; } outDatabases.push_back(db); } }
void Prefilter::checkCSFormat(size_t nr_checks) { for (size_t n = 0; n < std::min(nr_checks, num_dbs); n++) { if (first[n][0] == '>') { nr_checks--; } } if (nr_checks == 0) { HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl; HH_LOG(ERROR) << "\tYour cs database is in an old format!" << std::endl; HH_LOG(ERROR) << "\tThis format is no longer supportet!" << std::endl; HH_LOG(ERROR) << "\tCorrespond to the user manual!" << std::endl; exit(1); } }
/**
 * Supplies a default results file name when none was given: the input file
 * name with its extension removed, followed by ".hhr".
 *
 * @param par  Parameters whose outfile is filled in when it is empty.
 */
void checkOutput(Parameters& par) {
  // An explicit output file name always wins.
  if (*par.outfile) {
    return;
  }

  RemoveExtension(par.outfile, par.infile);
  strcat(par.outfile, ".hhr");
  HH_LOG(INFO) << "Search results will be written to " << par.outfile << "\n";
}
/**
 * Expands a BLOSUM table stored as a lower triangle (row by row, diagonal
 * included) into the full symmetric 20x20 matrix P and zeroes pb[0..19].
 *
 * @param matrix    Numeric matrix identifier; used for the log message only.
 * @param BlosumXX  Lower-triangular table; 210 values are read.
 * @param pb        Background frequency array; entries 0..19 are reset to 0.
 * @param P         Output matrix.
 */
void SetBlosumMatrix(const char matrix, const float BlosumXX[], float* pb, float P[20][20]) {
  // matrix is a char holding a number: stream it as an integer, otherwise the
  // log shows the character with that code instead of the matrix number.
  HH_LOG(DEBUG) << "Using the BLOSUM " << static_cast<int>(matrix) << " matrix" << std::endl;

  // Fill the lower triangle including the diagonal.
  int n = 0;
  for (int a = 0; a < 20; ++a) {
    pb[a] = 0.0f;
    for (int b = 0; b <= a; ++b, ++n) {
      P[a][b] = BlosumXX[n];
    }
  }

  // Mirror it into the upper triangle.
  for (int a = 0; a < 19; ++a) {
    for (int b = a + 1; b < 20; ++b) {
      P[a][b] = P[b][a];
    }
  }
}
/////////////////////////////////////////////////////////////////////////////////////////////////// // Pull out all names from prefilter db file and copy into dbfiles_new for full HMM-HMM comparison /////////////////////////////////////////////////////////////////////////////////////////////////// void Prefilter::init_no_prefiltering(FFindexDatabase* cs219_database, std::vector<std::pair<int, std::string> >& prefiltered_entries) { ffindex_index_t* db_index = cs219_database->db_index; for (size_t n = 0; n < db_index->n_entries; n++) { ffindex_entry_t* entry = ffindex_get_entry_by_index(db_index, n); prefiltered_entries.push_back( std::make_pair<int, std::string>(entry->length, std::string(entry->name))); } HH_LOG(INFO) << "Searching " << prefiltered_entries.size() << " database HHMs without prefiltering" << std::endl; }
int main(int argc, char **argv) { Parameters par; HHblits::ProcessAllArguments(argc, argv, par); checkOutput(par); std::vector<HHblitsDatabase*> databases; HHblits::prepareDatabases(par, databases); #ifdef OPENMP omp_set_num_threads(par.threads); #endif HHblits hhblits(par, databases); FILE* inf; if(strcmp(par.infile, "stdin") == 0) { inf = stdin; } else { inf = fopen(par.infile, "r"); } if(!inf) { HH_LOG(ERROR) << "Input file (" << par.infile << ") could not be opened!" << std::endl; exit(1); } hhblits.run(inf, par.infile); fclose(inf); if(Log::reporting_level() >= INFO) { hhblits.printHitList(); } hhblits.writeHHRFile(par.outfile); hhblits.writeAlisFile(par.alisbasename); hhblits.writeScoresFile(par.scorefile); hhblits.writeM8(par.m8file); hhblits.writePairwiseAlisFile(par.pairwisealisfile, par.outformat); hhblits.writeAlitabFile(par.alitabfile); hhblits.writePsiFile(par.psifile); hhblits.writeHMMFile(par.hhmfile); hhblits.writeA3MFile(par.alnfile); hhblits.writeMatricesFile(par.matrices_output_file); for(size_t i = 0; i < databases.size(); i++) { delete databases[i]; } databases.clear(); }
////////////////////////////////////////////////////////////// // Reading in column state sequences for prefiltering ////////////////////////////////////////////////////////////// void Prefilter::init_prefilter(FFindexDatabase* cs219_database) { // Set up variables for prefiltering num_dbs = cs219_database->db_index->n_entries; first = (unsigned char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(unsigned char*)); length = (int*) mem_align(ALIGN_FLOAT, num_dbs * sizeof(int)); dbnames = (char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(char*)); for (size_t n = 0; n < num_dbs; n++) { ffindex_entry_t* entry = ffindex_get_entry_by_index( cs219_database->db_index, n); first[n] = (unsigned char*) ffindex_get_data_by_entry( cs219_database->db_data, entry); length[n] = entry->length - 1; dbnames[n] = new char[strlen(entry->name) + 1]; strcpy(dbnames[n], entry->name); } //check if cs219 format is new binary format checkCSFormat(5); HH_LOG(INFO) << "Searching " << num_dbs << " column state sequences." << std::endl; }
//// Processing input options from command line void ProcessArguments(Parameters& par) { const int argc = par.argc; const char** argv = par.argv; // Read command line options for (int i = 1; i <= argc - 1; i++) { HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl; if (!strcmp(argv[i], "-i")) { if (++i > argc - 1 || argv[i][0] == '-') { HH_LOG(ERROR) << "No input file following -f" << std::endl; exit(4); } else { strcpy(par.infile, argv[i]); } } else if (!strcmp(argv[i], "-o")) { par.append = 0; if (++i > argc - 1) { HH_LOG(ERROR) << "No output file following -o" << std::endl; exit(4); } else strcpy(par.outfile, argv[i]); } else if (!strcmp(argv[i], "-a")) { par.append = 1; if (++i > argc - 1) { HH_LOG(ERROR) << "No output file following -a" << std::endl; exit(4); } else strcpy(par.outfile, argv[i]); } else if (!strcmp(argv[i], "-v") && (i + 1 < argc) && argv[i + 1][0] != '-') { int v = atoi(argv[++i]); par.v = Log::from_int(v); Log::reporting_level() = par.v; } else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1)) par.maxseq = atoi(argv[++i]); else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) { par.maxres = atoi(argv[++i]); par.maxcol = par.maxres * 2; } else if (!strcmp(argv[i], "-id") && (i < argc - 1)) par.max_seqid = atoi(argv[++i]); else if (!strcmp(argv[i], "-qid") && (i < argc - 1)) par.qid = atoi(argv[++i]); else if (!strcmp(argv[i], "-qsc") && (i < argc - 1)) par.qsc = atof(argv[++i]); else if (!strcmp(argv[i], "-cov") && (i < argc - 1)) par.coverage = atoi(argv[++i]); else if (!strcmp(argv[i], "-diff") && (i < argc - 1)) par.Ndiff = atoi(argv[++i]); else if (!strcmp(argv[i], "-neff") && (i < argc - 1)) par.Neff = atof(argv[++i]); else if (!strcmp(argv[i], "-Neff") && (i < argc - 1)) par.Neff = atof(argv[++i]); else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) par.M = 3; else if (argv[i][0] >= '0' && argv[i][0] <= '9') { par.Mgaps = 
atoi(argv[i]); par.M = 2; } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { help(par); exit(0); } else { HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl; } } }
int main(int argc, const char **argv) { Parameters par(argc, argv); strcpy(par.infile, ""); strcpy(par.outfile, ""); // maximum number of sequences to be written par.nseqdis = par.maxseq - 1; // no filtering for maximum diversity par.Ndiff = 0; ProcessArguments(par); // Check command line input and default values if (!*par.infile) { help(par); HH_LOG(ERROR) << "Input file is missing!" << std::endl; exit(4); } if (!*par.outfile) { help(par); HH_LOG(ERROR) << "Output file is missing!" << std::endl; exit(4); } HH_LOG(INFO) << "Input file = " << par.infile << "\n"; HH_LOG(INFO) << "Output file = " << par.outfile << "\n"; // Reads in an alignment from par.infile into matrix X[k][l] as ASCII FILE* inf = NULL; if (strcmp(par.infile, "stdin")) { inf = fopen(par.infile, "r"); if (!inf) { OpenFileError(par.infile, __FILE__, __LINE__, __func__); } } else { inf = stdin; } Alignment qali(par.maxseq, par.maxres); qali.Read(inf, par.infile, par.mark, par.maxcol, par.nseqdis); fclose(inf); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] // and store marked sequences in name[k] and seq[k] qali.Compress(par.infile, par.cons, par.maxcol, par.M, par.Mgaps); // substitution matrix flavours float __attribute__((aligned(16))) P[20][20]; float __attribute__((aligned(16))) R[20][20]; float __attribute__((aligned(16))) Sim[20][20]; float __attribute__((aligned(16))) S[20][20]; float __attribute__((aligned(16))) pb[21]; SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim); // Remove sequences with seq. 
identity larger than seqid percent (remove the shorter of two) qali.N_filtered = qali.Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc,par.Ndiff); // Atune alignment diversity q.Neff with qsc to value Neff_goal if (par.Neff >= 1.0) { qali.FilterNeff(par.wg, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim); } // Write filtered alignment WITH insert states (lower case) to alignment file qali.WriteToFile(par.outfile, par.append); }
///////////////////////////////////////////////////////////////////////////////////// // Set (global variable) substitution matrix with derived matrices and background frequencies ///////////////////////////////////////////////////////////////////////////////////// void SetSubstitutionMatrix(const char matrix, float* pb, float P[20][20], float R[20][20], float S[20][20], float Sim[20][20]) { int a,b; switch (matrix) { default: case 0: //Gonnet matrix HH_LOG(DEBUG) << "Using the Gonnet matrix" << std::endl; for (a=0; a<20; ++a) for (pb[a]=0.0f, b=0; b<20; ++b) P[a][b] = 0.000001f*Gonnet[a*20+b]; break; case 30: //BLOSUM30 SetBlosumMatrix(matrix, Blosum30, pb, P); break; case 40: //BLOSUM40 SetBlosumMatrix(matrix, Blosum40, pb, P); break; case 50: //BLOSUM50 SetBlosumMatrix(matrix, Blosum50, pb, P); break; case 62: //BLOSUM62 SetBlosumMatrix(matrix, Blosum62, pb, P); break; case 65: //BLOSUM65 SetBlosumMatrix(matrix, Blosum65, pb, P); break; case 80: //BLOSUM80 SetBlosumMatrix(matrix, Blosum80, pb, P); break; } // Check transition probability matrix, renormalize P and calculate pb[a] float sumab=0.0f; for (a=0; a<20; a++) for (b=0; b<20; ++b) sumab+=P[a][b]; for (a=0; a<20; a++) for (b=0; b<20; ++b) P[a][b]/=sumab; for (a=0; a<20; a++) for (pb[a]=0.0f, b=0; b<20; ++b) pb[a]+=P[a][b]; //Compute similarity matrix for amino acid pairs (for calculating consensus sequence) for (a=0; a<20; ++a) for (b=0; b<20; ++b) Sim[a][b] = P[a][b]*P[a][b]/P[a][a]/P[b][b]; //Precompute matrix R for amino acid pseudocounts: for (a=0; a<20; ++a) for (b=0; b<20; ++b) R[a][b] = P[a][b]/pb[b]; //R[a][b]=P(a|b) //Precompute matrix R for amino acid pseudocounts: for (a=0; a<20; ++a) for (b=0; b<20; ++b) S[a][b] = log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b)) // Evaluate sequence identity underlying substitution matrix if (Log::reporting_level() >= DEBUG) { float id=0.0f; float entropy=0.0f; float entropy_pb=0.0f; float mut_info=0.0f; for (a=0; a<20; ++a) id+=P[a][a]; for (a=0; a<20; 
++a) entropy_pb-=pb[a]*log2(pb[a]); for (a=0; a<20; ++a) for (b=0; b<20; ++b) { entropy-=P[a][b]*log2(R[a][b]); mut_info += P[a][b]*S[a][b]; } HH_LOG(DEBUG) << "sequence identity = " << 100*id << " %; entropy per column = " << entropy << " bits (out of " << entropy_pb << "); mutual information = " << mut_info << " bits" << std::endl; } //Debugging: probability matrix and dissimilarity matrix if (Log::reporting_level() >= DEBUG1) { HH_LOG(DEBUG) << "Check matrix: before renormalization sum P(a,b)= "<<sumab<<"...\n";//PRINT HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; HH_LOG(DEBUG) <<"p[] "; for (a=0; a<20; a++) HH_LOG(DEBUG) << 100*pb[a] << " "; HH_LOG(DEBUG) <<std::endl<<"\nSubstitution matrix log2( P(a,b)/p(a)/p(b) ) (in bits):\n"; HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (b=0; b<20; b++) { HH_LOG(DEBUG) << i2aa(b) << " "; for (a=0; a<20; a++) HH_LOG(DEBUG) << S[a][b] << " "; HH_LOG(DEBUG) << std::endl; } HH_LOG(DEBUG) << std::endl << "\nOdds matrix P(a,b)/p(a)/p(b):\n"; HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (b=0; b<20; b++) { HH_LOG(DEBUG) <<i2aa(b)<<" "; for (a=0; a<20; a++) HH_LOG(DEBUG) << P[b][a]/pb[a]/pb[b] << " "; HH_LOG(DEBUG) <<std::endl; } HH_LOG(DEBUG) <<std::endl<<"\nMatrix of conditional probabilities P(a|b) = P(a,b)/p(b) (in %):\n"; HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (b=0; b<20; b++) { HH_LOG(DEBUG) <<i2aa(b)<<" "; for (a=0; a<20; a++) HH_LOG(DEBUG) << 100*R[b][a] << " "; HH_LOG(DEBUG) <<std::endl; } HH_LOG(DEBUG) <<std::endl<<"\nProbability matrix P(a,b) (in 10E-6):\n"; HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (b=0; b<20; b++) { HH_LOG(DEBUG) <<i2aa(b)<<" "; for (a=0; a<20; a++) HH_LOG(DEBUG) << 1000000*P[b][a] << " "; HH_LOG(DEBUG) <<std::endl; } HH_LOG(DEBUG) <<std::endl<<"Similarity matrix P(a,b)^2/P(a,a)/P(b,b) (in %):\n"; HH_LOG(DEBUG) <<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (b=0; b<20; b++) { HH_LOG(DEBUG) 
<<i2aa(b)<<" "; for (a=0; a<20; a++) HH_LOG(DEBUG) << 100*Sim[b][a] << " "; HH_LOG(DEBUG) <<std::endl; } HH_LOG(DEBUG) <<std::endl; } }
////////////////////////////////////////////////////////////////////////
// Main prefilter function
////////////////////////////////////////////////////////////////////////
/**
 * Runs the two-stage prefilter over all loaded column state sequences and
 * reports the surviving database entries.
 *
 * Stage 1 scores every sequence with ungapped_sse_score(), stage 2 rescores
 * the survivors with swStripedByte() and converts the score to an e-value.
 * After each stage the candidates are sorted and cut, always keeping at least
 * min_prefilter_hits entries. Each surviving entry is appended as a
 * (length, name) pair to old_prefilter_hits when previous_hits contains
 * "<name without extension>__1", otherwise to new_prefilter_hits.
 *
 * @param q_tmp                          Query HMM; its length L sizes the profile and the workspaces.
 * @param previous_hits                  Looked up to tell old hits from new ones.
 * @param threads                        Number of per-thread workspaces to allocate.
 * @param prefilter_gap_open             Added to prefilter_gap_extend to form the gap-init cost.
 * @param prefilter_gap_extend           Gap extension cost.
 * @param prefilter_score_offset         Passed to the profile builder and both scoring kernels.
 * @param prefilter_bit_factor           Score scaling used in both stages.
 * @param prefilter_evalue_thresh        Stage-2 cut: entries above it are dropped once the minimum is reached.
 * @param prefilter_evalue_coarse_thresh Stage-2 entries are recorded only below this e-value.
 * @param preprefilter_smax_thresh       Stage-1 cut: entries at or below it are dropped once the minimum is reached.
 * @param min_prefilter_hits             Minimum number of entries kept by each cut.
 * @param maxnumdb                       Upper limit of stage-2 hits that are processed.
 * @param R                              Not referenced in this function.
 * @param new_prefilter_hits             Output: entries not found in previous_hits.
 * @param old_prefilter_hits             Output: entries found in previous_hits.
 */
void Prefilter::prefilter_db(HMM* q_tmp, Hash<Hit>* previous_hits,
    const int threads, const int prefilter_gap_open,
    const int prefilter_gap_extend, const int prefilter_score_offset,
    const int prefilter_bit_factor, const double prefilter_evalue_thresh,
    const double prefilter_evalue_coarse_thresh,
    const int preprefilter_smax_thresh, const int min_prefilter_hits,
    const int maxnumdb, const float R[20][20],
    std::vector<std::pair<int, std::string> >& new_prefilter_hits,
    std::vector<std::pair<int, std::string> >& old_prefilter_hits) {

  // Names already reported in this call; prevents duplicate output entries.
  Hash<char>* doubled = new Hash<char>;
  doubled->New(16381, 0);

  int element_count = (VECSIZE_INT * 4);
  // Band width W: query length divided by element_count, rounded up.
  int W = (q_tmp->L + (element_count - 1)) / element_count;
  // query profile (states + 1 because of ANY char)
  unsigned char* qc = (unsigned char*)malloc_simd_int((hh::NUMCOLSTATES+1)*(q_tmp->L+element_count)*sizeof(unsigned char));
  stripe_query_profile(q_tmp, prefilter_score_offset, prefilter_bit_factor, W, qc);

  // One scratch buffer per thread, indexed by the OpenMP thread number below.
  simd_int ** workspace = new simd_int *[threads];

  std::vector<std::pair<int, int> > first_prefilter;
  std::vector<std::pair<double, int> > hits;

  int count_dbs = 0;
  int gap_init = prefilter_gap_open + prefilter_gap_extend;
  int gap_extend = prefilter_gap_extend;
  int LQ = q_tmp->L;
  const float log_qlen = flog2(LQ);
  const double factor = (double) num_dbs * LQ;

  for (int i = 0; i < threads; i++)
    workspace[i] = (simd_int*) malloc_simd_int(
        3 * (LQ + element_count) * sizeof(char));

  // Stage 1: score every database sequence.
#pragma omp parallel for schedule(static)
  // Loop over all database sequences
  for (size_t n = 0; n < num_dbs; n++) {
    int thread_id = 0;
#ifdef OPENMP
    thread_id = omp_get_thread_num();
#endif
    // Perform search step
    int score = ungapped_sse_score(qc, LQ, first[n], length[n],
        prefilter_score_offset, workspace[thread_id]);

    // Subtract a term that grows with the logarithms of both lengths.
    score = score
        - (int) (prefilter_bit_factor * (log_qlen + flog2(length[n])));

    // The result vector is shared between threads: serialize the append.
#pragma omp critical
    first_prefilter.push_back(std::pair<int, int>(score, n));
  }

  //filter after calculation of ungapped sse score to include at least min_prefilter_hits
  std::vector<std::pair<int, int> >::iterator it;

  // Highest score first (ascending sort followed by reversal).
  sort(first_prefilter.begin(), first_prefilter.end());
  std::reverse(first_prefilter.begin(), first_prefilter.end());

  std::vector<std::pair<int, int> >::iterator first_prefilter_begin_erase =
      first_prefilter.end();
  std::vector<std::pair<int, int> >::iterator first_prefilter_end_erase =
      first_prefilter.end();
  count_dbs = 0;
  // Find the first entry that is past the guaranteed minimum and does not
  // exceed the score threshold; that entry and everything after it is dropped.
  for (it = first_prefilter.begin(); it < first_prefilter.end(); it++) {
    if (count_dbs >= min_prefilter_hits
        && (*it).first <= preprefilter_smax_thresh) {
      first_prefilter_begin_erase = it;
      break;
    }
    else {
      count_dbs++;
    }
  }

  first_prefilter.erase(first_prefilter_begin_erase,
      first_prefilter_end_erase);

  HH_LOG(INFO)
      << "HMMs passed 1st prefilter (gapless profile-profile alignment) : "
      << count_dbs << std::endl;

  // Stage 2: rescore the survivors of stage 1.
#pragma omp parallel for schedule(static)
  // Loop over all sequences that passed the first prefilter
  for (size_t i = 0; i < first_prefilter.size(); i++) {
    int thread_id = 0;
#ifdef OPENMP
    thread_id = omp_get_thread_num();
#endif

    int n = first_prefilter[i].second;

    // Perform search step
    // The per-thread workspace is handed over as three regions W elements apart.
    int score = swStripedByte(qc, LQ, first[n], length[n], gap_init,
        gap_extend, workspace[thread_id], workspace[thread_id] + W,
        workspace[thread_id] + 2 * W, prefilter_score_offset);

    // NOTE(review): score and prefilter_bit_factor are both int, so this
    // division truncates before fpow2() is applied - confirm this is intended.
    double evalue = factor * length[n] * fpow2(-score / prefilter_bit_factor);

    if (evalue < prefilter_evalue_coarse_thresh) {
      // The result vector is shared between threads: serialize the append.
#pragma omp critical
      hits.push_back(std::pair<double, int>(evalue, n));
    }
  }

  //filter after calculation of evalues to include at least min_prefilter_hits
  // Smallest e-value first.
  sort(hits.begin(), hits.end());

  std::vector<std::pair<double, int> >::iterator second_prefilter_begin_erase =
      hits.end();
  std::vector<std::pair<double, int> >::iterator second_prefilter_end_erase =
      hits.end();
  std::vector<std::pair<double, int> >::iterator it2;

  count_dbs = 0;
  // Same cut as in stage 1, now on the e-value threshold.
  for (it2 = hits.begin(); it2 < hits.end(); it2++) {
    if (count_dbs >= min_prefilter_hits
        && (*it2).first > prefilter_evalue_thresh) {
      second_prefilter_begin_erase = it2;
      break;
    }
    else {
      count_dbs++;
    }
  }

  hits.erase(second_prefilter_begin_erase, second_prefilter_end_erase);

  count_dbs = 0;

  for (it2 = hits.begin(); it2 < hits.end(); it2++) {
    // Add hit to the output lists
    count_dbs++;

    // NOTE(review): unbounded copies into NAMELEN-sized buffers; this relies
    // on database names (plus the "__1" suffix) fitting - confirm.
    char db_name[NAMELEN];
    strcpy(db_name, dbnames[(*it2).second]);

    char name[NAMELEN];
    RemoveExtension(name, db_name);

    if (!doubled->Contains(db_name)) {
      doubled->Add(db_name);

      std::pair<int, std::string> result;
      result.first = length[(*it2).second];
      result.second = std::string(db_name);

      // check, if DB was searched in previous rounds
      strcat(name, "__1");  // irep=1

      if (previous_hits->Contains(name)) {
        old_prefilter_hits.push_back(result);
      }
      else {
        new_prefilter_hits.push_back(result);
      }
    }
    // Stop once the allowed number of hits has been processed.
    if (count_dbs >= maxnumdb)
    {
      HH_LOG(WARNING)
      << "Number of hits passing 2nd prefilter (reduced from " << hits.size() << " to allowed maximum of " << maxnumdb << ").\n"
      <<"You can increase the allowed maximum using the -maxfilt <max> option.\n";
      break;
    }
  }

  // Free memory
  free(qc);
  for (int i = 0; i < threads; i++)
    free(workspace[i]);
  delete[] workspace;
  if (doubled)
    delete doubled;
}
///////////////////////////////////////////////////////////////////////////////////// //// MAIN PROGRAM ///////////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used) int argc_conf; // Number of arguments in argv_conf strcpy(par.infile, ""); strcpy(par.outfile, ""); strcpy(par.alnfile, ""); //Default parameter settings par.nseqdis = MAXSEQ - 1; // maximum number of sequences to be written par.showcons = 0; par.cons = 1; par.Ndiff = 0; par.max_seqid = 100; par.coverage = 0; par.pc_hhm_context_engine.pca = 0.0; // no amino acid pseudocounts par.pc_hhm_nocontext_a = 0.0; // no amino acid pseudocounts par.gapb = 0.0; // no transition pseudocounts // Make command line input globally available par.argv = argv; par.argc = argc; RemovePathAndExtension(program_name, argv[0]); // Enable changing verbose mode before defaults file and command line are processed int v = 2; for (int i = 1; i < argc; i++) { if (!strcmp(argv[i], "-def")) par.readdefaultsfile = 1; else if (strcmp(argv[i], "-v") == 0) { v = atoi(argv[i + 1]); } } par.v = Log::from_int(v); Log::reporting_level() = par.v; par.SetDefaultPaths(); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf, argv_conf); ProcessArguments(argc_conf, argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc, argv); Alignment* qali = new Alignment(MAXSEQ, par.maxres); HMM* q = new HMM(MAXSEQDIS, par.maxres); //Create a HMM with maximum of par.maxres match states // q is only available after maxres is known, so we had to move this here for (int i = 1; i <= argc - 1; i++) { if (!strcmp(argv[i], "-name") && (i < argc - 1)) { strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name... 
strmcpy(q->longname, argv[i], DESCLEN - 1); //copy full name to longname } } // Check command line input and default values if (!*par.infile) { help(); HH_LOG(ERROR) << "Input file is missing!" << std::endl; exit(4); } // Get basename RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension): // Outfile not given? Name it basename.hhm if (!*par.outfile && !*par.alnfile) { RemoveExtension(par.outfile, par.infile); strcat(par.outfile, ".seq"); } // Prepare CS pseudocounts lib if (!par.nocontxt && *par.clusterfile) { InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); } // Set substitution matrix; adjust to query aa distribution if par.pcm==3 SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format = 0; ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim); // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction // Add Pseudocounts, if no HMMER input if (input_format == 0) { // Transform transition freqs to lin space if not already done q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg, par.gaph, par.gapi, par.gapb, par.gapb); // Comput substitution matrix pseudocounts if (par.nocontxt) { // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q->PreparePseudocounts(R); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } else { // Add full context specific pseudocounts to query q->AddContextSpecificPseudocounts(pc_hhm_context_engine, pc_hhm_context_mode); } } else { q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c); } 
q->CalculateAminoAcidBackground(pb); if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs( par.half_window_size_local_aa_bg_freqs, pb); // Write consensus sequence to sequence file // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions() if (*par.outfile) { FILE* outf = NULL; if (strcmp(par.outfile, "stdout")) { outf = fopen(par.outfile, "a"); if (!outf) OpenFileError(par.outfile, __FILE__, __LINE__, __func__); } else outf = stdout; // OLD //// ">name_consensus" -> ">name consensus" //strsubst(q->sname[q->nfirst],"_consensus"," consensus"); //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1); // NEW (long header needed for NR30cons database) fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1); fclose(outf); } // Print A3M/A2M/FASTA output alignment if (*par.alnfile) { HalfAlignment qa; int n = imin(q->n_display, par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0) + (q->nss_conf >= 0) + (q->ncons >= 0)); qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred, q->nss_conf, q->nsa_dssp, q->ncons); if (par.outformat == 1) qa.BuildFASTA(); else if (par.outformat == 2) qa.BuildA2M(); else if (par.outformat == 3) qa.BuildA3M(); if (qali->readCommentLine) qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile else qa.Print(par.alnfile, par.append); // print alignment to outfile } delete qali; delete q; DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode); }
///////////////////////////////////////////////////////////////////////////////////// //// Processing input options from command line and .hhdefaults file ///////////////////////////////////////////////////////////////////////////////////// void ProcessArguments(int argc, char** argv) { // Read command line options for (int i = 1; i <= argc - 1; i++) { HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl; if (!strcmp(argv[i], "-i")) { if (++i >= argc || argv[i][0] == '-') { help(); HH_LOG(ERROR) << "No input file following -i" << std::endl; exit(4); } else strcpy(par.infile, argv[i]); } else if (!strcmp(argv[i], "-s")) { if (++i >= argc) { help(); HH_LOG(ERROR) << "No output file following -s" << std::endl; exit(4); } else strcpy(par.outfile, argv[i]); } else if (!strcmp(argv[i], "-o")) { par.outformat = 3; if (++i >= argc) { help(); HH_LOG(ERROR) << "No output file following -o" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-ofas")) { par.outformat = 1; if (++i >= argc || argv[i][0] == '-') { help(); HH_LOG(ERROR) << "No output file following -o" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-oa2m")) { par.outformat = 2; if (++i >= argc || argv[i][0] == '-') { help(); HH_LOG(ERROR) << "No output file following -o" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-oa3m")) { par.outformat = 3; if (++i >= argc || argv[i][0] == '-') { help(); HH_LOG(ERROR) << "No output file following -o" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { help(); exit(0); } else if (!strcmp(argv[i], "-v") && (i < argc - 1) && argv[i + 1][0] != '-') { int v = atoi(argv[++i]); par.v = Log::from_int(v); Log::reporting_level() = par.v; } else if (!strcmp(argv[i], "-seq") && (i < argc - 1)) par.nseqdis = atoi(argv[++i]); else if (!strcmp(argv[i], "-id") && (i < argc - 1)) par.max_seqid = 
atoi(argv[++i]); else if (!strcmp(argv[i], "-qid") && (i < argc - 1)) par.qid = atoi(argv[++i]); else if (!strcmp(argv[i], "-qsc") && (i < argc - 1)) par.qsc = atof(argv[++i]); else if (!strcmp(argv[i], "-cov") && (i < argc - 1)) par.coverage = atoi(argv[++i]); else if (!strcmp(argv[i], "-diff") && (i < argc - 1)) par.Ndiff = atoi(argv[++i]); else if (!strcmp(argv[i], "-M") && (i < argc - 1)) if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")) par.M = 1; else if (!strcmp(argv[i], "first")) par.M = 3; else if (argv[i][0] >= '0' && argv[i][0] <= '9') { par.Mgaps = atoi(argv[i]); par.M = 2; } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; else if (!strcmp(argv[i], "-Gonnet")) par.matrix = 0; else if (!strncmp(argv[i], "-BLOSUM", 7) || !strncmp(argv[i], "-Blosum", 7)) { if (!strcmp(argv[i] + 7, "30")) par.matrix = 30; else if (!strcmp(argv[i] + 7, "40")) par.matrix = 40; else if (!strcmp(argv[i] + 7, "50")) par.matrix = 50; else if (!strcmp(argv[i] + 7, "65")) par.matrix = 65; else if (!strcmp(argv[i] + 7, "80")) par.matrix = 80; else HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl; } else if (!strcmp(argv[i], "-pcm") && (i < argc - 1)) par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]); else if (!strcmp(argv[i], "-pca") && (i < argc - 1)) par.pc_hhm_context_engine.pca = atof(argv[++i]); else if (!strcmp(argv[i], "-pcb") && (i < argc - 1)) par.pc_hhm_context_engine.pcb = atof(argv[++i]); else if (!strcmp(argv[i], "-pcc") && (i < argc - 1)) par.pc_hhm_context_engine.pcc = atof(argv[++i]); else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) { par.gapb = atof(argv[++i]); if (par.gapb <= 0.01) par.gapb = 0.01; } else if (!strcmp(argv[i], "-gapd") && (i < argc - 1)) par.gapd = atof(argv[++i]); else if (!strcmp(argv[i], "-gape") && (i < argc - 1)) par.gape = atof(argv[++i]); else if (!strcmp(argv[i], "-gapf") && (i < argc - 1)) par.gapf = atof(argv[++i]); else if (!strcmp(argv[i], "-gapg") 
&& (i < argc - 1)) par.gapg = atof(argv[++i]); else if (!strcmp(argv[i], "-gaph") && (i < argc - 1)) par.gaph = atof(argv[++i]); else if (!strcmp(argv[i], "-gapi") && (i < argc - 1)) par.gapi = atof(argv[++i]); else if (!strcmp(argv[i], "-def")) par.readdefaultsfile = 1; else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) par.maxres = par.maxcol = atoi(argv[++i]); else if (!strcmp(argv[i], "-nocontxt")) par.nocontxt = 1; else if (!strcmp(argv[i], "-csb") && (i < argc - 1)) par.csb = atof(argv[++i]); else if (!strcmp(argv[i], "-csw") && (i < argc - 1)) par.csw = atof(argv[++i]); else if (!strcmp(argv[i], "-cs")) { if (++i >= argc || argv[i][0] == '-') { help(); HH_LOG(ERROR) << "No query file following -cs" << std::endl; exit(4); } else strcpy(par.clusterfile, argv[i]); } else if (!strcmp(argv[i], "-name")) { // skip this, its handled somewhere else ; } else { HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << " ...\n"; } HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl; } // end of for-loop for command line input }
/**
 * Aligns the query against all entries of dbfiles and returns the collected
 * hits, repeating the search par.altali times.
 *
 * Per round the entries are processed in blocks: each block is sorted, its
 * entries are loaded HMMSimd::VEC_SIZE at a time into per-thread template
 * HMMs, aligned by the per-thread ViterbiConsumerThread, and the results are
 * merged into the returned hit list. In the first round, when
 * par.early_stopping_filter is set, the block size is 2000 and the block loop
 * stops once calculateEarlyStop() returns less than
 * block size * par.filter_thresh.
 *
 * @param par       Parameters forwarded to the helpers; altali, early_stopping_filter,
 *                  filter_thresh, exclstr, template_exclstr, nseqdis, smin and maxres are read here.
 * @param q_simd    Query; its HMM number 0 is used as the scalar query.
 * @param dbfiles   Database entries to align (taken by value).
 * @param qsc, pb, S, Sim, R  Forwarded to getTemplateHMM() / PrepareTemplateHMM().
 * @param ssm_mode, S73, S33, S37  Forwarded to the ViterbiConsumerThread constructor.
 * @return          All hits gathered by merge_thread_results().
 */
std::vector<Hit> ViterbiRunner::alignment(Parameters& par, HMMSimd * q_simd,
    std::vector<HHEntry*> dbfiles, const float qsc, float* pb,
    const float S[20][20], const float Sim[20][20], const float R[20][20], const int ssm_mode,
    const float S73[NDSSP][NSSPRED][MAXCF], const float S33[NSSPRED][MAXCF][NSSPRED][MAXCF],
    const float S37[NSSPRED][MAXCF][NDSSP]) {

  HMM * q = q_simd->GetHMM(0);
  // Initialize memory: VEC_SIZE template HMMs for every thread
  std::vector<HMM*> t_hmm;
  for(size_t i = 0; i < HMMSimd::VEC_SIZE * thread_count; i++) {
    HMM* t = new HMM(MAXSEQDIS, par.maxres);
    t_hmm.push_back(t);
  }

  // One SIMD template container and one consumer per thread
  HMMSimd** t_hmm_simd = new HMMSimd*[thread_count];
  std::vector<ViterbiConsumerThread *> threads;
  for (int thread_id = 0; thread_id < thread_count; thread_id++) {
    t_hmm_simd[thread_id] = new HMMSimd(par.maxres);
    ViterbiConsumerThread * thread = new ViterbiConsumerThread(thread_id, par, q_simd, t_hmm_simd[thread_id],viterbiMatrix[thread_id], ssm_mode, S73, S33, S37);
    threads.push_back(thread);
  }

  std::vector<Hit> ret_hits;
  std::vector<HHEntry*> dbfiles_to_align;
  std::map<std::string, std::vector<Viterbi::BacktraceResult> > excludeAlignments;
  // Start with all the database entries coming through the prefilter
  std::copy(dbfiles.begin(), dbfiles.end(), std::back_inserter(dbfiles_to_align));

  // loop to detect second/third/... best alignments
  for (int alignment = 0; alignment < par.altali; alignment++) {
    HH_LOG(INFO) << "Alternative alignment: " << alignment << std::endl;
    unsigned int allElementToAlignCount = dbfiles_to_align.size();
    // By default the whole list is one block
    unsigned int seqBlockSize = allElementToAlignCount;

    if(alignment == 0 && par.early_stopping_filter){
      seqBlockSize = 2000;
    }

    for(unsigned int seqJunkStart = 0; seqJunkStart < allElementToAlignCount; seqJunkStart += seqBlockSize ){
      // Sort the current block to improve performance. The original comment
      // says this is a descending sort for better thread utilisation; the
      // order is defined by HHDatabaseEntryCompare.
      unsigned int seqJunkSize = imin(allElementToAlignCount - (seqJunkStart), seqBlockSize);
      sort(dbfiles_to_align.begin() + seqJunkStart,
          dbfiles_to_align.begin() + (seqJunkStart + seqJunkSize),
          HHDatabaseEntryCompare());

      // read in data for thread
#pragma omp parallel for schedule(dynamic, 1)
      for (unsigned int idb = seqJunkStart; idb < (seqJunkStart + seqJunkSize); idb += HMMSimd::VEC_SIZE) {
        int current_thread_id = 0;
#ifdef OPENMP
        current_thread_id = omp_get_thread_num();
#endif
        // First slot in t_hmm that belongs to this thread
        const int current_t_index = (current_thread_id * HMMSimd::VEC_SIZE);

        std::vector<HMM *> templates_to_align;

        // read in alignment
        // The last group of a block may hold fewer than VEC_SIZE entries
        int maxResElem = imin((seqJunkStart + seqJunkSize) - (idb),
            HMMSimd::VEC_SIZE);

        for (int i = 0; i < maxResElem; i++) {
          HHEntry* entry = dbfiles_to_align.at(idb + i);

          int format_tmp = 0;
          char wg = 1; // performance reason
          entry->getTemplateHMM(par, wg, qsc, format_tmp, pb, S, Sim, t_hmm[current_t_index + i]);
          t_hmm[current_t_index + i]->entry = entry;

          PrepareTemplateHMM(par, q, t_hmm[current_t_index + i], format_tmp, false, pb, R);
          templates_to_align.push_back(t_hmm[current_t_index + i]);
        }

        t_hmm_simd[current_thread_id]->MapHMMVector(templates_to_align);
        exclude_alignments(maxResElem, q_simd, t_hmm_simd[current_thread_id],
            excludeAlignments, viterbiMatrix[current_thread_id]);

        if(par.exclstr) {
          // Mask excluded regions
          exclude_regions(par.exclstr, maxResElem, q_simd, t_hmm_simd[current_thread_id], viterbiMatrix[current_thread_id]);
        }

        if(par.template_exclstr) {
          // Mask excluded template regions
          exclude_template_regions(par.template_exclstr, maxResElem, q_simd, t_hmm_simd[current_thread_id], viterbiMatrix[current_thread_id]);
        }

        // start next job
        threads[current_thread_id]->align(maxResElem, par.nseqdis, par.smin);
      } // idb loop

      // merge thread results
      // search hits for next alignment
      HH_LOG(INFO) << (seqJunkStart + seqJunkSize) << " alignments done" << std::endl;

      merge_thread_results(ret_hits, dbfiles_to_align, excludeAlignments, threads, alignment, par.smin);

      for (unsigned int thread = 0; thread < threads.size(); thread++) {
        threads[thread]->clear();
      }

      if ( alignment == 0  && par.early_stopping_filter )
      {
        float early_stopping_sum = calculateEarlyStop(par, q, ret_hits, seqJunkStart);
        float filter_cutoff = seqJunkSize * par.filter_thresh;

        if( early_stopping_sum < filter_cutoff){
          HH_LOG(INFO) << "Stop after DB-HHM: " << (seqJunkStart + seqJunkSize) << " because early stop  "
              << early_stopping_sum << " < filter cutoff " << filter_cutoff << "\n";
          break; // stop junk loop and just find alternative alignments
        }
      }
    } // junk loop

    // Erase the entries that were queued at the start of this round.
    // NOTE(review): presumably merge_thread_results() appends the candidates
    // for the next round behind them - confirm against its implementation.
    dbfiles_to_align.erase(dbfiles_to_align.begin(),
        dbfiles_to_align.begin() + allElementToAlignCount);
  }  // Alignment loop

  // clean memory
  for (int thread_id = 0; thread_id < thread_count; thread_id++) {
    delete t_hmm_simd[thread_id];
    delete threads[thread_id];
  }
  threads.clear();
  delete[] t_hmm_simd;

  for(size_t i = 0; i < HMMSimd::VEC_SIZE * thread_count; i++) {
    delete t_hmm[i];
  }
  t_hmm.clear();

  return ret_hits;
}