Exemplo n.º 1
0
// Opens the per-rank FFindex output pair "<par>.ffdata.<rank>" and
// "<par>.ffindex.<rank>" and appends a descriptor for it to outDatabases.
//
// Nothing happens when par is the empty string. On any failure a warning is
// logged, every handle opened so far is closed again and outDatabases is left
// unchanged.
//
//   par           base name of the output database ("" = nothing to do)
//   mpi_rank      number appended to both file names
//   print         callback stored in the descriptor's print member
//   outDatabases  receives a copy of the descriptor on success
void makeOutputFFIndex(char *par, const int mpi_rank,
                       void (*print)(HHblits &, std::stringstream &),
                       std::vector<OutputFFIndex> &outDatabases) {
    if (!*par) {
        return;
    }

    OutputFFIndex db;

    // NOTE(review): db.base is declared outside this view; this copy is only
    // safe if it can hold strlen(par) + 1 bytes -- confirm against the struct.
    strcpy(db.base, par);
    db.offset = 0;
    db.print = print;

    char data_filename_out_rank[NAMELEN];
    char index_filename_out_rank[NAMELEN];

    // Bound the formatting by the real buffer size (the buffers are NAMELEN
    // bytes, not FILENAME_MAX) and refuse names that would be truncated.
    const int data_len = snprintf(data_filename_out_rank,
                                  sizeof(data_filename_out_rank),
                                  "%s.ffdata.%d", par, mpi_rank);
    const int index_len = snprintf(index_filename_out_rank,
                                   sizeof(index_filename_out_rank),
                                   "%s.ffindex.%d", par, mpi_rank);

    if (data_len < 0 || index_len < 0
        || static_cast<size_t>(data_len) >= sizeof(data_filename_out_rank)
        || static_cast<size_t>(index_len) >= sizeof(index_filename_out_rank)) {
        HH_LOG(WARNING) << "Output database name too long: " << par << std::endl;
        return;
    }

    db.data_fh = fopen(data_filename_out_rank, "w+");
    if (db.data_fh == NULL) {
        HH_LOG(WARNING) << "Could not open datafile " << data_filename_out_rank << std::endl;
        return;
    }

    db.index_fh = fopen(index_filename_out_rank, "w+");
    if (db.index_fh == NULL) {
        HH_LOG(WARNING) << "Could not open indexfile " << index_filename_out_rank << std::endl;
        // Do not leak the data file handle that was opened successfully.
        fclose(db.data_fh);
        return;
    }

    outDatabases.push_back(db);
}
Exemplo n.º 2
0
  // Aborts the program when the loaded cs database appears to be in the old
  // format.
  //
  // Looks at the first byte of the first min(nr_checks, num_dbs) entries in
  // `first`. If every inspected entry starts with '>', the error messages
  // below are logged and the process exits with status 1. When there is
  // nothing to inspect (nr_checks == 0 or an empty database) no conclusion is
  // drawn and the function simply returns.
  void Prefilter::checkCSFormat(size_t nr_checks) {
    // Fix the number of inspected entries before the loop. Decrementing the
    // counter that also bounds the loop made the bound shrink while iterating,
    // so the "all entries are old format" condition could not be reached for
    // any nr_checks >= 2.
    const size_t checked = std::min(nr_checks, num_dbs);

    size_t old_format_entries = 0;
    for (size_t n = 0; n < checked; n++) {
      if (first[n][0] == '>') {
        old_format_entries++;
      }
    }

    if (checked > 0 && old_format_entries == checked) {
      HH_LOG(ERROR) << "In " << __FILE__ << ":" << __LINE__ << ": " << __func__ << ":" << std::endl;
      HH_LOG(ERROR) << "\tYour cs database is in an old format!" << std::endl;
      HH_LOG(ERROR) << "\tThis format is no longer supportet!" << std::endl;
      HH_LOG(ERROR) << "\tCorrespond to the user manual!" << std::endl;
      exit(1);
    }
  }
Exemplo n.º 3
0
// Supplies a default result file name when the caller did not set one.
//
// If par.outfile is empty, RemoveExtension(par.outfile, par.infile) is called
// (presumably storing the input name without its extension -- the helper is
// defined elsewhere), ".hhr" is appended and the chosen name is logged at
// INFO level. A non-empty par.outfile is left untouched.
void checkOutput(Parameters& par) {
  const bool outfile_given = (par.outfile[0] != '\0');
  if (outfile_given) {
    return;
  }

  RemoveExtension(par.outfile, par.infile);
  strcat(par.outfile, ".hhr");

  HH_LOG(INFO) << "Search results will be written to " << par.outfile << "\n";
}
Exemplo n.º 4
0
// Expands a packed lower-triangular BLOSUM table into the full symmetric
// 20x20 matrix P.
//
//   matrix    BLOSUM number (e.g. 62); only used for the debug message
//   BlosumXX  packed lower triangle including the diagonal, row by row
//             (row a contributes a+1 values, 210 values are read in total)
//   pb        20 entries, each reset to 0.0f here
//   P         output matrix; every element is written
void SetBlosumMatrix(const char matrix, const float BlosumXX[], float* pb, float P[20][20])
{
  // `matrix` is a char: cast it so the number is logged, not the character
  // with that code (62 would otherwise be printed as '>').
  HH_LOG(DEBUG) << "Using the BLOSUM " << static_cast<int>(matrix) << " matrix" << std::endl;

  // Copy the packed lower triangle (including the diagonal).
  int n = 0;
  for (int a = 0; a < 20; ++a) {
    pb[a] = 0.0f;
    for (int b = 0; b <= a; ++b, ++n)
      P[a][b] = BlosumXX[n];
  }

  // Mirror it into the upper triangle.
  for (int a = 0; a < 19; ++a)
    for (int b = a + 1; b < 20; ++b)
      P[a][b] = P[b][a];
}
Exemplo n.º 5
0
  ///////////////////////////////////////////////////////////////////////////////////////////////////
// Pull out all names from prefilter db file and copy into dbfiles_new for full HMM-HMM comparison
///////////////////////////////////////////////////////////////////////////////////////////////////
  // Appends one (entry length, entry name) pair to prefiltered_entries for
  // every entry of cs219_database's index, i.e. no entry is filtered out, and
  // logs the resulting size of prefiltered_entries at INFO level.
  //
  //   cs219_database       database whose db_index is enumerated
  //   prefiltered_entries  output list; existing content is kept
  void Prefilter::init_no_prefiltering(FFindexDatabase* cs219_database,
      std::vector<std::pair<int, std::string> >& prefiltered_entries) {
    ffindex_index_t* db_index = cs219_database->db_index;

    // The final size is known up front, so avoid repeated reallocation.
    prefiltered_entries.reserve(prefiltered_entries.size() + db_index->n_entries);

    for (size_t n = 0; n < db_index->n_entries; n++) {
      ffindex_entry_t* entry = ffindex_get_entry_by_index(db_index, n);

      // Construct the pair directly: std::make_pair must not be given
      // explicit template arguments (since C++11 its parameters are
      // forwarding references, which makes that form fragile). The length is
      // narrowed to int explicitly because that is the element type.
      prefiltered_entries.push_back(
          std::pair<int, std::string>(static_cast<int>(entry->length),
              std::string(entry->name)));
    }

    HH_LOG(INFO) << "Searching " << prefiltered_entries.size()
        << " database HHMs without prefiltering" << std::endl;
  }
Exemplo n.º 6
0
int main(int argc, char **argv) {
  Parameters par;
  HHblits::ProcessAllArguments(argc, argv, par);
  checkOutput(par);

  std::vector<HHblitsDatabase*> databases;
  HHblits::prepareDatabases(par, databases);
#ifdef OPENMP
  omp_set_num_threads(par.threads);
#endif
  HHblits hhblits(par, databases);

  FILE* inf;
  if(strcmp(par.infile, "stdin") == 0) {
	  inf = stdin;
  }
  else {
	  inf = fopen(par.infile, "r");
  }

  if(!inf) {
	  HH_LOG(ERROR) << "Input file (" << par.infile << ") could not be opened!" << std::endl;
	  exit(1);
  }

  hhblits.run(inf, par.infile);
  fclose(inf);

  if(Log::reporting_level() >= INFO) {
    hhblits.printHitList();
  }

  hhblits.writeHHRFile(par.outfile);
  hhblits.writeAlisFile(par.alisbasename);
  hhblits.writeScoresFile(par.scorefile);
  hhblits.writeM8(par.m8file);
  hhblits.writePairwiseAlisFile(par.pairwisealisfile, par.outformat);
  hhblits.writeAlitabFile(par.alitabfile);
  hhblits.writePsiFile(par.psifile);
  hhblits.writeHMMFile(par.hhmfile);
  hhblits.writeA3MFile(par.alnfile);
  hhblits.writeMatricesFile(par.matrices_output_file);

  for(size_t i = 0; i < databases.size(); i++) {
    delete databases[i];
  }
  databases.clear();
}
Exemplo n.º 7
0
//////////////////////////////////////////////////////////////
// Reading in column state sequences for prefiltering
//////////////////////////////////////////////////////////////
  // Builds the per-entry lookup arrays used for prefiltering from
  // cs219_database: for each index entry, `first` gets the pointer returned by
  // ffindex_get_data_by_entry, `length` gets the entry length minus one and
  // `dbnames` gets a newly allocated copy of the entry name. Afterwards the
  // format check is run with 5 checks and the entry count is logged.
  void Prefilter::init_prefilter(FFindexDatabase* cs219_database) {
    ffindex_index_t* index = cs219_database->db_index;

    // One slot per database entry in each of the three arrays.
    num_dbs = index->n_entries;
    first = (unsigned char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(unsigned char*));
    length = (int*) mem_align(ALIGN_FLOAT, num_dbs * sizeof(int));
    dbnames = (char**) mem_align(ALIGN_FLOAT, num_dbs * sizeof(char*));

    for (size_t slot = 0; slot < num_dbs; ++slot) {
      ffindex_entry_t* e = ffindex_get_entry_by_index(index, slot);

      first[slot] = (unsigned char*) ffindex_get_data_by_entry(
          cs219_database->db_data, e);
      length[slot] = e->length - 1;

      // Keep a private, NUL-terminated copy of the entry name.
      char* name_copy = new char[strlen(e->name) + 1];
      strcpy(name_copy, e->name);
      dbnames[slot] = name_copy;
    }

    // Reject databases that are not in the new binary cs219 format.
    checkCSFormat(5);

    HH_LOG(INFO) << "Searching " << num_dbs
        << " column state sequences." << std::endl;
  }
Exemplo n.º 8
0
//// Processing input options from command line
// Parses the command line stored in par (par.argc / par.argv) and writes the
// recognised options into par.
//
// Options with a required file name (-i, -o, -a) terminate the program with
// exit status 4 when the name is missing; -i additionally rejects a name that
// starts with '-'. -h / --help prints the help text and exits with status 0.
// Unknown options, and value options that appear as the last argument, only
// produce a warning.
void ProcessArguments(Parameters& par) {
  const int argc = par.argc;
  const char** argv = par.argv;

  // Read command line options
  for (int i = 1; i <= argc - 1; i++) {
    HH_LOG(DEBUG1) << i << "  " << argv[i] << std::endl;
    if (!strcmp(argv[i], "-i")) {
      if (++i > argc - 1 || argv[i][0] == '-') {
        // The message used to name "-f", which is not the option being parsed.
        HH_LOG(ERROR) << "No input file following -i" << std::endl;
        exit(4);
      }
      else {
        // NOTE(review): unchecked copy of user input; par.infile's capacity
        // is declared outside this view -- confirm it cannot overflow.
        strcpy(par.infile, argv[i]);
      }
    } else if (!strcmp(argv[i], "-o")) {
      // -o: write to the output file (append mode off)
      par.append = 0;
      if (++i > argc - 1) {
        HH_LOG(ERROR) << "No output file following -o" << std::endl;
        exit(4);
      }
      else
        strcpy(par.outfile, argv[i]);
    } else if (!strcmp(argv[i], "-a")) {
      // -a: same target field as -o, but with append mode on
      par.append = 1;
      if (++i > argc - 1) {
        HH_LOG(ERROR) << "No output file following -a" << std::endl;
        exit(4);
      }
      else
        strcpy(par.outfile, argv[i]);
    } else if (!strcmp(argv[i], "-v") && (i + 1 < argc) && argv[i + 1][0] != '-') {
      // The verbosity takes effect immediately for the remaining options.
      int v = atoi(argv[++i]);
      par.v = Log::from_int(v);
      Log::reporting_level() = par.v;
    }
    else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1))
      par.maxseq = atoi(argv[++i]);
    else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) {
      par.maxres = atoi(argv[++i]);
      par.maxcol = par.maxres * 2;
    } else if (!strcmp(argv[i], "-id") && (i < argc - 1))
      par.max_seqid = atoi(argv[++i]);
    else if (!strcmp(argv[i], "-qid") && (i < argc - 1))
      par.qid = atoi(argv[++i]);
    else if (!strcmp(argv[i], "-qsc") && (i < argc - 1))
      par.qsc = atof(argv[++i]);
    else if (!strcmp(argv[i], "-cov") && (i < argc - 1))
      par.coverage = atoi(argv[++i]);
    else if (!strcmp(argv[i], "-diff") && (i < argc - 1))
      par.Ndiff = atoi(argv[++i]);
    else if (!strcmp(argv[i], "-neff") && (i < argc - 1))
      par.Neff = atof(argv[++i]);
    else if (!strcmp(argv[i], "-Neff") && (i < argc - 1))
      par.Neff = atof(argv[++i]);
    else if (!strcmp(argv[i], "-M") && (i < argc - 1)) {
      // -M a2m|a3m -> M=1, -M first -> M=3, -M <number> -> M=2 with Mgaps
      if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
        par.M = 1;
      else if (!strcmp(argv[i], "first"))
        par.M = 3;
      else if (argv[i][0] >= '0' && argv[i][0] <= '9') {
        par.Mgaps = atoi(argv[i]);
        par.M = 2;
      }
      else
        HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
    } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
      help(par);
      exit(0);
    } else {
      HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
    }
  }
}
Exemplo n.º 9
0
// Program entry point: reads an alignment from par.infile (or standard input
// when the name is "stdin"), filters it according to the parsed options and
// writes the result to par.outfile. Exits with status 4 when the input or
// output file name is missing.
int main(int argc, const char **argv) {
  Parameters par(argc, argv);

  strcpy(par.infile, "");
  strcpy(par.outfile, "");

  // no filtering for maximum diversity (default; -diff overrides it)
  par.Ndiff = 0;

  ProcessArguments(par);

  // maximum number of sequences to be written. This is derived from
  // par.maxseq, so it has to be computed after the command line was parsed;
  // otherwise a -maxseq option would not be reflected here.
  par.nseqdis = par.maxseq - 1;

  // Check command line input and default values
  if (!*par.infile) {
    help(par);
    HH_LOG(ERROR) << "Input file is missing!" << std::endl;
    exit(4);
  }
  if (!*par.outfile) {
    help(par);
    HH_LOG(ERROR) << "Output file is missing!" << std::endl;
    exit(4);
  }

  HH_LOG(INFO) << "Input file = " << par.infile << "\n";
  HH_LOG(INFO) << "Output file = " << par.outfile << "\n";

  // Reads in an alignment from par.infile into matrix X[k][l] as ASCII
  FILE* inf = NULL;
  if (strcmp(par.infile, "stdin")) {
    inf = fopen(par.infile, "r");
    if (!inf) {
      OpenFileError(par.infile, __FILE__, __LINE__, __func__);
    }
  }
  else {
    inf = stdin;
  }

  Alignment qali(par.maxseq, par.maxres);
  qali.Read(inf, par.infile, par.mark, par.maxcol, par.nseqdis);
  fclose(inf);

  // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
  // and store marked sequences in name[k] and seq[k]
  qali.Compress(par.infile, par.cons, par.maxcol, par.M, par.Mgaps);

  // substitution matrix flavours
  float __attribute__((aligned(16))) P[20][20];
  float __attribute__((aligned(16))) R[20][20];
  float __attribute__((aligned(16))) Sim[20][20];
  float __attribute__((aligned(16))) S[20][20];
  float __attribute__((aligned(16))) pb[21];
  SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);

  // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
  qali.N_filtered = qali.Filter(par.max_seqid, S, par.coverage, par.qid, par.qsc,par.Ndiff);

  // Tune alignment diversity q.Neff with qsc to value Neff_goal
  if (par.Neff >= 1.0) {
    qali.FilterNeff(par.wg, par.mark, par.cons, par.showcons, par.max_seqid, par.coverage, par.Neff, pb, S, Sim);
  }

  // Write filtered alignment WITH insert states (lower case) to alignment file
  qali.WriteToFile(par.outfile, par.append);

  return 0;
}
Exemplo n.º 10
0
/////////////////////////////////////////////////////////////////////////////////////
// Set (global variable) substitution matrix with derived matrices and background frequencies
/////////////////////////////////////////////////////////////////////////////////////
void SetSubstitutionMatrix(const char matrix, float* pb, float P[20][20], float R[20][20], float S[20][20], float Sim[20][20])
{
  int a,b;
  switch (matrix)
    {
    default:
    case 0:  //Gonnet matrix
      HH_LOG(DEBUG) << "Using the Gonnet matrix" << std::endl;
      for (a=0; a<20; ++a)
        for (pb[a]=0.0f, b=0; b<20; ++b)
          P[a][b] = 0.000001f*Gonnet[a*20+b];
            break;
    case 30:  //BLOSUM30
      SetBlosumMatrix(matrix, Blosum30, pb, P);
      break;
    case 40:  //BLOSUM40
      SetBlosumMatrix(matrix, Blosum40, pb, P);
      break;
    case 50:  //BLOSUM50
      SetBlosumMatrix(matrix, Blosum50, pb, P);
      break;
    case 62:  //BLOSUM62
      SetBlosumMatrix(matrix, Blosum62, pb, P);
      break;
    case 65:  //BLOSUM65
      SetBlosumMatrix(matrix, Blosum65, pb, P);
      break;
    case 80:  //BLOSUM80
      SetBlosumMatrix(matrix, Blosum80, pb, P);
      break;
   }
  
  // Check transition probability matrix, renormalize P and calculate pb[a]
  float sumab=0.0f;
  for (a=0; a<20; a++)
    for (b=0; b<20; ++b) sumab+=P[a][b];
  for (a=0; a<20; a++)
    for (b=0; b<20; ++b) P[a][b]/=sumab;
  for (a=0; a<20; a++)
    for (pb[a]=0.0f, b=0; b<20; ++b) pb[a]+=P[a][b];

  //Compute similarity matrix for amino acid pairs (for calculating consensus sequence)
  for (a=0; a<20; ++a)
    for (b=0; b<20; ++b)
      Sim[a][b] = P[a][b]*P[a][b]/P[a][a]/P[b][b];

  //Precompute matrix R for amino acid pseudocounts:
  for (a=0; a<20; ++a)
    for (b=0; b<20; ++b)   
      R[a][b] = P[a][b]/pb[b]; //R[a][b]=P(a|b)
  
  //Precompute matrix R for amino acid pseudocounts:
  for (a=0; a<20; ++a)
    for (b=0; b<20; ++b)   
      S[a][b] = log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b))
  
  // Evaluate sequence identity underlying substitution matrix
  if (Log::reporting_level() >= DEBUG) {
      float id=0.0f;
      float entropy=0.0f; 
      float entropy_pb=0.0f;
      float mut_info=0.0f;
      for (a=0; a<20; ++a) id+=P[a][a];
      for (a=0; a<20; ++a) entropy_pb-=pb[a]*log2(pb[a]);
      for (a=0; a<20; ++a) 
	  for (b=0; b<20; ++b) 
	    {
	      entropy-=P[a][b]*log2(R[a][b]);
	      mut_info += P[a][b]*S[a][b];
	    }
      
      HH_LOG(DEBUG) << "sequence identity = " << 100*id << " %; entropy per column = " << entropy << " bits (out of " << entropy_pb << "); mutual information = " << mut_info << " bits" << std::endl;
  }

  //Debugging: probability matrix and dissimilarity matrix
  if (Log::reporting_level() >= DEBUG1) {
      HH_LOG(DEBUG) << "Check matrix: before renormalization sum P(a,b)= "<<sumab<<"...\n";//PRINT
      HH_LOG(DEBUG) <<"      A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
      HH_LOG(DEBUG) <<"p[] ";
      for (a=0; a<20; a++)  HH_LOG(DEBUG) << 100*pb[a] << " ";
      HH_LOG(DEBUG) <<std::endl<<"\nSubstitution matrix log2( P(a,b)/p(a)/p(b) ) (in bits):\n";
      HH_LOG(DEBUG) <<"      A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
      for (b=0; b<20; b++)
	{
      HH_LOG(DEBUG) << i2aa(b) << "   ";
	  for (a=0; a<20; a++)  HH_LOG(DEBUG) << S[a][b] << " ";
	  HH_LOG(DEBUG) << std::endl;
	}
      HH_LOG(DEBUG) << std::endl << "\nOdds matrix P(a,b)/p(a)/p(b):\n";
      HH_LOG(DEBUG) <<"      A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
      for (b=0; b<20; b++)
	{
    	  HH_LOG(DEBUG) <<i2aa(b)<<"   ";
	  for (a=0; a<20; a++)  HH_LOG(DEBUG) << P[b][a]/pb[a]/pb[b] << " ";
	  HH_LOG(DEBUG) <<std::endl;
	}
      HH_LOG(DEBUG) <<std::endl<<"\nMatrix of conditional probabilities P(a|b) = P(a,b)/p(b) (in %):\n";
      HH_LOG(DEBUG) <<"      A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
      for (b=0; b<20; b++)
	{
      HH_LOG(DEBUG) <<i2aa(b)<<"   ";
	  for (a=0; a<20; a++)  HH_LOG(DEBUG) << 100*R[b][a] << " ";
	  HH_LOG(DEBUG) <<std::endl;
	}
      HH_LOG(DEBUG) <<std::endl<<"\nProbability matrix P(a,b) (in 10E-6):\n";
      HH_LOG(DEBUG) <<"      A     R     N     D     C     Q     E     G     H     I     L     K     M     F     P     S     T     W     Y     V\n";
      for (b=0; b<20; b++)
	{
      HH_LOG(DEBUG) <<i2aa(b)<<"   ";
	  for (a=0; a<20; a++)  HH_LOG(DEBUG) << 1000000*P[b][a] << " ";
	  HH_LOG(DEBUG) <<std::endl;
	}
      HH_LOG(DEBUG) <<std::endl<<"Similarity matrix P(a,b)^2/P(a,a)/P(b,b) (in %):\n";
      HH_LOG(DEBUG) <<"      A    R    N    D    C    Q    E    G    H    I    L    K    M    F    P    S    T    W    Y    V\n";
      for (b=0; b<20; b++)
	{
      HH_LOG(DEBUG) <<i2aa(b)<<"   ";
	  for (a=0; a<20; a++)  HH_LOG(DEBUG) << 100*Sim[b][a] << " ";
	  HH_LOG(DEBUG) <<std::endl;
	}
      HH_LOG(DEBUG) <<std::endl;
  }
}
Exemplo n.º 11
0
////////////////////////////////////////////////////////////////////////
// Main prefilter function
////////////////////////////////////////////////////////////////////////
  // Two-stage prefilter over all num_dbs database sequences.
  //
  // Stage 1 scores every database sequence against the query with
  // ungapped_sse_score and keeps the best-scoring ones; stage 2 scores the
  // survivors with swStripedByte, converts the score to an E-value and keeps
  // those below the thresholds. The surviving database names are appended to
  // new_prefilter_hits, or to old_prefilter_hits when previous_hits already
  // contains "<name without extension>__1".
  //
  //   q_tmp                           query HMM; q_tmp->L is used as the query length
  //   previous_hits                   hash consulted to classify a hit as old or new
  //   threads                         number of per-thread workspaces allocated
  //   prefilter_gap_open              added to prefilter_gap_extend to form gap_init
  //   prefilter_gap_extend            gap extension value passed to swStripedByte
  //   prefilter_score_offset          passed to the query profile and both scoring calls
  //   prefilter_bit_factor            scaling used in the score correction and E-value
  //   prefilter_evalue_thresh         final E-value cut-off of stage 2
  //   prefilter_evalue_coarse_thresh  E-value cut-off for entering the stage 2 hit list
  //   preprefilter_smax_thresh        score cut-off of stage 1
  //   min_prefilter_hits              minimum number of entries kept by each stage
  //                                   before the cut-offs are applied
  //   maxnumdb                        maximum number of stage 2 hits that are processed
  //   R                               not referenced in this function body
  //   new_prefilter_hits              out: (length, name) of hits not in previous_hits
  //   old_prefilter_hits              out: (length, name) of hits found in previous_hits
  void Prefilter::prefilter_db(HMM* q_tmp, Hash<Hit>* previous_hits,
      const int threads, const int prefilter_gap_open,
      const int prefilter_gap_extend, const int prefilter_score_offset,
      const int prefilter_bit_factor, const double prefilter_evalue_thresh,
      const double prefilter_evalue_coarse_thresh,
      const int preprefilter_smax_thresh, const int min_prefilter_hits, const int maxnumdb,
      const float R[20][20],
      std::vector<std::pair<int, std::string> >& new_prefilter_hits,
      std::vector<std::pair<int, std::string> >& old_prefilter_hits) {

    // Remembers database names that were already emitted, so that each name
    // is reported only once.
    Hash<char>* doubled = new Hash<char>;
    doubled->New(16381, 0);

    int element_count = (VECSIZE_INT * 4);
    //W = (LQ+15) / 16;   // band width = LQ/16 rounded up
    int W = (q_tmp->L + (element_count - 1)) / element_count;
    // query profile (states + 1 because of ANY char)
    unsigned char* qc = (unsigned char*)malloc_simd_int((hh::NUMCOLSTATES+1)*(q_tmp->L+element_count)*sizeof(unsigned char));
    stripe_query_profile(q_tmp, prefilter_score_offset, prefilter_bit_factor, W, qc);

    // One scratch buffer per thread, indexed by the OpenMP thread number.
    // NOTE(review): assumes the OpenMP team never has more than `threads`
    // members -- confirm against where the thread count is configured.
    simd_int ** workspace = new simd_int *[threads];

    // (score, database index) pairs produced by stage 1
    std::vector<std::pair<int, int> > first_prefilter;
    // (E-value, database index) pairs produced by stage 2
    std::vector<std::pair<double, int> > hits;

    int count_dbs = 0;
    int gap_init = prefilter_gap_open + prefilter_gap_extend;
    int gap_extend = prefilter_gap_extend;
    int LQ = q_tmp->L;
    const float log_qlen = flog2(LQ);
    // Factor of the E-value computation: number of sequences times query length
    const double factor = (double) num_dbs * LQ;

    for (int i = 0; i < threads; i++)
      workspace[i] = (simd_int*) malloc_simd_int(
          3 * (LQ + element_count) * sizeof(char));

    // ---- Stage 1: ungapped score for every database sequence ----
#pragma omp parallel for schedule(static)
    // Loop over all database sequences
    for (size_t n = 0; n < num_dbs; n++) {
      int thread_id = 0;
#ifdef OPENMP
      thread_id = omp_get_thread_num();
#endif
      // Perform search step
      int score = ungapped_sse_score(qc, LQ, first[n], length[n],
          prefilter_score_offset, workspace[thread_id]);

      // Subtract a correction that grows with the logarithms of the query
      // and database sequence lengths.
      score = score
          - (int) (prefilter_bit_factor * (log_qlen + flog2(length[n])));

      // The result vector is shared between threads, so appending is serialized.
#pragma omp critical
      first_prefilter.push_back(std::pair<int, int>(score, n));
    }
    //filter after calculation of ungapped sse score to include at least min_prefilter_hits
    std::vector<std::pair<int, int> >::iterator it;

    // Order by descending score (ascending sort followed by a reversal).
    sort(first_prefilter.begin(), first_prefilter.end());
    std::reverse(first_prefilter.begin(), first_prefilter.end());

    std::vector<std::pair<int, int> >::iterator first_prefilter_begin_erase =
        first_prefilter.end();
    std::vector<std::pair<int, int> >::iterator first_prefilter_end_erase =
        first_prefilter.end();
    count_dbs = 0;
    // Keep at least min_prefilter_hits entries; after that, cut at the first
    // entry whose score does not exceed preprefilter_smax_thresh.
    for (it = first_prefilter.begin(); it < first_prefilter.end(); it++) {
      if (count_dbs >= min_prefilter_hits
          && (*it).first <= preprefilter_smax_thresh) {
        first_prefilter_begin_erase = it;
        break;
      }
      else {
        count_dbs++;
      }
    }

    first_prefilter.erase(first_prefilter_begin_erase,
        first_prefilter_end_erase);

    HH_LOG(INFO)
        << "HMMs passed 1st prefilter (gapless profile-profile alignment)  : "
        << count_dbs << std::endl;

    // ---- Stage 2: swStripedByte score and E-value for the survivors ----
#pragma omp parallel for schedule(static)
    // Loop over all database sequences
//  for (int n = 0; n < count_dbs; n++) {
    for (size_t i = 0; i < first_prefilter.size(); i++) {
      int thread_id = 0;
#ifdef OPENMP
      thread_id = omp_get_thread_num();
#endif

      int n = first_prefilter[i].second;

      // Perform search step
      // The thread's workspace is passed as three pointers spaced W elements apart.
      int score = swStripedByte(qc, LQ, first[n], length[n], gap_init,
          gap_extend, workspace[thread_id], workspace[thread_id] + W,
          workspace[thread_id] + 2 * W, prefilter_score_offset);

      // NOTE(review): both operands of the division are int, so the quotient
      // is truncated before fpow2 is applied -- confirm that this is intended.
      double evalue = factor * length[n] * fpow2(-score / prefilter_bit_factor);

      if (evalue < prefilter_evalue_coarse_thresh) {
#pragma omp critical
        hits.push_back(std::pair<double, int>(evalue, n));
      }
    }

    //filter after calculation of evalues to include at least min_prefilter_hits
    // Ascending sort: smallest E-values first
    sort(hits.begin(), hits.end());

    std::vector<std::pair<double, int> >::iterator second_prefilter_begin_erase =
        hits.end();
    std::vector<std::pair<double, int> >::iterator second_prefilter_end_erase =
        hits.end();
    std::vector<std::pair<double, int> >::iterator it2;

    count_dbs = 0;
    // Keep at least min_prefilter_hits entries; after that, cut at the first
    // entry whose E-value exceeds prefilter_evalue_thresh.
    for (it2 = hits.begin(); it2 < hits.end(); it2++) {
      if (count_dbs >= min_prefilter_hits
          && (*it2).first > prefilter_evalue_thresh) {
        second_prefilter_begin_erase = it2;
        break;
      }
      else {
        count_dbs++;
      }
    }

    hits.erase(second_prefilter_begin_erase, second_prefilter_end_erase);

    count_dbs = 0;

    // ---- Report the surviving hits, split into old and new ----
    for (it2 = hits.begin(); it2 < hits.end(); it2++) {
      // Add hit to dbfiles
      count_dbs++;
      // NOTE(review): unchecked copy into a NAMELEN buffer; relies on the
      // database names being shorter than NAMELEN -- confirm.
      char db_name[NAMELEN];
      strcpy(db_name, dbnames[(*it2).second]);

      char name[NAMELEN];
      RemoveExtension(name, db_name);

      if (!doubled->Contains(db_name)) {
        doubled->Add(db_name);

        std::pair<int, std::string> result;
        result.first = length[(*it2).second];
        result.second = std::string(db_name);

        // check, if DB was searched in previous rounds
        strcat(name, "__1");  // irep=1

        if (previous_hits->Contains(name)) {
          old_prefilter_hits.push_back(result);
        }
        else {
          new_prefilter_hits.push_back(result);
        }
      }
      // Stop once maxnumdb hits have been processed (duplicates count too).
      if (count_dbs >= maxnumdb)
      {
        HH_LOG(WARNING)
        << "Number of hits passing 2nd prefilter (reduced from " << hits.size() << " to allowed maximum of " << maxnumdb << ").\n"
        <<"You can increase the allowed maximum using the -maxfilt <max> option.\n";
        break;
      }
    }


      
      
    // Free memory
    free(qc);
    for (int i = 0; i < threads; i++)
      free(workspace[i]);
    delete[] workspace;
    if (doubled)
      delete doubled;
  }
Exemplo n.º 12
0
/////////////////////////////////////////////////////////////////////////////////////
//// MAIN PROGRAM
/////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
    char* argv_conf[MAXOPT]; // Input arguments from .hhdefaults file (first=1: argv_conf[0] is not used)
    int argc_conf;               // Number of arguments in argv_conf

    strcpy(par.infile, "");
    strcpy(par.outfile, "");
    strcpy(par.alnfile, "");

    //Default parameter settings
    par.nseqdis = MAXSEQ - 1;        // maximum number of sequences to be written
    par.showcons = 0;
    par.cons = 1;
    par.Ndiff = 0;
    par.max_seqid = 100;
    par.coverage = 0;
    par.pc_hhm_context_engine.pca = 0.0;  // no amino acid pseudocounts
    par.pc_hhm_nocontext_a = 0.0;  // no amino acid pseudocounts
    par.gapb = 0.0; // no transition pseudocounts

    // Make command line input globally available
    par.argv = argv;
    par.argc = argc;
    RemovePathAndExtension(program_name, argv[0]);

    // Enable changing verbose mode before defaults file and command line are processed
    int v = 2;
    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-def"))
            par.readdefaultsfile = 1;
        else if (strcmp(argv[i], "-v") == 0) {
            v = atoi(argv[i + 1]);
        }
    }
    par.v = Log::from_int(v);
    Log::reporting_level() = par.v;

    par.SetDefaultPaths();

    // Read .hhdefaults file?
    if (par.readdefaultsfile) {
        // Process default otpions from .hhconfig file
        ReadDefaultsFile(argc_conf, argv_conf);
        ProcessArguments(argc_conf, argv_conf);
    }

    // Process command line options (they override defaults from .hhdefaults file)
    ProcessArguments(argc, argv);

    Alignment* qali = new Alignment(MAXSEQ, par.maxres);
    HMM* q = new HMM(MAXSEQDIS, par.maxres);        //Create a HMM with maximum of par.maxres match states

    // q is only available after maxres is known, so we had to move this here
    for (int i = 1; i <= argc - 1; i++) {
        if (!strcmp(argv[i], "-name") && (i < argc - 1)) {
            strmcpy(q->name, argv[++i], NAMELEN - 1); //copy longname to name...
            strmcpy(q->longname, argv[i], DESCLEN - 1);   //copy full name to longname
        }
    }

    // Check command line input and default values
    if (!*par.infile) {
        help();
        HH_LOG(ERROR) << "Input file is missing!" << std::endl;
        exit(4);
    }

    // Get basename
    RemoveExtension(q->file, par.infile); //Get basename of infile (w/o extension):

    // Outfile not given? Name it basename.hhm
    if (!*par.outfile && !*par.alnfile) {
        RemoveExtension(par.outfile, par.infile);
        strcat(par.outfile, ".seq");
    }

    // Prepare CS pseudocounts lib
    if (!par.nocontxt && *par.clusterfile) {
        InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine,
                                     pc_hhm_context_mode, pc_prefilter_context_engine,
                                     pc_prefilter_context_mode);
    }

    // Set substitution matrix; adjust to query aa distribution if par.pcm==3
    SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);

    // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
    char input_format = 0;
    ReadQueryFile(par, par.infile, input_format, par.wg, q, qali, pb, S, Sim);

    // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction
    // Add Pseudocounts, if no HMMER input
    if (input_format == 0) {
        // Transform transition freqs to lin space if not already done
        q->AddTransitionPseudocounts(par.gapd, par.gape, par.gapf, par.gapg,
                                     par.gaph, par.gapi, par.gapb, par.gapb);

        // Comput substitution matrix pseudocounts
        if (par.nocontxt) {
            // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
            q->PreparePseudocounts(R);
            // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
            q->AddAminoAcidPseudocounts(par.pc_hhm_nocontext_mode,
                                        par.pc_hhm_nocontext_a, par.pc_hhm_nocontext_b,
                                        par.pc_hhm_nocontext_c);
        }
        else {
            // Add full context specific pseudocounts to query
            q->AddContextSpecificPseudocounts(pc_hhm_context_engine,
                                              pc_hhm_context_mode);
        }
    }
    else {
        q->AddAminoAcidPseudocounts(0, par.pc_hhm_nocontext_a,
                                    par.pc_hhm_nocontext_b, par.pc_hhm_nocontext_c);
    }

    q->CalculateAminoAcidBackground(pb);

    if (par.columnscore == 5 && !q->divided_by_local_bg_freqs)
        q->DivideBySqrtOfLocalBackgroundFreqs(
            par.half_window_size_local_aa_bg_freqs, pb);

    // Write consensus sequence to sequence file
    // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions()
    if (*par.outfile) {
        FILE* outf = NULL;
        if (strcmp(par.outfile, "stdout")) {
            outf = fopen(par.outfile, "a");
            if (!outf)
                OpenFileError(par.outfile, __FILE__, __LINE__, __func__);
        }
        else
            outf = stdout;
        // OLD
        //// ">name_consensus" -> ">name consensus"
        //strsubst(q->sname[q->nfirst],"_consensus"," consensus");
        //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1);
        // NEW (long header needed for NR30cons database)
        fprintf(outf, ">%s\n%s\n", q->longname, q->seq[q->nfirst] + 1);
        fclose(outf);
    }

    // Print A3M/A2M/FASTA output alignment
    if (*par.alnfile) {
        HalfAlignment qa;
        int n = imin(q->n_display,
                     par.nseqdis + (q->nss_dssp >= 0) + (q->nss_pred >= 0)
                     + (q->nss_conf >= 0) + (q->ncons >= 0));
        qa.Set(q->name, q->seq, q->sname, n, q->L, q->nss_dssp, q->nss_pred,
               q->nss_conf, q->nsa_dssp, q->ncons);

        if (par.outformat == 1)
            qa.BuildFASTA();
        else if (par.outformat == 2)
            qa.BuildA2M();
        else if (par.outformat == 3)
            qa.BuildA3M();
        if (qali->readCommentLine)
            qa.Print(par.alnfile, par.append, qali->longname); // print alignment to outfile
        else
            qa.Print(par.alnfile, par.append);   // print alignment to outfile
    }

    delete qali;
    delete q;

    DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine,
                             pc_hhm_context_mode, pc_prefilter_context_engine,
                             pc_prefilter_context_mode);
}
Exemplo n.º 13
0
/////////////////////////////////////////////////////////////////////////////////////
//// Processing input options from command line and .hhdefaults file
/////////////////////////////////////////////////////////////////////////////////////
void ProcessArguments(int argc, char** argv) {
    // Read command line options
    for (int i = 1; i <= argc - 1; i++) {
        HH_LOG(DEBUG1) << i << "  " << argv[i] << std::endl;
        if (!strcmp(argv[i], "-i")) {
            if (++i >= argc || argv[i][0] == '-') {
                help();
                HH_LOG(ERROR) << "No input file following -i" << std::endl;
                exit(4);
            }
            else
                strcpy(par.infile, argv[i]);
        }
        else if (!strcmp(argv[i], "-s")) {
            if (++i >= argc) {
                help();
                HH_LOG(ERROR) << "No output file following -s" << std::endl;
                exit(4);
            }
            else
                strcpy(par.outfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-o")) {
            par.outformat = 3;
            if (++i >= argc) {
                help();
                HH_LOG(ERROR) << "No output file following -o" << std::endl;
                exit(4);
            }
            else
                strcpy(par.alnfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-ofas")) {
            par.outformat = 1;
            if (++i >= argc || argv[i][0] == '-') {
                help();
                HH_LOG(ERROR) << "No output file following -o" << std::endl;
                exit(4);
            }
            else
                strcpy(par.alnfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-oa2m")) {
            par.outformat = 2;
            if (++i >= argc || argv[i][0] == '-') {
                help();
                HH_LOG(ERROR) << "No output file following -o" << std::endl;
                exit(4);
            }
            else
                strcpy(par.alnfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-oa3m")) {
            par.outformat = 3;
            if (++i >= argc || argv[i][0] == '-') {
                help();
                HH_LOG(ERROR) << "No output file following -o" << std::endl;
                exit(4);
            }
            else
                strcpy(par.alnfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
            help();
            exit(0);
        }
        else if (!strcmp(argv[i], "-v") && (i < argc - 1) && argv[i + 1][0] != '-') {
            int v = atoi(argv[++i]);
            par.v = Log::from_int(v);
            Log::reporting_level() = par.v;
        }
        else if (!strcmp(argv[i], "-seq") && (i < argc - 1))
            par.nseqdis = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-id") && (i < argc - 1))
            par.max_seqid = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-qid") && (i < argc - 1))
            par.qid = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-qsc") && (i < argc - 1))
            par.qsc = atof(argv[++i]);
        else if (!strcmp(argv[i], "-cov") && (i < argc - 1))
            par.coverage = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-diff") && (i < argc - 1))
            par.Ndiff = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-M") && (i < argc - 1))
            if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
                par.M = 1;
            else if (!strcmp(argv[i], "first"))
                par.M = 3;
            else if (argv[i][0] >= '0' && argv[i][0] <= '9') {
                par.Mgaps = atoi(argv[i]);
                par.M = 2;
            }
            else
                HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
        else if (!strcmp(argv[i], "-Gonnet"))
            par.matrix = 0;
        else if (!strncmp(argv[i], "-BLOSUM", 7)
                 || !strncmp(argv[i], "-Blosum", 7)) {
            if (!strcmp(argv[i] + 7, "30"))
                par.matrix = 30;
            else if (!strcmp(argv[i] + 7, "40"))
                par.matrix = 40;
            else if (!strcmp(argv[i] + 7, "50"))
                par.matrix = 50;
            else if (!strcmp(argv[i] + 7, "65"))
                par.matrix = 65;
            else if (!strcmp(argv[i] + 7, "80"))
                par.matrix = 80;
            else
                HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
        }
        else if (!strcmp(argv[i], "-pcm") && (i < argc - 1))
            par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]);
        else if (!strcmp(argv[i], "-pca") && (i < argc - 1))
            par.pc_hhm_context_engine.pca = atof(argv[++i]);
        else if (!strcmp(argv[i], "-pcb") && (i < argc - 1))
            par.pc_hhm_context_engine.pcb = atof(argv[++i]);
        else if (!strcmp(argv[i], "-pcc") && (i < argc - 1))
            par.pc_hhm_context_engine.pcc = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) {
            par.gapb = atof(argv[++i]);
            if (par.gapb <= 0.01)
                par.gapb = 0.01;
        }
        else if (!strcmp(argv[i], "-gapd") && (i < argc - 1))
            par.gapd = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gape") && (i < argc - 1))
            par.gape = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gapf") && (i < argc - 1))
            par.gapf = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gapg") && (i < argc - 1))
            par.gapg = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gaph") && (i < argc - 1))
            par.gaph = atof(argv[++i]);
        else if (!strcmp(argv[i], "-gapi") && (i < argc - 1))
            par.gapi = atof(argv[++i]);
        else if (!strcmp(argv[i], "-def"))
            par.readdefaultsfile = 1;
        else if (!strcmp(argv[i], "-maxres") && (i < argc - 1))
            par.maxres = par.maxcol = atoi(argv[++i]);
        else if (!strcmp(argv[i], "-nocontxt"))
            par.nocontxt = 1;
        else if (!strcmp(argv[i], "-csb") && (i < argc - 1))
            par.csb = atof(argv[++i]);
        else if (!strcmp(argv[i], "-csw") && (i < argc - 1))
            par.csw = atof(argv[++i]);
        else if (!strcmp(argv[i], "-cs")) {
            if (++i >= argc || argv[i][0] == '-') {
                help();
                HH_LOG(ERROR) << "No query file following -cs" << std::endl;
                exit(4);
            }
            else
                strcpy(par.clusterfile, argv[i]);
        }
        else if (!strcmp(argv[i], "-name")) {
            // skip this, its handled somewhere else
            ;
        }
        else {
            HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << " ...\n";
        }

        HH_LOG(DEBUG1) << i << "  " << argv[i] << std::endl;
    } // end of for-loop for command line input
}
Exemplo n.º 14
0
// Runs the Viterbi alignment of the query (q_simd) against the database
// entries in dbfiles and returns the accumulated hits.
//
// The work is done in par.altali passes. Each pass walks dbfiles_to_align in
// blocks, loads up to HMMSimd::VEC_SIZE templates at a time into per-thread
// buffers, aligns them, and then merges the per-thread results into ret_hits.
//
// qsc, pb, S and Sim are forwarded to entry->getTemplateHMM; pb and R to
// PrepareTemplateHMM; ssm_mode, S73, S33 and S37 to each ViterbiConsumerThread.
//
// NOTE(review): thread_count and viterbiMatrix are not declared in this
// function - presumably ViterbiRunner members; confirm in the class definition.
std::vector<Hit> ViterbiRunner::alignment(Parameters& par, HMMSimd * q_simd,
    std::vector<HHEntry*> dbfiles, const float qsc, float* pb,
    const float S[20][20], const float Sim[20][20], const float R[20][20], const int ssm_mode,
    const float S73[NDSSP][NSSPRED][MAXCF], const float S33[NSSPRED][MAXCF][NSSPRED][MAXCF],
    const float S37[NSSPRED][MAXCF][NDSSP]) {

    HMM * q = q_simd->GetHMM(0);
    // Initialize memory: HMMSimd::VEC_SIZE template HMMs per thread. Thread k
    // uses the slots [k * VEC_SIZE, (k + 1) * VEC_SIZE) of t_hmm (see
    // current_t_index below).
    std::vector<HMM*> t_hmm;
    for(size_t i = 0; i < HMMSimd::VEC_SIZE * thread_count; i++) {
      HMM* t = new HMM(MAXSEQDIS, par.maxres);
      t_hmm.push_back(t);
    }

    // One HMMSimd buffer and one consumer object per thread id
    HMMSimd** t_hmm_simd = new HMMSimd*[thread_count];
    std::vector<ViterbiConsumerThread *> threads;
    for (int thread_id = 0; thread_id < thread_count; thread_id++) {
        t_hmm_simd[thread_id] = new HMMSimd(par.maxres);
        ViterbiConsumerThread * thread = new ViterbiConsumerThread(thread_id, par, q_simd, t_hmm_simd[thread_id],viterbiMatrix[thread_id], ssm_mode, S73, S33, S37);
        threads.push_back(thread);
    }

    std::vector<Hit> ret_hits;
    std::vector<HHEntry*> dbfiles_to_align;
    std::map<std::string, std::vector<Viterbi::BacktraceResult> > excludeAlignments;
    // For all the databases coming through prefilter
    std::copy(dbfiles.begin(), dbfiles.end(), std::back_inserter(dbfiles_to_align));

    // loop to detect second/third/... best alignments
    for (int alignment = 0; alignment < par.altali; alignment++) {
        HH_LOG(INFO) << "Alternative alignment: " << alignment << std::endl;
        // Snapshot of the number of entries to process in this pass
        unsigned int allElementToAlignCount = dbfiles_to_align.size();
        // By default the whole list is processed as a single block
        unsigned int seqBlockSize = allElementToAlignCount;

        // On the first pass with early stopping enabled, work in blocks of
        // 2000 entries so the early-stop check below runs after each block
        if(alignment == 0 && par.early_stopping_filter){
            seqBlockSize = 2000;
        }

        for(unsigned int seqJunkStart = 0; seqJunkStart <  allElementToAlignCount; seqJunkStart += seqBlockSize ){
            // Sort the current block by length to improve performance;
            // descending order (for better utilisation of threads).
            // The last block may be shorter than seqBlockSize.
            unsigned int seqJunkSize = imin(allElementToAlignCount - (seqJunkStart), seqBlockSize);
            sort(dbfiles_to_align.begin() + seqJunkStart,
                 dbfiles_to_align.begin() + (seqJunkStart + seqJunkSize),
                 HHDatabaseEntryCompare());

            // read in data for thread; each iteration handles up to
            // HMMSimd::VEC_SIZE consecutive entries of the block
#pragma omp parallel for schedule(dynamic, 1)
            for (unsigned int idb = seqJunkStart; idb < (seqJunkStart + seqJunkSize); idb += HMMSimd::VEC_SIZE) {
                // Stays 0 unless the OPENMP macro is defined
                int current_thread_id = 0;
                #ifdef OPENMP
                    current_thread_id = omp_get_thread_num();
                #endif
                // First t_hmm slot belonging to this thread
                const int current_t_index = (current_thread_id * HMMSimd::VEC_SIZE);

                std::vector<HMM *> templates_to_align;

                // read in alignment; number of entries in this batch (fewer
                // than VEC_SIZE only at the end of the block)
                int maxResElem = imin((seqJunkStart + seqJunkSize) - (idb),
                                      HMMSimd::VEC_SIZE);

                for (int i = 0; i < maxResElem; i++) {
                    HHEntry* entry = dbfiles_to_align.at(idb + i);

                    int format_tmp = 0;
                    char wg = 1; // performance reason
                    entry->getTemplateHMM(par, wg, qsc, format_tmp, pb, S, Sim, t_hmm[current_t_index + i]);
                    // Remember which database entry this template HMM came from
                    t_hmm[current_t_index + i]->entry = entry;

                    PrepareTemplateHMM(par, q, t_hmm[current_t_index + i], format_tmp, false, pb, R);
                    templates_to_align.push_back(t_hmm[current_t_index + i]);
                }
                // Hand the loaded batch to this thread's HMMSimd buffer
                t_hmm_simd[current_thread_id]->MapHMMVector(templates_to_align);
                exclude_alignments(maxResElem, q_simd, t_hmm_simd[current_thread_id],
                                   excludeAlignments, viterbiMatrix[current_thread_id]);


                if(par.exclstr) {
                  // Mask excluded regions
                  exclude_regions(par.exclstr, maxResElem, q_simd, t_hmm_simd[current_thread_id], viterbiMatrix[current_thread_id]);
                }

                if(par.template_exclstr) {
                  // Mask excluded template regions
                  exclude_template_regions(par.template_exclstr, maxResElem, q_simd, t_hmm_simd[current_thread_id], viterbiMatrix[current_thread_id]);
                }

                // start next job
                threads[current_thread_id]->align(maxResElem, par.nseqdis, par.smin);
            } // idb loop
            // merge thread results
            // search hits for next alignment
            HH_LOG(INFO) << (seqJunkStart + seqJunkSize) <<  " alignments done" << std::endl;

            merge_thread_results(ret_hits, dbfiles_to_align, excludeAlignments, threads, alignment, par.smin);
            // Reset every consumer before the next block
            for (unsigned int thread = 0; thread < threads.size(); thread++) {
                threads[thread]->clear();
            }

            // Early stopping is only evaluated on the first pass
            if ( alignment == 0  && par.early_stopping_filter )
            {
                float early_stopping_sum = calculateEarlyStop(par, q, ret_hits, seqJunkStart);
                float filter_cutoff = seqJunkSize * par.filter_thresh;

                if( early_stopping_sum < filter_cutoff){
                    HH_LOG(INFO) << "Stop after DB-HHM: " << (seqJunkStart + seqJunkSize) << " because early stop  "
                    << early_stopping_sum << " < filter cutoff " << filter_cutoff << "\n";
                    break; // stop junk loop and just find alternative alignments
                }
            }
        } // junk loop
        // Erase the first allElementToAlignCount elements. These are the
        // elements belonging to the alignment run just finished; new elements
        // for the next run sit after them (presumably appended by
        // merge_thread_results - confirm).
        dbfiles_to_align.erase(dbfiles_to_align.begin(), dbfiles_to_align.begin() + allElementToAlignCount);

    }  // Alignment loop

    // clean memory: release everything allocated with new at the top
    for (int thread_id = 0; thread_id < thread_count; thread_id++) {
        delete t_hmm_simd[thread_id];
        delete threads[thread_id];
    }
    threads.clear();
    delete[] t_hmm_simd;

    for(size_t i = 0; i < HMMSimd::VEC_SIZE * thread_count; i++) {
      delete t_hmm[i];
    }
    t_hmm.clear();

    return ret_hits;
}