Ejemplo n.º 1
0
//-------------------[         Implementation          ]-------------------//
//-----------< FUNCTION: main >----------------------------------------------
// Purpose:    program entry point
// Parameters: argc, argv - command-line arguments (not used by main itself)
// Returns:    0 if successful
//             nonzero otherwise
//---------------------------------------------------------------------------
int main (int argc, char* argv[])
{
   /* The command line is accepted but never consulted. */
   (void) argc;
   (void) argv;

   /* Load the sequences, then report their frequencies. */
   read_seqs();
   report_freqs();

   /* Diagnostics are only reported when the real user ID is 0. */
   if (getuid() != 0)
   {
      return 0;
   }

   report_diags();
   return 0;
}
Ejemplo n.º 2
0
void align_reads(const char *ref_path,
                 const char *qry_path,
                 const char *output_path,
                 const int32_t match,       /* 2 */
                 const int32_t mismatch,    /* 2 */
                 const int32_t gap_o,       /* 3 */
                 const int32_t gap_e,       /* 1 */
                 const uint8_t n_threads,   /* 1 */
                 const int32_t n_keep,
                 const int32_t max_drop,
                 const char *read_group,
                 const char *read_group_id)
{
    gzFile read_fp, ref_fp;
    FILE *out_fp;
    int32_t j, k, l;
    const int m = 5;
    kseq_t *seq;
    int8_t *mat = (int8_t *)calloc(25, sizeof(int8_t));

    /* This table is used to transform nucleotide letters into numbers. */
    uint8_t table[128] = {
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 0, 4, 1,  4, 4, 4, 2,  4, 4, 4, 4,  4, 4, 4, 4,
        4, 4, 4, 4,  3, 0, 4, 4,  4, 4, 4, 4,  4, 4, 4, 4
    };

    // initialize scoring matrix for genome sequences
    for(l = k = 0; LIKELY(l < 4); ++l) {
        for(j = 0; LIKELY(j < 4); ++j) mat[k++] = l == j ? match : -mismatch;	/* weight_match : -weight_mismatch */
        mat[k++] = 0; // ambiguous base
    }
    for(j = 0; LIKELY(j < 5); ++j) mat[k++] = 0;


    // Read reference sequences
    ref_fp = gzopen(ref_path, "r");
    assert(ref_fp != NULL && "Failed to open reference");
    seq = kseq_init(ref_fp);
    kseq_v ref_seqs;
    ref_seqs = read_seqs(seq, 0);
    kseq_destroy(seq);
    gzclose(ref_fp);

    fprintf(stderr, "[sw_align] Read %lu references\n",
            kv_size(ref_seqs));

    // Print SAM header
    out_fp = fopen(output_path, "w");
    fprintf(out_fp, "@HD\tVN:1.4\tSO:unsorted\n");
    for(size_t i = 0; i < kv_size(ref_seqs); i++) {
        seq = &kv_A(ref_seqs, i);
        fprintf(out_fp, "@SQ\tSN:%s\tLN:%d\n",
                seq->name.s, (int32_t)seq->seq.l);
    }
    if(read_group) {
        fputs(read_group, out_fp);
        fputc('\n', out_fp);
    }

    align_config_t conf;
    conf.gap_o = gap_o;
    conf.gap_e = gap_e;
    conf.m = m;
    conf.table = table;
    conf.mat = mat;
    conf.n_keep = n_keep;
    conf.max_drop = max_drop;

    read_fp = gzopen(qry_path, "r");
    assert(read_fp != NULL && "Failed to open query");
    size_t count = 0;
    seq = kseq_init(read_fp);
    while(true) {
        kseq_v reads = read_seqs(seq, 5000 * n_threads);
        const size_t n_reads = kv_size(reads);
        if(!n_reads) {
            break;
        }

        worker_t *w = calloc(n_threads, sizeof(worker_t));
        kstring_t *sams  = calloc(n_reads, sizeof(kstring_t));
        for(size_t i = 0; i < n_threads; i++) {
            w[i].start = i;
            w[i].n = n_reads;
            w[i].step = n_threads;
            w[i].ref_seqs = ref_seqs;
            w[i].reads = reads;
            w[i].sams = sams;
            w[i].config = &conf;
            w[i].read_group_id = read_group_id;
        }

        if(n_threads == 1) {
            worker(w);
        } else {
            pthread_t *tid = calloc(n_threads, sizeof(pthread_t));
            for(size_t i = 0; i < n_threads; ++i)
                pthread_create(&tid[i], 0, worker, &w[i]);
            for(size_t i = 0; i < n_threads; ++i)
                pthread_join(tid[i], 0);
        }
        free(w);

        for(size_t i = 0; i < n_reads; i++) {
            fputs(sams[i].s, out_fp);
            free(sams[i].s);
        }
        free(sams);
        count += n_reads;
        kv_destroy(reads);
    }
    kseq_destroy(seq);
    fprintf(stderr, "[sw_align] Aligned %lu reads\n", count);

    // Clean up reference sequences
    kvi_destroy(kseq_stack_destroy, ref_seqs);

    gzclose(read_fp);
    fclose(out_fp);
    free(mat);
}
Ejemplo n.º 3
0
// Maps paired-end reads from two FASTQ files and writes the merged results.
//
// Reads are processed in batches of at most n_reads_to_process per file.
// For each batch, mate 1 is mapped against the "_CT00"/"_CT01" index files
// and mate 2 against "_GA10"/"_GA11" (all derived from index_file); the first
// index of each pair is mapped as strand '+', the second as '-'. The per-mate
// candidates are then combined by MergePairedEndResults, which receives the
// output stream. When SAM is true a SAM header is written first.
//
// Parameters:
//   command             passed to SAMHead for the SAM header.
//   index_file          base name of the index files.
//   reads_file_p1/_p2   FASTQ files for mate 1 and mate 2.
//   output_file         results file, truncated and rewritten.
//   n_reads_to_process  batch size (reads loaded per file per iteration).
//   max_mismatches      forwarded to the mapping and merge steps.
//   adaptor             split into one adaptor per mate by extract_adaptors.
//   top_k               capacity set on every per-read candidate container.
//   frag_range          forwarded to MergePairedEndResults.
//   ambiguous, unmapped forwarded to the statistics object.
//   SAM                 selects SAM output.
//   num_of_threads      OpenMP thread count for the mapping loop.
//
// Throws SMITHLABException when an input or output file cannot be opened.
// Terminates the process with EXIT_FAILURE when the two files do not yield
// the same number of reads in a batch.
void ProcessPairedEndReads(const string& command, const string& index_file,
                           const string& reads_file_p1,
                           const string& reads_file_p2,
                           const string& output_file,
                           const uint32_t& n_reads_to_process,
                           const uint32_t& max_mismatches,
                           const string& adaptor, const uint32_t& top_k,
                           const int& frag_range, const bool& ambiguous,
                           const bool& unmapped, const bool& SAM,
                           const int& num_of_threads) {
  // LOAD THE INDEX HEAD INFO
  Genome genome;
  HashTable hash_table;

  uint32_t size_of_index;
  ReadIndexHeadInfo(index_file, genome, size_of_index);
  genome.sequence.resize(genome.length_of_genome);
  hash_table.counter.resize(power(4, F2SEEDKEYWIGTH) + 1);
  hash_table.index.resize(size_of_index);

  // index_names[mate][strand]
  vector<vector<string> > index_names(2, vector<string>(2));
  index_names[0][0] = index_file + "_CT00";
  index_names[0][1] = index_file + "_CT01";
  index_names[1][0] = index_file + "_GA10";
  index_names[1][1] = index_file + "_GA11";

  // Per-mate batch buffers, reused for every batch.
  vector<vector<string> > read_names(2, vector<string>(n_reads_to_process));
  vector<vector<string> > read_seqs(2, vector<string>(n_reads_to_process));
  vector<vector<string> > read_scores(2, vector<string>(n_reads_to_process));

  vector<int> ranked_results_size(2);
  vector<vector<CandidatePosition> > ranked_results(2,
          vector<CandidatePosition>(MAX_NUM_EXACT_MAPPED));

  vector<vector<TopCandidates> > top_results(2,
         vector<TopCandidates>(n_reads_to_process));

  FILE * fin[2];
  fin[0] = fopen(reads_file_p1.c_str(), "r");
  if (!fin[0]) {
    throw SMITHLABException("cannot open input file " + reads_file_p1);
  }
  fin[1] = fopen(reads_file_p2.c_str(), "r");
  if (!fin[1]) {
    fclose(fin[0]);  // do not leak the first stream on the way out
    throw SMITHLABException("cannot open input file " + reads_file_p2);
  }

  string adaptors[2];
  extract_adaptors(adaptor, adaptors[0], adaptors[1]);
  // NOTE(review): clock() measures processor time, so with several OpenMP
  // threads the figure printed at the end can exceed the elapsed wall time.
  clock_t start_t = clock();
  FILE * fout = fopen(output_file.c_str(), "w");
  if (!fout) {
    fclose(fin[0]);
    fclose(fin[1]);
    throw SMITHLABException("cannot open output file " + output_file);
  }
  uint32_t num_of_reads[2];
  StatPairedReads stat_paired_reads(ambiguous, unmapped, output_file, SAM);
  bool AG_WILDCARD = true;
  fprintf(stderr, "[MAPPING PAIRED-END READS FROM THE FOLLOWING TWO FILES]\n");
  fprintf(stderr, "   %s (AND)\n   %s\n", reads_file_p1.c_str(),
          reads_file_p2.c_str());
  fprintf(stderr, "[OUTPUT MAPPING RESULTS TO %s]\n", output_file.c_str());
  if (SAM) {
    SAMHead(index_file, command, fout);
  }
  omp_set_dynamic(0);
  omp_set_num_threads(num_of_threads);
  for (uint32_t i = 0;; i += n_reads_to_process) {
    num_of_reads[0] = num_of_reads[1] = 0;
    for (uint32_t pi = 0; pi < 2; ++pi) {  // paired end reads _1 and _2
      AG_WILDCARD = (pi == 1);
      LoadReadsFromFastqFile(fin[pi], i, n_reads_to_process, adaptors[pi],
                             num_of_reads[pi], read_names[pi], read_seqs[pi],
                             read_scores[pi]);
      // Nothing to map for this mate, but still load the other mate so the
      // count comparison below can detect a longer second file.
      if (num_of_reads[pi] == 0)
        continue;

      //Initialize the paired results
      for (uint32_t j = 0; j < num_of_reads[pi]; ++j) {
        top_results[pi][j].Clear();
        top_results[pi][j].SetSize(top_k);
      }

      for (uint32_t fi = 0; fi < 2; ++fi) {
        ReadIndex(index_names[pi][fi], genome, hash_table);
        const char strand = (fi == 0) ? '+' : '-';
#pragma omp parallel for
        for (uint32_t j = 0; j < num_of_reads[pi]; ++j) {
          PairEndMapping(read_seqs[pi][j], genome, hash_table, strand,
                         AG_WILDCARD, max_mismatches, top_results[pi][j]);
        }
      }
    }
    if (num_of_reads[0] != num_of_reads[1]) {
      fprintf(stderr,
              "The number of reads in paired-end files should be the same.\n");
      exit( EXIT_FAILURE);
    }
    if (num_of_reads[0] == 0) {
      break;
    }
    stat_paired_reads.total_read_pairs += num_of_reads[0];
    ///////////////////////////////////////////////////////////
    // Merge Paired-end results
    for (uint32_t j = 0; j < num_of_reads[0]; ++j) {
      // Drain each mate's candidate container into the flat ranked array.
      for (uint32_t pi = 0; pi < 2; ++pi) {
        ranked_results_size[pi] = 0;
        while (!top_results[pi][j].candidates.empty()) {
          ranked_results[pi][ranked_results_size[pi]++] =
              top_results[pi][j].Top();
          top_results[pi][j].Pop();
        }
      }

      MergePairedEndResults(genome, read_names[0][j], read_seqs[0][j],
                            read_scores[0][j], read_seqs[1][j],
                            read_scores[1][j], ranked_results,
                            ranked_results_size, frag_range, max_mismatches,
                            SAM, stat_paired_reads, fout);
    }

    // A short batch means the input is exhausted.
    if (num_of_reads[0] < n_reads_to_process)
      break;
  }

  fclose(fin[0]);
  fclose(fin[1]);
  fclose(fout);

  OutputPairedStatInfo(stat_paired_reads, output_file);
  fprintf(stderr, "[MAPPING TAKES %.0lf SECONDS]\n",
          (double(clock() - start_t) / CLOCKS_PER_SEC));
}
Ejemplo n.º 4
0
// Entry point: aligns the sequences of two FASTA files against each other.
//
// argv[1] and argv[2] name the two input files. Inside an OpenMP parallel
// region every thread repeatedly fetches a batch of VSIZE sequence pairs with
// read_seqs(), scores the batch with smith_waterman(), then backtracks and
// prints each of the VSIZE alignments to stdout.
//
// Returns 1 when fewer than two file names are given, 0 otherwise.
int main(int argc, char* argv[])
{
    if(argc < 3)
    {
        // Usage errors go to stderr; stdout is the stream the alignments
        // are printed to.
        std::cerr
            << "Error: no se especificaron suficientes archivos de entrada." 
            << std::endl;
        return 1;
    }
    
    std::string filename1 = argv[1];
    std::string filename2 = argv[2];
    
    FASTAReader reader1(filename1);
    FASTAReader reader2(filename2);
    
    reader1.setDefault(0);
    reader2.setDefault(1);
    
    // Scoring parameters handed to smith_waterman().
    int gap_open   = 10;
    int gap_extend =  1;
    int match = 5;
    int mismatch = -4;
    
    #pragma omp parallel
    {
        // Everything declared in this region is private to each thread.
        int seq_len = DEFAULT_SEQ_LEN;
        
        // container vectors for sequences
        Buffer<int16_t> seqs1(seq_len * VSIZE, ALNSIZE);
        Buffer<int16_t> seqs2(seq_len * VSIZE, ALNSIZE);
        
        // containers for ids
        std::vector<std::string> seqs1_ids(VSIZE);
        std::vector<std::string> seqs2_ids(VSIZE);
        
        // lengths of sequences
        int seqs1_len[VSIZE];
        int seqs2_len[VSIZE];
        
        // container for flags, plus per-pair score and end position
        Buffer<int8_t> flags(seq_len * seq_len * VSIZE, ALNSIZE);
        int16_t __attribute((aligned(ALNSIZE))) scores[VSIZE];
        int16_t __attribute((aligned(ALNSIZE))) ipos[VSIZE];
        int16_t __attribute((aligned(ALNSIZE))) jpos[VSIZE];
        
        // F is reset to -inf and H to 0 before every batch
        int16_t inf = gap_open + gap_extend + 1;

        int bsize = 128 * VSIZE;
	
        Buffer<int16_t> F(bsize, ALNSIZE);
        Buffer<int16_t> H(bsize, ALNSIZE);
        
        // alignments
        char aln1[256];
        char aln2[256];
        
        // alignment start position; initialised so sw_backtrack never
        // receives an indeterminate value
        int x0 = 0;
        int y0 = 0;
        
        while(read_seqs(reader1, reader2, &seqs1, &seqs2, seqs1_len, seqs2_len, 
              &seqs1_ids, &seqs2_ids))
        {
            // max sizes: longest sequence of the batch on each side, plus one
            const int max_x = *std::max_element(seqs1_len, seqs1_len + VSIZE) + 1;
            const int max_y = *std::max_element(seqs2_len, seqs2_len + VSIZE) + 1;

            F.clear(-inf);
            H.clear(0);
            // flags is not cleared between batches
	    
            smith_waterman(seqs1.data(), seqs2.data(), match, mismatch, gap_open, gap_extend, 
			   flags.data(), scores, ipos, jpos, max_x, max_y, F.data(), H.data());
            
            for(int i = 0; i < VSIZE; i++)
            {
                sw_backtrack(i, flags.data(), seqs1.data(), seqs2.data(), max_x, max_y,
                    aln1, aln2, ipos[i], jpos[i], x0, y0);
                print_alignment (stdout, seqs1_ids, seqs2_ids, scores, 
                    aln1, aln2, strlen(aln1), i);
            }
        }
    }
    return 0;
}