//-------------------[ Implementation ]-------------------// //-----------< FUNCTION: main >---------------------------------------------- // Purpose: program entry point // Parameters: none // Returns: 0 if successful // nonzero otherwise //--------------------------------------------------------------------------- int main (int argc, char* argv[]) { read_seqs(); report_freqs(); if (getuid() == 0) report_diags(); }
void align_reads(const char *ref_path, const char *qry_path, const char *output_path, const int32_t match, /* 2 */ const int32_t mismatch, /* 2 */ const int32_t gap_o, /* 3 */ const int32_t gap_e, /* 1 */ const uint8_t n_threads, /* 1 */ const int32_t n_keep, const int32_t max_drop, const char *read_group, const char *read_group_id) { gzFile read_fp, ref_fp; FILE *out_fp; int32_t j, k, l; const int m = 5; kseq_t *seq; int8_t *mat = (int8_t *)calloc(25, sizeof(int8_t)); /* This table is used to transform nucleotide letters into numbers. */ uint8_t table[128] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; // initialize scoring matrix for genome sequences for(l = k = 0; LIKELY(l < 4); ++l) { for(j = 0; LIKELY(j < 4); ++j) mat[k++] = l == j ? match : -mismatch; /* weight_match : -weight_mismatch */ mat[k++] = 0; // ambiguous base } for(j = 0; LIKELY(j < 5); ++j) mat[k++] = 0; // Read reference sequences ref_fp = gzopen(ref_path, "r"); assert(ref_fp != NULL && "Failed to open reference"); seq = kseq_init(ref_fp); kseq_v ref_seqs; ref_seqs = read_seqs(seq, 0); kseq_destroy(seq); gzclose(ref_fp); fprintf(stderr, "[sw_align] Read %lu references\n", kv_size(ref_seqs)); // Print SAM header out_fp = fopen(output_path, "w"); fprintf(out_fp, "@HD\tVN:1.4\tSO:unsorted\n"); for(size_t i = 0; i < kv_size(ref_seqs); i++) { seq = &kv_A(ref_seqs, i); fprintf(out_fp, "@SQ\tSN:%s\tLN:%d\n", seq->name.s, (int32_t)seq->seq.l); } if(read_group) { fputs(read_group, out_fp); fputc('\n', out_fp); } align_config_t conf; conf.gap_o = gap_o; conf.gap_e = gap_e; conf.m = m; conf.table = table; conf.mat = mat; conf.n_keep = n_keep; conf.max_drop = max_drop; read_fp = gzopen(qry_path, "r"); assert(read_fp != NULL && "Failed to open query"); size_t count = 0; seq = kseq_init(read_fp); while(true) { kseq_v reads = read_seqs(seq, 5000 * n_threads); const size_t n_reads = kv_size(reads); if(!n_reads) { break; } worker_t *w = calloc(n_threads, sizeof(worker_t)); kstring_t *sams = calloc(n_reads, sizeof(kstring_t)); for(size_t i = 0; i < n_threads; i++) { w[i].start = i; w[i].n = n_reads; w[i].step = n_threads; w[i].ref_seqs = ref_seqs; w[i].reads = reads; w[i].sams = sams; w[i].config = &conf; w[i].read_group_id = read_group_id; } if(n_threads == 1) { worker(w); } else { pthread_t *tid = calloc(n_threads, sizeof(pthread_t)); for(size_t i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); for(size_t i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); } free(w); for(size_t i = 0; i < n_reads; i++) { fputs(sams[i].s, out_fp); free(sams[i].s); } free(sams); count += n_reads; kv_destroy(reads); } kseq_destroy(seq); fprintf(stderr, "[sw_align] Aligned %lu reads\n", count); // Clean up reference sequences kvi_destroy(kseq_stack_destroy, ref_seqs); gzclose(read_fp); fclose(out_fp); free(mat); }
void ProcessPairedEndReads(const string& command, const string& index_file, const string& reads_file_p1, const string& reads_file_p2, const string& output_file, const uint32_t& n_reads_to_process, const uint32_t& max_mismatches, const string& adaptor, const uint32_t& top_k, const int& frag_range, const bool& ambiguous, const bool& unmapped, const bool& SAM, const int& num_of_threads) { // LOAD THE INDEX HEAD INFO Genome genome; HashTable hash_table; uint32_t size_of_index; ReadIndexHeadInfo(index_file, genome, size_of_index); genome.sequence.resize(genome.length_of_genome); hash_table.counter.resize(power(4, F2SEEDKEYWIGTH) + 1); hash_table.index.resize(size_of_index); vector<vector<string> > index_names(2, vector<string>(2)); index_names[0][0] = index_file + "_CT00"; index_names[0][1] = index_file + "_CT01"; index_names[1][0] = index_file + "_GA10"; index_names[1][1] = index_file + "_GA11"; vector<vector<string> > read_names(2, vector<string>(n_reads_to_process)); vector<vector<string> > read_seqs(2, vector<string>(n_reads_to_process)); vector<vector<string> > read_scores(2, vector<string>(n_reads_to_process)); vector<int> ranked_results_size(2); vector<vector<CandidatePosition> > ranked_results(2, vector<CandidatePosition>(MAX_NUM_EXACT_MAPPED)); vector<vector<TopCandidates> > top_results(2, vector<TopCandidates>(n_reads_to_process)); FILE * fin[2]; fin[0] = fopen(reads_file_p1.c_str(), "r"); if (!fin[0]) { throw SMITHLABException("cannot open input file " + reads_file_p1); } fin[1] = fopen(reads_file_p2.c_str(), "r"); if (!fin[1]) { throw SMITHLABException("cannot open input file " + reads_file_p2); } string adaptors[2]; extract_adaptors(adaptor, adaptors[0], adaptors[1]); clock_t start_t = clock(); FILE * fout = fopen(output_file.c_str(), "w"); if (!fout) { throw SMITHLABException("cannot open input file " + output_file); } uint32_t num_of_reads[2]; StatPairedReads stat_paired_reads(ambiguous, unmapped, output_file, SAM); bool AG_WILDCARD = true; fprintf(stderr, "[MAPPING PAIRED-END READS FROM THE FOLLOWING TWO FILES]\n"); fprintf(stderr, " %s (AND)\n %s\n", reads_file_p1.c_str(), reads_file_p2.c_str()); fprintf(stderr, "[OUTPUT MAPPING RESULTS TO %s]\n", output_file.c_str()); if (SAM) { SAMHead(index_file, command, fout); } omp_set_dynamic(0); omp_set_num_threads(num_of_threads); for (uint32_t i = 0;; i += n_reads_to_process) { num_of_reads[0] = num_of_reads[1] = 0; for (uint32_t pi = 0; pi < 2; ++pi) { // paired end reads _1 and _2 AG_WILDCARD = pi == 1 ? true : false; LoadReadsFromFastqFile(fin[pi], i, n_reads_to_process, adaptors[pi], num_of_reads[pi], read_names[pi], read_seqs[pi], read_scores[pi]); if (num_of_reads[pi] == 0) break; //Initialize the paired results for (uint32_t j = 0; j < num_of_reads[pi]; ++j) { top_results[pi][j].Clear(); top_results[pi][j].SetSize(top_k); } for (uint32_t fi = 0; fi < 2; ++fi) { ReadIndex(index_names[pi][fi], genome, hash_table); char strand = fi == 0 ? '+' : '-'; #pragma omp parallel for for (uint32_t j = 0; j < num_of_reads[pi]; ++j) { PairEndMapping(read_seqs[pi][j], genome, hash_table, strand, AG_WILDCARD, max_mismatches, top_results[pi][j]); } } } if (num_of_reads[0] != num_of_reads[1]) { fprintf(stderr, "The number of reads in paired-end files should be the same.\n"); exit( EXIT_FAILURE); } if (num_of_reads[0] == 0) { break; } stat_paired_reads.total_read_pairs += num_of_reads[0]; /////////////////////////////////////////////////////////// // Merge Paired-end results for (uint32_t j = 0; j < num_of_reads[0]; ++j) { for (uint32_t pi = 0; pi < 2; ++pi) { ranked_results_size[pi] = 0; while (!top_results[pi][j].candidates.empty()) { ranked_results[pi][ranked_results_size[pi]++] = top_results[pi][j].Top(); top_results[pi][j].Pop(); } } MergePairedEndResults(genome, read_names[0][j], read_seqs[0][j], read_scores[0][j], read_seqs[1][j], read_scores[1][j], ranked_results, ranked_results_size, frag_range, max_mismatches, SAM, stat_paired_reads, fout); } if (num_of_reads[0] < n_reads_to_process) break; } fclose(fin[0]); fclose(fin[1]); fclose(fout); OutputPairedStatInfo(stat_paired_reads, output_file); fprintf(stderr, "[MAPPING TAKES %.0lf SECONDS]\n", (double(clock() - start_t) / CLOCKS_PER_SEC)); }
int main(int argc, char* argv[]) { if(argc < 3) { std::cout << "Error: no se especificaron suficientes archivos de entrada." << std::endl; return 1; } std::string filename1 = argv[1]; std::string filename2 = argv[2]; FASTAReader reader1(filename1); FASTAReader reader2(filename2); reader1.setDefault(0); reader2.setDefault(1); //matriz de sustitucion int smatrix[]{ 5, -4, -4, -4, -4, 5, -4, -4, -4, -4, 5, -4, -4, -4, -4, 5}; int gap_open = 10; int gap_extend = 1; int match = 5; int mismatch = -4; #pragma omp parallel { int seq_len = DEFAULT_SEQ_LEN; //container vectors for sequences Buffer<int16_t> seqs1(seq_len * VSIZE, ALNSIZE); Buffer<int16_t> seqs2(seq_len * VSIZE, ALNSIZE); //containers for ids std::vector<std::string> seqs1_ids(VSIZE); std::vector<std::string> seqs2_ids(VSIZE); //legths of sequences int seqs1_len[VSIZE]; int seqs2_len[VSIZE]; //containter for flags Buffer<int8_t> flags(seq_len * seq_len * VSIZE, ALNSIZE); int16_t __attribute((aligned(ALNSIZE))) scores[VSIZE]; int16_t __attribute((aligned(ALNSIZE))) ipos[VSIZE]; int16_t __attribute((aligned(ALNSIZE))) jpos[VSIZE]; //containers for arrays int16_t inf = gap_open + gap_extend + 1; //int16_t aF[256 * VSIZE] __attribute((aligned(ALNSIZE))) = {(int16_t)(-inf)}; //int16_t aH[256 * VSIZE] __attribute((aligned(ALNSIZE))) = {0}; int bsize = 128 * VSIZE; //Buffer<int16_t> E(bsize, ALNSIZE); Buffer<int16_t> F(bsize, ALNSIZE); Buffer<int16_t> H(bsize, ALNSIZE); //int16_t __attribute((aligned(ALNSIZE))) H[128 * VSIZE]; //alignments char aln1[256]; char aln2[256]; //max sizes int max_x, max_y; //alignment start position int x0, y0; while(read_seqs(reader1, reader2, &seqs1, &seqs2, seqs1_len, seqs2_len, &seqs1_ids, &seqs2_ids)) { max_x = *std::max_element(seqs1_len, seqs1_len + VSIZE) + 1; max_y = *std::max_element(seqs2_len, seqs2_len + VSIZE) + 1; //E.clear(-inf); F.clear(-inf); H.clear(0); //flags.clear(0); smith_waterman(seqs1.data(), seqs2.data(), match, mismatch, gap_open, gap_extend, flags.data(), scores, ipos, jpos, max_x, max_y, F.data(), H.data()); for(int i = 0; i < VSIZE; i++) { //std::cout << scores[i] << std::endl; //std::cout << ipos[i] << std::endl; //std::cout << jpos[i] << std::endl; sw_backtrack(i, flags.data(), seqs1.data(), seqs2.data(), max_x, max_y, aln1, aln2, ipos[i], jpos[i], x0, y0); //puts(aln1); //puts(aln2); print_alignment (stdout, seqs1_ids, seqs2_ids, scores, aln1, aln2, strlen(aln1), i); } } } return 0; }