StatSingleReads(const bool& _ambiguous, const bool& _unmapped, const string& output_file, const bool& _SAM) : ambiguous(_ambiguous), unmapped(_unmapped), SAM(_SAM) { total_reads = 0; unique_mapped_reads = 0; ambiguous_mapped_reads = 0; unmapped_reads = 0; num_of_short_reads = 0; if (ambiguous && !SAM) { fambiguous = fopen(string(output_file + "_ambiguous").c_str(), "w"); if (!fambiguous) { throw SMITHLABException( "cannot open input file " + string(output_file + "_ambiguous")); } } if (unmapped && !SAM) { funmapped = fopen(string(output_file + "_unmapped").c_str(), "w"); if (!funmapped) { throw SMITHLABException( "cannot open input file " + string(output_file + "_unmapped")); } } }
void WriteIndexHeadInfo(const string& index_file, const Genome& genome, const uint32_t& size_of_index) { fprintf(stderr, "[WRITTING INDEX HEAD TO %s]\n", index_file.c_str()); FILE * fout = fopen(index_file.c_str(), "wb"); if (!fout) { throw SMITHLABException("cannot open input file " + index_file); } uint32_t num_of_chroms = genome.num_of_chroms; fwrite(&num_of_chroms, sizeof(uint32_t), 1, fout); for (uint32_t i = 0; i < num_of_chroms; ++i) { uint32_t chrom_name_len = genome.name[i].size(); if (chrom_name_len > 255) { chrom_name_len = 255; } fwrite(&chrom_name_len, sizeof(uint32_t), 1, fout); fwrite(genome.name[i].c_str(), sizeof(char), chrom_name_len, fout); } fwrite(&(genome.length[0]), sizeof(uint32_t), num_of_chroms, fout); fwrite(&(genome.length_of_genome), sizeof(uint32_t), 1, fout); fwrite(&size_of_index, sizeof(uint32_t), 1, fout); fclose(fout); }
void ReadIndex(const string& index_file, Genome& genome, HashTable& hash_table) { FILE * fin = fopen(index_file.c_str(), "rb"); if (!fin) { throw SMITHLABException("cannot open input file " + index_file); } FREAD_CHECK(fread(&(genome.strand), sizeof(char), 1, fin), 1); FREAD_CHECK( fread(&(genome.sequence[0]), sizeof(char), genome.length_of_genome, fin), genome.length_of_genome); /* read hash table from disk */ FREAD_CHECK(fread(&(hash_table.counter_size), sizeof(uint32_t), 1, fin), 1); FREAD_CHECK(fread(&(hash_table.index_size), sizeof(uint32_t), 1, fin), 1); FREAD_CHECK( fread(&(hash_table.counter[0]), sizeof(uint32_t), hash_table.counter_size + 1, fin), hash_table.counter_size + 1); FREAD_CHECK( fread(&(hash_table.index[0]), sizeof(uint32_t), hash_table.index_size, fin), hash_table.index_size); fclose(fin); }
void WriteIndex(const string& index_file, const Genome& genome, const HashTable& hash_table) { fprintf(stderr, "[WRITTING INDEX TO %s]\n", index_file.c_str()); FILE * fout = fopen(index_file.c_str(), "wb"); if (!fout) { throw SMITHLABException("cannot open input file " + index_file); } fwrite(&(genome.strand), sizeof(char), 1, fout); fwrite(&(genome.sequence[0]), sizeof(char), genome.length_of_genome, fout); /* write hash table to disk */ fwrite(&(hash_table.counter_size), sizeof(uint32_t), 1, fout); fwrite(&(hash_table.index_size), sizeof(uint32_t), 1, fout); fwrite(&(hash_table.counter[0]), sizeof(uint32_t), hash_table.counter_size + 1, fout); fwrite(&(hash_table.index[0]), sizeof(uint32_t), hash_table.index_size, fout); fclose(fout); }
void ReadIndexHeadInfo(const string& index_file, Genome& genome, uint32_t& size_of_index) { FILE * fin = fopen(index_file.c_str(), "rb"); if (!fin) { throw SMITHLABException("cannot open input file " + index_file); } uint32_t num_of_chroms; FREAD_CHECK(fread(&num_of_chroms, sizeof(uint32_t), 1, fin), 1); genome.num_of_chroms = num_of_chroms; genome.name.resize(num_of_chroms); genome.length.resize(num_of_chroms); genome.start_index.resize(num_of_chroms + 1); char chrom_name[256]; uint32_t chrom_name_len; for (uint32_t i = 0; i < num_of_chroms; ++i) { FREAD_CHECK(fread(&chrom_name_len, sizeof(uint32_t), 1, fin), 1); FREAD_CHECK(fread(chrom_name, sizeof(char), chrom_name_len, fin), chrom_name_len); chrom_name[chrom_name_len] = 0; genome.name[i] = chrom_name; } FREAD_CHECK(fread(&(genome.length[0]), sizeof(uint32_t), num_of_chroms, fin), num_of_chroms); FREAD_CHECK(fread(&(genome.length_of_genome), sizeof(uint32_t), 1, fin), 1); genome.start_index[0] = 0; for (uint32_t i = 1; i <= num_of_chroms; ++i) { genome.start_index[i] = genome.start_index[i - 1] + genome.length[i - 1]; } FREAD_CHECK(fread(&size_of_index, sizeof(uint32_t), 1, fin), 1); fclose(fin); }
void ProcessPairedEndReads(const string& command, const string& index_file, const string& reads_file_p1, const string& reads_file_p2, const string& output_file, const uint32_t& n_reads_to_process, const uint32_t& max_mismatches, const string& adaptor, const uint32_t& top_k, const int& frag_range, const bool& ambiguous, const bool& unmapped, const bool& SAM, const int& num_of_threads) { // LOAD THE INDEX HEAD INFO Genome genome; HashTable hash_table; uint32_t size_of_index; ReadIndexHeadInfo(index_file, genome, size_of_index); genome.sequence.resize(genome.length_of_genome); hash_table.counter.resize(power(4, F2SEEDKEYWIGTH) + 1); hash_table.index.resize(size_of_index); vector<vector<string> > index_names(2, vector<string>(2)); index_names[0][0] = index_file + "_CT00"; index_names[0][1] = index_file + "_CT01"; index_names[1][0] = index_file + "_GA10"; index_names[1][1] = index_file + "_GA11"; vector<vector<string> > read_names(2, vector<string>(n_reads_to_process)); vector<vector<string> > read_seqs(2, vector<string>(n_reads_to_process)); vector<vector<string> > read_scores(2, vector<string>(n_reads_to_process)); vector<int> ranked_results_size(2); vector<vector<CandidatePosition> > ranked_results(2, vector<CandidatePosition>(MAX_NUM_EXACT_MAPPED)); vector<vector<TopCandidates> > top_results(2, vector<TopCandidates>(n_reads_to_process)); FILE * fin[2]; fin[0] = fopen(reads_file_p1.c_str(), "r"); if (!fin[0]) { throw SMITHLABException("cannot open input file " + reads_file_p1); } fin[1] = fopen(reads_file_p2.c_str(), "r"); if (!fin[1]) { throw SMITHLABException("cannot open input file " + reads_file_p2); } string adaptors[2]; extract_adaptors(adaptor, adaptors[0], adaptors[1]); clock_t start_t = clock(); FILE * fout = fopen(output_file.c_str(), "w"); if (!fout) { throw SMITHLABException("cannot open input file " + output_file); } uint32_t num_of_reads[2]; StatPairedReads stat_paired_reads(ambiguous, unmapped, output_file, SAM); bool AG_WILDCARD = true; fprintf(stderr, "[MAPPING PAIRED-END READS FROM THE FOLLOWING TWO FILES]\n"); fprintf(stderr, " %s (AND)\n %s\n", reads_file_p1.c_str(), reads_file_p2.c_str()); fprintf(stderr, "[OUTPUT MAPPING RESULTS TO %s]\n", output_file.c_str()); if (SAM) { SAMHead(index_file, command, fout); } omp_set_dynamic(0); omp_set_num_threads(num_of_threads); for (uint32_t i = 0;; i += n_reads_to_process) { num_of_reads[0] = num_of_reads[1] = 0; for (uint32_t pi = 0; pi < 2; ++pi) { // paired end reads _1 and _2 AG_WILDCARD = pi == 1 ? true : false; LoadReadsFromFastqFile(fin[pi], i, n_reads_to_process, adaptors[pi], num_of_reads[pi], read_names[pi], read_seqs[pi], read_scores[pi]); if (num_of_reads[pi] == 0) break; //Initialize the paired results for (uint32_t j = 0; j < num_of_reads[pi]; ++j) { top_results[pi][j].Clear(); top_results[pi][j].SetSize(top_k); } for (uint32_t fi = 0; fi < 2; ++fi) { ReadIndex(index_names[pi][fi], genome, hash_table); char strand = fi == 0 ? '+' : '-'; #pragma omp parallel for for (uint32_t j = 0; j < num_of_reads[pi]; ++j) { PairEndMapping(read_seqs[pi][j], genome, hash_table, strand, AG_WILDCARD, max_mismatches, top_results[pi][j]); } } } if (num_of_reads[0] != num_of_reads[1]) { fprintf(stderr, "The number of reads in paired-end files should be the same.\n"); exit( EXIT_FAILURE); } if (num_of_reads[0] == 0) { break; } stat_paired_reads.total_read_pairs += num_of_reads[0]; /////////////////////////////////////////////////////////// // Merge Paired-end results for (uint32_t j = 0; j < num_of_reads[0]; ++j) { for (uint32_t pi = 0; pi < 2; ++pi) { ranked_results_size[pi] = 0; while (!top_results[pi][j].candidates.empty()) { ranked_results[pi][ranked_results_size[pi]++] = top_results[pi][j].Top(); top_results[pi][j].Pop(); } } MergePairedEndResults(genome, read_names[0][j], read_seqs[0][j], read_scores[0][j], read_seqs[1][j], read_scores[1][j], ranked_results, ranked_results_size, frag_range, max_mismatches, SAM, stat_paired_reads, fout); } if (num_of_reads[0] < n_reads_to_process) break; } fclose(fin[0]); fclose(fin[1]); fclose(fout); OutputPairedStatInfo(stat_paired_reads, output_file); fprintf(stderr, "[MAPPING TAKES %.0lf SECONDS]\n", (double(clock() - start_t) / CLOCKS_PER_SEC)); }