int main( int argc, char *argv[] ) { short error; error = G2A( argc, argv ); if( error ) ILError( error, "G2A" ); return 0; }
void BuildIndex(const vector<string>& chrom_files, const int& indicator, const string& output_file, uint32_t& size_of_index) { switch (indicator) { case 0: fprintf(stderr, "[BIULD INDEX FOR FORWARD STRAND (C->T)]\n"); break; case 1: fprintf(stderr, "[BIULD INDEX FOR REVERSE STRAND (C->T)]\n"); break; case 2: fprintf(stderr, "[BIULD INDEX FOR FORWARD STRAND (G->A)]\n"); break; case 3: fprintf(stderr, "[BIULD INDEX FOR REVERSE STRAND (G->A)]\n"); } Genome genome; HashTable hash_table; ReadGenome(chrom_files, genome); if (indicator % 2) { ReverseComplementGenome(genome); } if (indicator == 0 || indicator == 1) { C2T(genome.sequence); } else { G2A(genome.sequence); } set<uint32_t> extremal_large_bucket; CountBucketSize(genome, hash_table, extremal_large_bucket); HashToBucket(genome, hash_table, extremal_large_bucket); SortHashTableBucket(genome, hash_table); WriteIndex(output_file, genome, hash_table); size_of_index = hash_table.index_size > size_of_index ? hash_table.index_size : size_of_index; }
void PairEndMapping(const string& org_read, const Genome& genome, const HashTable& hash_table, const char& strand, const bool& AG_WILDCARD, const uint32_t& max_mismatches, TopCandidates& top_match) { uint32_t read_len = org_read.size(); if (read_len < MINIMALREADLEN) { return; } /* return the maximal seed length for a particular read length */ uint32_t seed_pattern_repeats = (read_len - SEEPATTERNLEN + 1) / SEEPATTERNLEN; uint32_t seed_len = seed_pattern_repeats * SEEPATTERNCAREDWEIGHT; string read; if (AG_WILDCARD) { G2A(org_read, read_len, read); } else { C2T(org_read, read_len, read); } uint32_t cur_max_mismatches = max_mismatches; for (uint32_t seed_i = 0; seed_i < SEEPATTERNLEN; ++seed_i) { /* all exact matches are covered by the first seed */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 0 && seed_i) break; #if defined SEEDPATTERN3 || SEEDPATTERN5 /* all matches with 1 mismatch are covered by the first two seeds */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1 && seed_i >= 2) break; #endif #ifdef SEEDPATTERN7 /* all matches with 1 mismatch are covered by the first two seeds */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1 && seed_i >= 4) break; #endif string read_seed = read.substr(seed_i); uint32_t hash_value = getHashValue(read_seed.c_str()); pair<uint32_t, uint32_t> region; region.first = hash_table.counter[hash_value]; region.second = hash_table.counter[hash_value + 1]; if (region.first == region.second) continue; IndexRegion(read_seed, genome, hash_table, seed_len, region); if (region.second - region.first + 1 > 5000) { continue; } for (uint32_t j = region.first; j <= region.second; ++j) { uint32_t genome_pos = hash_table.index[j]; uint32_t chr_id = getChromID(genome.start_index, genome_pos); if (genome_pos - genome.start_index[chr_id] < seed_i) continue; genome_pos = genome_pos - seed_i; if (genome_pos + read_len >= genome.start_index[chr_id + 1]) continue; /* check the position */ uint32_t num_of_mismatch = 0; uint32_t num_of_nocared = seed_pattern_repeats * SEEPATTERNNOCAREDWEIGHT + seed_i; for (uint32_t p = 0; p < num_of_nocared && num_of_mismatch <= cur_max_mismatches; ++p) { if (genome.sequence[genome_pos + F2NOCAREDPOSITION[seed_i][p]] != read[F2NOCAREDPOSITION[seed_i][p]]) { num_of_mismatch++; } } for (uint32_t p = seed_pattern_repeats * SEEPATTERNLEN + seed_i; p < read_len && num_of_mismatch <= cur_max_mismatches; ++p) { if (genome.sequence[genome_pos + p] != read[p]) { num_of_mismatch++; } } if (num_of_mismatch > max_mismatches) { continue; } top_match.Push(CandidatePosition(genome_pos, strand, num_of_mismatch)); if (top_match.Full()) { cur_max_mismatches = top_match.Top().mismatch; } } } }