Beispiel #1
0
int main( int argc, char *argv[] )
{
	short error;

	error = G2A( argc, argv );
	if( error ) ILError( error, "G2A" );
	
	return 0;
}
Beispiel #2
0
/* Builds one converted-genome index and writes it out.
 *
 * chrom_files   - forwarded to ReadGenome to load the genome.
 * indicator     - selects the variant: values 0 and 1 apply C2T, every other
 *                 value applies G2A; odd values first reverse-complement the
 *                 genome. Values 0..3 also print a progress banner to stderr.
 * output_file   - forwarded to WriteIndex.
 * size_of_index - in/out: raised to hash_table.index_size when that is larger.
 */
void BuildIndex(const vector<string>& chrom_files, const int& indicator,
                const string& output_file, uint32_t& size_of_index) {
  /* Progress banners indexed by indicator; text is emitted exactly as is. */
  static const char* const kBanners[] = {
      "[BIULD INDEX FOR FORWARD STRAND (C->T)]\n",
      "[BIULD INDEX FOR REVERSE STRAND (C->T)]\n",
      "[BIULD INDEX FOR FORWARD STRAND (G->A)]\n",
      "[BIULD INDEX FOR REVERSE STRAND (G->A)]\n",
  };
  if (indicator >= 0 && indicator <= 3) {
    fprintf(stderr, "%s", kBanners[indicator]);
  }

  Genome genome;
  HashTable hash_table;
  ReadGenome(chrom_files, genome);

  /* Odd indicators work on the reverse complement. */
  const bool reverse_strand = (indicator % 2) != 0;
  if (reverse_strand) {
    ReverseComplementGenome(genome);
  }

  /* Only indicators 0 and 1 take the C->T conversion. */
  if (indicator != 0 && indicator != 1) {
    G2A(genome.sequence);
  } else {
    C2T(genome.sequence);
  }

  /* Index construction pipeline; the order of these calls is significant. */
  set<uint32_t> extremal_large_bucket;
  CountBucketSize(genome, hash_table, extremal_large_bucket);
  HashToBucket(genome, hash_table, extremal_large_bucket);
  SortHashTableBucket(genome, hash_table);
  WriteIndex(output_file, genome, hash_table);

  /* Keep the running maximum of the index sizes seen so far. */
  if (hash_table.index_size > size_of_index) {
    size_of_index = hash_table.index_size;
  }
}
Beispiel #3
0
/* Seed-and-verify mapping of one read against the genome; candidate positions
 * are collected in top_match.
 *
 * org_read       - original read; reads shorter than MINIMALREADLEN are
 *                  ignored (the function returns without touching top_match).
 * genome         - supplies sequence and the per-chromosome start_index.
 * hash_table     - supplies the counter and index arrays used for seed lookup.
 * strand         - stored with every candidate pushed to top_match.
 * AG_WILDCARD    - true: the read is converted with G2A; false: with C2T.
 * max_mismatches - candidates with more mismatches than this are dropped.
 * top_match      - in/out candidate collection; once it is Full() its Top()
 *                  mismatch count tightens the early-exit bound.
 */
void PairEndMapping(const string& org_read, const Genome& genome,
                    const HashTable& hash_table, const char& strand,
                    const bool& AG_WILDCARD, const uint32_t& max_mismatches,
                    TopCandidates& top_match) {
  uint32_t read_len = org_read.size();
  if (read_len < MINIMALREADLEN) {
    return;
  }

  /* return the maximal seed length for a particular read length */
  uint32_t seed_pattern_repeats = (read_len - SEEPATTERNLEN + 1)
      / SEEPATTERNLEN;
  uint32_t seed_len = seed_pattern_repeats * SEEPATTERNCAREDWEIGHT;

  string read;
  if (AG_WILDCARD) {
    G2A(org_read, read_len, read);
  } else {
    C2T(org_read, read_len, read);
  }

  uint32_t cur_max_mismatches = max_mismatches;
  for (uint32_t seed_i = 0; seed_i < SEEPATTERNLEN; ++seed_i) {
    /* all exact matches are covered by the first seed */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 0
        && seed_i)
      break;

    /* Both macros must be tested with defined(): a bare SEEDPATTERN5 in an
     * #if is evaluated as a value, which is ill-formed when the macro is
     * defined without a value. */
#if defined(SEEDPATTERN3) || defined(SEEDPATTERN5)
    /* all matches with 1 mismatch are covered by the first two seeds */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1
        && seed_i >= 2)
      break;
#endif

#ifdef SEEDPATTERN7
    /* with this pattern the search stops after the first four seeds once the
     * top candidate has exactly 1 mismatch */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1
        && seed_i >= 4)
      break;
#endif

    /* Look up the bucket for the seed starting at offset seed_i. */
    string read_seed = read.substr(seed_i);
    uint32_t hash_value = getHashValue(read_seed.c_str());
    pair<uint32_t, uint32_t> region;
    region.first = hash_table.counter[hash_value];
    region.second = hash_table.counter[hash_value + 1];

    /* Empty bucket: nothing to verify for this seed. */
    if (region.first == region.second)
      continue;

    /* NOTE(review): IndexRegion is expected to refine region in place to an
     * inclusive [first, second] range (the loop below uses <=) - confirm
     * against its definition. */
    IndexRegion(read_seed, genome, hash_table, seed_len, region);
    /* Skip seeds whose region still spans more than 5000 index entries. */
    if (region.second - region.first + 1 > 5000) {
      continue;
    }

    /* Both values depend only on seed_i, so compute them once per seed. */
    const uint32_t num_of_nocared = seed_pattern_repeats
        * SEEPATTERNNOCAREDWEIGHT + seed_i;
    const uint32_t tail_begin = seed_pattern_repeats * SEEPATTERNLEN + seed_i;

    for (uint32_t j = region.first; j <= region.second; ++j) {
      uint32_t genome_pos = hash_table.index[j];
      uint32_t chr_id = getChromID(genome.start_index, genome_pos);
      /* The read would start before the beginning of this chromosome. */
      if (genome_pos - genome.start_index[chr_id] < seed_i)
        continue;
      genome_pos = genome_pos - seed_i;
      /* The read would reach the start of the next chromosome. */
      if (genome_pos + read_len >= genome.start_index[chr_id + 1])
        continue;

      /* check the position */
      uint32_t num_of_mismatch = 0;
      /* Positions listed in F2NOCAREDPOSITION for this seed offset. */
      for (uint32_t p = 0;
          p < num_of_nocared && num_of_mismatch <= cur_max_mismatches; ++p) {
        if (genome.sequence[genome_pos + F2NOCAREDPOSITION[seed_i][p]]
            != read[F2NOCAREDPOSITION[seed_i][p]]) {
          num_of_mismatch++;
        }
      }
      /* Remaining read positions past the repeated seed pattern. */
      for (uint32_t p = tail_begin;
          p < read_len && num_of_mismatch <= cur_max_mismatches; ++p) {
        if (genome.sequence[genome_pos + p] != read[p]) {
          num_of_mismatch++;
        }
      }

      /* NOTE(review): this filter uses max_mismatches, not the tightened
       * cur_max_mismatches; kept as in the original. */
      if (num_of_mismatch > max_mismatches) {
        continue;
      }
      top_match.Push(CandidatePosition(genome_pos, strand, num_of_mismatch));
      if (top_match.Full()) {
        cur_max_mismatches = top_match.Top().mismatch;
      }
    }
  }
}