示例#1
0
  StatSingleReads(const bool& _ambiguous, const bool& _unmapped,
                  const string& output_file, const bool& _SAM)
      : ambiguous(_ambiguous),
        unmapped(_unmapped),
        SAM(_SAM) {
    total_reads = 0;
    unique_mapped_reads = 0;
    ambiguous_mapped_reads = 0;
    unmapped_reads = 0;

    num_of_short_reads = 0;

    if (ambiguous && !SAM) {
      fambiguous = fopen(string(output_file + "_ambiguous").c_str(), "w");
      if (!fambiguous) {
        throw SMITHLABException(
            "cannot open input file " + string(output_file + "_ambiguous"));
      }
    }
    if (unmapped && !SAM) {
      funmapped = fopen(string(output_file + "_unmapped").c_str(), "w");
      if (!funmapped) {
        throw SMITHLABException(
            "cannot open input file " + string(output_file + "_unmapped"));
      }
    }
  }
示例#2
0
/*
 * Writes the index head file.
 *
 * The file is written as raw binary (values are copied from memory as-is):
 *   - uint32_t number of chromosomes
 *   - for each chromosome: uint32_t name length (capped at 255) followed by
 *     that many bytes of the name
 *   - the chromosome length array (one uint32_t per chromosome)
 *   - genome.length_of_genome
 *   - size_of_index
 *
 * index_file     path of the file to create (opened with "wb").
 * genome         source of the chromosome names and lengths.
 * size_of_index  value stored as the last field of the file.
 *
 * Throws SMITHLABException if the file cannot be opened or if any write
 * (including the final flush done by fclose) fails.
 */
void WriteIndexHeadInfo(const string& index_file, const Genome& genome,
                        const uint32_t& size_of_index) {
  fprintf(stderr, "[WRITTING INDEX HEAD TO %s]\n", index_file.c_str());
  FILE * fout = fopen(index_file.c_str(), "wb");
  if (!fout) {
    throw SMITHLABException("cannot open output file " + index_file);
  }

  const uint32_t num_of_chroms = genome.num_of_chroms;
  // `ok` turns false on the first short write; later writes are then skipped.
  bool ok = fwrite(&num_of_chroms, sizeof(uint32_t), 1, fout) == 1;

  for (uint32_t i = 0; ok && i < num_of_chroms; ++i) {
    // Names are capped at 255 bytes; ReadIndexHeadInfo reads them into a
    // 256-byte buffer.
    const size_t full_len = genome.name[i].size();
    const uint32_t chrom_name_len =
        full_len > 255 ? 255 : static_cast<uint32_t>(full_len);
    ok = fwrite(&chrom_name_len, sizeof(uint32_t), 1, fout) == 1
        && fwrite(genome.name[i].c_str(), sizeof(char), chrom_name_len,
                  fout) == chrom_name_len;
  }

  // Do not take the address of element 0 of an empty array.
  if (ok && num_of_chroms > 0) {
    ok = fwrite(&(genome.length[0]), sizeof(uint32_t), num_of_chroms, fout)
        == num_of_chroms;
  }
  ok = ok && fwrite(&(genome.length_of_genome), sizeof(uint32_t), 1, fout) == 1;

  ok = ok && fwrite(&size_of_index, sizeof(uint32_t), 1, fout) == 1;

  // fclose flushes buffered data, so its result is part of the write status.
  if (fclose(fout) != 0) {
    ok = false;
  }
  if (!ok) {
    throw SMITHLABException("cannot write to file " + index_file);
  }
}
示例#3
0
/*
 * Loads one index file into caller-provided buffers.
 *
 * The file is read as raw binary in this order: genome.strand (1 byte),
 * genome.length_of_genome bytes of sequence, hash_table.counter_size,
 * hash_table.index_size, counter_size + 1 counter entries, index_size index
 * entries.
 *
 * Preconditions: genome.length_of_genome is already set, and
 * genome.sequence, hash_table.counter and hash_table.index are already
 * sized by the caller; this function never resizes them.
 *
 * Throws SMITHLABException if the file cannot be opened or if the sizes
 * stored in the file do not fit the provided buffers. Short reads are
 * handled by FREAD_CHECK, which is defined outside this block.
 */
void ReadIndex(const string& index_file, Genome& genome,
               HashTable& hash_table) {
  FILE * fin = fopen(index_file.c_str(), "rb");
  if (!fin) {
    throw SMITHLABException("cannot open input file " + index_file);
  }

  FREAD_CHECK(fread(&(genome.strand), sizeof(char), 1, fin), 1);

  if (genome.sequence.size() < genome.length_of_genome) {
    fclose(fin);
    throw SMITHLABException(
        "genome buffer is too small for index file " + index_file);
  }
  // Skip zero-length reads so element 0 of an empty buffer is never addressed.
  if (genome.length_of_genome > 0) {
    FREAD_CHECK(
        fread(&(genome.sequence[0]), sizeof(char), genome.length_of_genome,
              fin),
        genome.length_of_genome);
  }

  /* read hash table from disk */
  FREAD_CHECK(fread(&(hash_table.counter_size), sizeof(uint32_t), 1, fin), 1);
  FREAD_CHECK(fread(&(hash_table.index_size), sizeof(uint32_t), 1, fin), 1);

  // The two sizes come from the file; reject them before using them as
  // element counts, otherwise fread would write past the buffers.
  // counter_size + 1 entries are needed, i.e. counter_size < counter.size()
  // (written this way so the + 1 cannot wrap around).
  if (hash_table.counter_size >= hash_table.counter.size()
      || hash_table.index_size > hash_table.index.size()) {
    fclose(fin);
    throw SMITHLABException(
        "hash table buffers are too small for index file " + index_file);
  }

  FREAD_CHECK(
      fread(&(hash_table.counter[0]), sizeof(uint32_t),
            hash_table.counter_size + 1, fin),
      hash_table.counter_size + 1);
  if (hash_table.index_size > 0) {
    FREAD_CHECK(
        fread(&(hash_table.index[0]), sizeof(uint32_t), hash_table.index_size,
              fin),
        hash_table.index_size);
  }

  fclose(fin);
}
示例#4
0
/*
 * Writes one index file.
 *
 * The file is written as raw binary in this order: genome.strand (1 byte),
 * genome.length_of_genome bytes of sequence, hash_table.counter_size,
 * hash_table.index_size, counter_size + 1 counter entries, index_size index
 * entries. ReadIndex reads the same fields in the same order.
 *
 * index_file  path of the file to create (opened with "wb").
 * genome      source of the strand byte and the sequence.
 * hash_table  source of the counter and index arrays.
 *
 * Throws SMITHLABException if the file cannot be opened or if any write
 * (including the final flush done by fclose) fails.
 */
void WriteIndex(const string& index_file, const Genome& genome,
                const HashTable& hash_table) {
  fprintf(stderr, "[WRITTING INDEX TO %s]\n", index_file.c_str());
  FILE * fout = fopen(index_file.c_str(), "wb");
  if (!fout) {
    throw SMITHLABException("cannot open output file " + index_file);
  }

  // `ok` turns false on the first short write; later writes are then skipped.
  bool ok = fwrite(&(genome.strand), sizeof(char), 1, fout) == 1;
  // Zero-length arrays are skipped so element 0 of an empty buffer is never
  // addressed.
  if (ok && genome.length_of_genome > 0) {
    ok = fwrite(&(genome.sequence[0]), sizeof(char), genome.length_of_genome,
                fout) == genome.length_of_genome;
  }

  /* write hash table to disk */
  ok = ok && fwrite(&(hash_table.counter_size), sizeof(uint32_t), 1, fout) == 1;
  ok = ok && fwrite(&(hash_table.index_size), sizeof(uint32_t), 1, fout) == 1;

  const size_t num_of_counters =
      static_cast<size_t>(hash_table.counter_size) + 1;
  ok = ok && fwrite(&(hash_table.counter[0]), sizeof(uint32_t),
                    num_of_counters, fout) == num_of_counters;
  if (ok && hash_table.index_size > 0) {
    ok = fwrite(&(hash_table.index[0]), sizeof(uint32_t),
                hash_table.index_size, fout) == hash_table.index_size;
  }

  // fclose flushes buffered data, so its result is part of the write status.
  if (fclose(fout) != 0) {
    ok = false;
  }
  if (!ok) {
    throw SMITHLABException("cannot write to file " + index_file);
  }
}
示例#5
0
/*
 * Reads the index head file written by WriteIndexHeadInfo.
 *
 * Fills genome.num_of_chroms, genome.name, genome.length,
 * genome.length_of_genome and genome.start_index (prefix sums of the
 * chromosome lengths, with start_index[0] == 0), and stores the last field
 * of the file in size_of_index.
 *
 * index_file     path of the head file (opened with "rb").
 * genome         receives the chromosome table.
 * size_of_index  receives the index size stored in the file.
 *
 * Throws SMITHLABException if the file cannot be opened or if it stores a
 * chromosome name longer than 255 bytes. Short reads are handled by
 * FREAD_CHECK, which is defined outside this block.
 */
void ReadIndexHeadInfo(const string& index_file, Genome& genome,
                       uint32_t& size_of_index) {
  FILE * fin = fopen(index_file.c_str(), "rb");
  if (!fin) {
    throw SMITHLABException("cannot open input file " + index_file);
  }

  uint32_t num_of_chroms;
  FREAD_CHECK(fread(&num_of_chroms, sizeof(uint32_t), 1, fin), 1);
  genome.num_of_chroms = num_of_chroms;
  genome.name.resize(num_of_chroms);
  genome.length.resize(num_of_chroms);
  genome.start_index.resize(num_of_chroms + 1);

  char chrom_name[256];
  uint32_t chrom_name_len;
  for (uint32_t i = 0; i < num_of_chroms; ++i) {
    FREAD_CHECK(fread(&chrom_name_len, sizeof(uint32_t), 1, fin), 1);
    // The length comes from the file: it must leave room for the terminating
    // zero, otherwise the fread below would overflow the stack buffer.
    if (chrom_name_len >= sizeof(chrom_name)) {
      fclose(fin);
      throw SMITHLABException(
          "chromosome name is too long in index file " + index_file);
    }
    FREAD_CHECK(fread(chrom_name, sizeof(char), chrom_name_len, fin),
                chrom_name_len);
    chrom_name[chrom_name_len] = 0;
    genome.name[i] = chrom_name;
  }

  // Skip the read for an empty table so element 0 of an empty array is
  // never addressed.
  if (num_of_chroms > 0) {
    FREAD_CHECK(
        fread(&(genome.length[0]), sizeof(uint32_t), num_of_chroms, fin),
        num_of_chroms);
  }
  FREAD_CHECK(fread(&(genome.length_of_genome), sizeof(uint32_t), 1, fin), 1);

  // start_index[i] is the sum of the lengths of chromosomes 0 .. i-1.
  genome.start_index[0] = 0;
  for (uint32_t i = 1; i <= num_of_chroms; ++i) {
    genome.start_index[i] = genome.start_index[i - 1] + genome.length[i - 1];
  }

  FREAD_CHECK(fread(&size_of_index, sizeof(uint32_t), 1, fin), 1);

  fclose(fin);
}
示例#6
0
文件: paired.cpp 项目: acgtun/walt
/*
 * Maps paired-end reads from two FASTQ files and writes the merged results.
 *
 * Reads are processed in batches of n_reads_to_process pairs. For every
 * batch and every mate, the two index files of that mate are loaded in turn
 * and PairEndMapping is run on each read (in parallel with OpenMP); the
 * candidates of both mates are then combined per pair by
 * MergePairedEndResults. Statistics are reported by OutputPairedStatInfo
 * after the last batch.
 *
 * command             passed to SAMHead when SAM output is requested.
 * index_file          index head file; also the prefix of the four index
 *                     files "_CT00", "_CT01", "_GA10" and "_GA11".
 * reads_file_p1/_p2   FASTQ files of mate 1 and mate 2.
 * output_file         file receiving the mapping results.
 * n_reads_to_process  number of read pairs per batch.
 * max_mismatches      passed to PairEndMapping and MergePairedEndResults.
 * adaptor             split by extract_adaptors into one adaptor per mate.
 * top_k               size given to every per-read TopCandidates container.
 * frag_range          passed to MergePairedEndResults.
 * ambiguous, unmapped passed to StatPairedReads.
 * SAM                 when true a SAM header is written first; also passed
 *                     on to StatPairedReads and MergePairedEndResults.
 * num_of_threads      number of OpenMP threads.
 *
 * Throws SMITHLABException if one of the three files cannot be opened.
 * Calls exit(EXIT_FAILURE) if a batch yields a different number of reads
 * from the two input files.
 */
void ProcessPairedEndReads(const string& command, const string& index_file,
                           const string& reads_file_p1,
                           const string& reads_file_p2,
                           const string& output_file,
                           const uint32_t& n_reads_to_process,
                           const uint32_t& max_mismatches,
                           const string& adaptor, const uint32_t& top_k,
                           const int& frag_range, const bool& ambiguous,
                           const bool& unmapped, const bool& SAM,
                           const int& num_of_threads) {
  // LOAD THE INDEX HEAD INFO
  Genome genome;
  HashTable hash_table;

  uint32_t size_of_index;
  ReadIndexHeadInfo(index_file, genome, size_of_index);
  // The buffers are sized once here; ReadIndex refills them for each index
  // file without resizing.
  genome.sequence.resize(genome.length_of_genome);
  hash_table.counter.resize(power(4, F2SEEDKEYWIGTH) + 1);
  hash_table.index.resize(size_of_index);

  // index_names[pi][fi]: pi selects the mate (0 or 1), fi selects which of
  // that mate's two index files is loaded.
  vector<vector<string> > index_names(2, vector<string>(2));
  index_names[0][0] = index_file + "_CT00";
  index_names[0][1] = index_file + "_CT01";
  index_names[1][0] = index_file + "_GA10";
  index_names[1][1] = index_file + "_GA11";

  // Per-mate batch buffers, reused for every batch.
  vector<vector<string> > read_names(2, vector<string>(n_reads_to_process));
  vector<vector<string> > read_seqs(2, vector<string>(n_reads_to_process));
  vector<vector<string> > read_scores(2, vector<string>(n_reads_to_process));

  vector<int> ranked_results_size(2);
  vector<vector<CandidatePosition> > ranked_results(2,
          vector<CandidatePosition>(MAX_NUM_EXACT_MAPPED));

  vector<vector<TopCandidates> > top_results(2,
         vector<TopCandidates>(n_reads_to_process));

  FILE * fin[2];
  fin[0] = fopen(reads_file_p1.c_str(), "r");
  if (!fin[0]) {
    throw SMITHLABException("cannot open input file " + reads_file_p1);
  }
  fin[1] = fopen(reads_file_p2.c_str(), "r");
  if (!fin[1]) {
    fclose(fin[0]);  // do not leak the first handle
    throw SMITHLABException("cannot open input file " + reads_file_p2);
  }

  string adaptors[2];
  extract_adaptors(adaptor, adaptors[0], adaptors[1]);
  // Wall-clock start. clock() would report processor time, which is inflated
  // when the mapping loop below runs on several threads.
  const time_t start_t = time(NULL);
  FILE * fout = fopen(output_file.c_str(), "w");
  if (!fout) {
    fclose(fin[0]);
    fclose(fin[1]);
    throw SMITHLABException("cannot open output file " + output_file);
  }
  uint32_t num_of_reads[2];
  StatPairedReads stat_paired_reads(ambiguous, unmapped, output_file, SAM);
  fprintf(stderr, "[MAPPING PAIRED-END READS FROM THE FOLLOWING TWO FILES]\n");
  fprintf(stderr, "   %s (AND)\n   %s\n", reads_file_p1.c_str(),
          reads_file_p2.c_str());
  fprintf(stderr, "[OUTPUT MAPPING RESULTS TO %s]\n", output_file.c_str());
  if (SAM) {
    SAMHead(index_file, command, fout);
  }
  omp_set_dynamic(0);
  omp_set_num_threads(num_of_threads);
  for (uint32_t i = 0;; i += n_reads_to_process) {
    num_of_reads[0] = num_of_reads[1] = 0;
    for (uint32_t pi = 0; pi < 2; ++pi) {  // paired end reads _1 and _2
      // The flag is set only for the second mate.
      const bool AG_WILDCARD = (pi == 1);
      LoadReadsFromFastqFile(fin[pi], i, n_reads_to_process, adaptors[pi],
                             num_of_reads[pi], read_names[pi], read_seqs[pi],
                             read_scores[pi]);
      if (num_of_reads[pi] == 0)
        break;

      //Initialize the paired results
      for (uint32_t j = 0; j < num_of_reads[pi]; ++j) {
        top_results[pi][j].Clear();
        top_results[pi][j].SetSize(top_k);
      }

      for (uint32_t fi = 0; fi < 2; ++fi) {
        ReadIndex(index_names[pi][fi], genome, hash_table);
        const char strand = fi == 0 ? '+' : '-';
#pragma omp parallel for
        for (uint32_t j = 0; j < num_of_reads[pi]; ++j) {
          PairEndMapping(read_seqs[pi][j], genome, hash_table, strand,
                         AG_WILDCARD, max_mismatches, top_results[pi][j]);
        }
      }
    }
    if (num_of_reads[0] != num_of_reads[1]) {
      fprintf(stderr,
              "The number of reads in paired-end files should be the same.\n");
      exit( EXIT_FAILURE);
    }
    if (num_of_reads[0] == 0) {
      break;
    }
    stat_paired_reads.total_read_pairs += num_of_reads[0];
    ///////////////////////////////////////////////////////////
    // Merge Paired-end results
    for (uint32_t j = 0; j < num_of_reads[0]; ++j) {
      // Drain each mate's candidate container into a flat array, in the
      // order Top()/Pop() deliver the entries.
      for (uint32_t pi = 0; pi < 2; ++pi) {
        ranked_results_size[pi] = 0;
        while (!top_results[pi][j].candidates.empty()) {
          ranked_results[pi][ranked_results_size[pi]++] =
              top_results[pi][j].Top();
          top_results[pi][j].Pop();
        }
      }

      MergePairedEndResults(genome, read_names[0][j], read_seqs[0][j],
                            read_scores[0][j], read_seqs[1][j],
                            read_scores[1][j], ranked_results,
                            ranked_results_size, frag_range, max_mismatches,
                            SAM, stat_paired_reads, fout);
    }

    // A short batch means the input files are exhausted.
    if (num_of_reads[0] < n_reads_to_process)
      break;
  }

  fclose(fin[0]);
  fclose(fin[1]);
  fclose(fout);

  OutputPairedStatInfo(stat_paired_reads, output_file);
  fprintf(stderr, "[MAPPING TAKES %.0lf SECONDS]\n",
          difftime(time(NULL), start_t));
}