示例#1
0
unsigned int SubsetPartition::output_partitioned_file(const std::string infilename,
						      const std::string outputfile,
						      bool output_unassigned,
						      CallbackFn callback,
						      void * callback_data)
{
  IParser* parser = IParser::get_parser(infilename);
  ofstream outfile(outputfile.c_str());

  unsigned int total_reads = 0;
  unsigned int reads_kept = 0;
  unsigned int n_singletons = 0;

  PartitionSet partitions;

  Read read;
  string seq;

  std::string first_kmer;
  HashIntoType kmer = 0;

  const unsigned int ksize = _ht->ksize();

  //
  // go through all the reads, and take those with assigned partitions
  // and output them.
  //

  while(!parser->is_complete()) {
    read = parser->get_next_read();
    seq = read.seq;

    if (_ht->check_read(seq)) {
      const char * kmer_s = seq.c_str();

      bool found_tag = false;
      for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) {
	kmer = _hash(kmer_s + i, ksize);

	// is this a known tag?
	if (partition_map.find(kmer) != partition_map.end()) {
	  found_tag = true;
	  break;
	}
      }

      // all sequences should have at least one tag in them.
      // assert(found_tag);  @CTB currently breaks tests.  give fn flag to
      // disable.

      PartitionID partition_id = 0;
      if (found_tag) {
	PartitionID * partition_p = partition_map[kmer];
	if (partition_p == NULL ){
	  partition_id = 0;
	  n_singletons++;
	} else {
	  partition_id = *partition_p;
	  partitions.insert(partition_id);
	}
      }

      if (partition_id > 0 || output_unassigned) {
	outfile << ">" << read.name << "\t" << partition_id;
	outfile << "\n" << seq << "\n";
      }
#ifdef VALIDATE_PARTITIONS
      std::cout << "checking: " << read.name << "\n";
      assert(is_single_partition(seq));
#endif // VALIDATE_PARTITIONS
	       
      total_reads++;

      // run callback, if specified
      if (total_reads % CALLBACK_PERIOD == 0 && callback) {
	try {
	  callback("output_partitions", callback_data,
		   total_reads, reads_kept);
	} catch (...) {
	  delete parser; parser = NULL;
	  outfile.close();
	  throw;
	}
      }
    }
  }

  delete parser; parser = NULL;

  return partitions.size() + n_singletons;
}
示例#2
0
 virtual unsigned int output_partitioned_file(const std::string infilename,
                                              const std::string outputfilename,
                                              CallbackFn callback=0,
                                              void * callback_data=0)
 {
   IParser* parser = IParser::get_parser(infilename);
   ofstream outfile(outputfilename.c_str());
   
   unsigned int total_reads = 0;
   unsigned int reads_kept = 0;
   
   Read read;
   string seq;
   
   std::string first_kmer;
   HashIntoType forward_hash = 0, reverse_hash = 0;
   
   map<SetID, unsigned int> lReadCounts;
   
   while(!parser->is_complete()) {
     read = parser->get_next_read();
     seq = read.seq;
     
     if (check_read(seq)) {
       first_kmer = seq.substr(0, _ksize);
       
       // generate the hash for the first kmer in the read (fair amount of work)
       HashIntoType bin = _hash(first_kmer.c_str(), _ksize, forward_hash, reverse_hash);
       
       SetID lActualFinalSetID = _sets[ _set_IDs[ bin ] ]->getCurrentPrimarySetID();
       
       lReadCounts[lActualFinalSetID]++;
       
       outfile << ">" << read.name << "\t" << lActualFinalSetID
       << " " << "\n" 
       << seq << "\n";
       
       // reset the sequence info, increment read number
       total_reads++;
       
       // run callback, if specified
       if (total_reads % CALLBACK_PERIOD == 0 && callback) {
         try {
           callback("do_truncated_partition/output", callback_data,
                    total_reads, reads_kept);
         } catch (...) {
           delete parser; parser = NULL;
           outfile.close();
           throw;
         }
       }
     }
   }
   
   for ( map<SetID, unsigned int>::iterator lIt = lReadCounts.begin(); lIt != lReadCounts.end(); ++lIt )
   {
     cout << setw(10) << lIt->first;
     cout << setw(10) << lIt->second << endl;        
   }
   
   cout << setw(6) << "unique set count: "<< lReadCounts.size() << endl;
   cout << endl;
   
   
   delete parser; parser = NULL;
   
   return lReadCounts.size();
 }
示例#3
0
void Hashtable::consume_fasta(const std::string &filename,
			      unsigned int &total_reads,
			      unsigned long long &n_consumed,
			      HashIntoType lower_bound,
			      HashIntoType upper_bound,
			      ReadMaskTable ** orig_readmask,
			      bool update_readmask,
			      CallbackFn callback,
			      void * callback_data)
{
  total_reads = 0;
  n_consumed = 0;

  IParser* parser = IParser::get_parser(filename.c_str());
  Read read;

  string currName = "";
  string currSeq = "";

  //
  // readmask stuff: were we given one? do we want to update it?
  // 

  ReadMaskTable * readmask = NULL;
  std::list<unsigned int> masklist;

  if (orig_readmask && *orig_readmask) {
    readmask = *orig_readmask;
  }

  //
  // iterate through the FASTA file & consume the reads.
  //

  while(!parser->is_complete())  {
    read = parser->get_next_read();
    currSeq = read.seq;
    currName = read.name; 

    // do we want to process it?
    if (!readmask || readmask->get(total_reads)) {

      // yep! process.
      unsigned int this_n_consumed;
      bool is_valid;

      this_n_consumed = check_and_process_read(currSeq,
					       is_valid,
					       lower_bound,
					       upper_bound);

      // was this an invalid sequence -> mark as bad?
      if (!is_valid && update_readmask) {
        if (readmask) {
	  readmask->set(total_reads, false);
	} else {
	  masklist.push_back(total_reads);
	}
      } else {		// nope -- count it!
        n_consumed += this_n_consumed;
      }
    }
	       
    // reset the sequence info, increment read number
    total_reads++;

    // run callback, if specified
    if (total_reads % CALLBACK_PERIOD == 0 && callback) {
      try {
        callback("consume_fasta", callback_data, total_reads, n_consumed);
      } catch (...) {
        throw;
      }
    }
  }


  //
  // We've either updated the readmask in place, OR we need to create a
  // new one.
  //

  if (orig_readmask && update_readmask && readmask == NULL) {
    // allocate, fill in from masklist
    readmask = new ReadMaskTable(total_reads);

    list<unsigned int>::const_iterator it;
    for(it = masklist.begin(); it != masklist.end(); ++it) {
      readmask->set(*it, false);
    }
    *orig_readmask = readmask;
  }
}