unsigned int SubsetPartition::output_partitioned_file(const std::string infilename, const std::string outputfile, bool output_unassigned, CallbackFn callback, void * callback_data) { IParser* parser = IParser::get_parser(infilename); ofstream outfile(outputfile.c_str()); unsigned int total_reads = 0; unsigned int reads_kept = 0; unsigned int n_singletons = 0; PartitionSet partitions; Read read; string seq; std::string first_kmer; HashIntoType kmer = 0; const unsigned int ksize = _ht->ksize(); // // go through all the reads, and take those with assigned partitions // and output them. // while(!parser->is_complete()) { read = parser->get_next_read(); seq = read.seq; if (_ht->check_read(seq)) { const char * kmer_s = seq.c_str(); bool found_tag = false; for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) { kmer = _hash(kmer_s + i, ksize); // is this a known tag? if (partition_map.find(kmer) != partition_map.end()) { found_tag = true; break; } } // all sequences should have at least one tag in them. // assert(found_tag); @CTB currently breaks tests. give fn flag to // disable. PartitionID partition_id = 0; if (found_tag) { PartitionID * partition_p = partition_map[kmer]; if (partition_p == NULL ){ partition_id = 0; n_singletons++; } else { partition_id = *partition_p; partitions.insert(partition_id); } } if (partition_id > 0 || output_unassigned) { outfile << ">" << read.name << "\t" << partition_id; outfile << "\n" << seq << "\n"; } #ifdef VALIDATE_PARTITIONS std::cout << "checking: " << read.name << "\n"; assert(is_single_partition(seq)); #endif // VALIDATE_PARTITIONS total_reads++; // run callback, if specified if (total_reads % CALLBACK_PERIOD == 0 && callback) { try { callback("output_partitions", callback_data, total_reads, reads_kept); } catch (...) { delete parser; parser = NULL; outfile.close(); throw; } } } } delete parser; parser = NULL; return partitions.size() + n_singletons; }
virtual unsigned int output_partitioned_file(const std::string infilename, const std::string outputfilename, CallbackFn callback=0, void * callback_data=0) { IParser* parser = IParser::get_parser(infilename); ofstream outfile(outputfilename.c_str()); unsigned int total_reads = 0; unsigned int reads_kept = 0; Read read; string seq; std::string first_kmer; HashIntoType forward_hash = 0, reverse_hash = 0; map<SetID, unsigned int> lReadCounts; while(!parser->is_complete()) { read = parser->get_next_read(); seq = read.seq; if (check_read(seq)) { first_kmer = seq.substr(0, _ksize); // generate the hash for the first kmer in the read (fair amount of work) HashIntoType bin = _hash(first_kmer.c_str(), _ksize, forward_hash, reverse_hash); SetID lActualFinalSetID = _sets[ _set_IDs[ bin ] ]->getCurrentPrimarySetID(); lReadCounts[lActualFinalSetID]++; outfile << ">" << read.name << "\t" << lActualFinalSetID << " " << "\n" << seq << "\n"; // reset the sequence info, increment read number total_reads++; // run callback, if specified if (total_reads % CALLBACK_PERIOD == 0 && callback) { try { callback("do_truncated_partition/output", callback_data, total_reads, reads_kept); } catch (...) { delete parser; parser = NULL; outfile.close(); throw; } } } } for ( map<SetID, unsigned int>::iterator lIt = lReadCounts.begin(); lIt != lReadCounts.end(); ++lIt ) { cout << setw(10) << lIt->first; cout << setw(10) << lIt->second << endl; } cout << setw(6) << "unique set count: "<< lReadCounts.size() << endl; cout << endl; delete parser; parser = NULL; return lReadCounts.size(); }
void Hashtable::consume_fasta(const std::string &filename, unsigned int &total_reads, unsigned long long &n_consumed, HashIntoType lower_bound, HashIntoType upper_bound, ReadMaskTable ** orig_readmask, bool update_readmask, CallbackFn callback, void * callback_data) { total_reads = 0; n_consumed = 0; IParser* parser = IParser::get_parser(filename.c_str()); Read read; string currName = ""; string currSeq = ""; // // readmask stuff: were we given one? do we want to update it? // ReadMaskTable * readmask = NULL; std::list<unsigned int> masklist; if (orig_readmask && *orig_readmask) { readmask = *orig_readmask; } // // iterate through the FASTA file & consume the reads. // while(!parser->is_complete()) { read = parser->get_next_read(); currSeq = read.seq; currName = read.name; // do we want to process it? if (!readmask || readmask->get(total_reads)) { // yep! process. unsigned int this_n_consumed; bool is_valid; this_n_consumed = check_and_process_read(currSeq, is_valid, lower_bound, upper_bound); // was this an invalid sequence -> mark as bad? if (!is_valid && update_readmask) { if (readmask) { readmask->set(total_reads, false); } else { masklist.push_back(total_reads); } } else { // nope -- count it! n_consumed += this_n_consumed; } } // reset the sequence info, increment read number total_reads++; // run callback, if specified if (total_reads % CALLBACK_PERIOD == 0 && callback) { try { callback("consume_fasta", callback_data, total_reads, n_consumed); } catch (...) { throw; } } } // // We've either updated the readmask in place, OR we need to create a // new one. // if (orig_readmask && update_readmask && readmask == NULL) { // allocate, fill in from masklist readmask = new ReadMaskTable(total_reads); list<unsigned int>::const_iterator it; for(it = masklist.begin(); it != masklist.end(); ++it) { readmask->set(*it, false); } *orig_readmask = readmask; } }