/// Remove the line stored at index `nline`.
/// @param nline zero-based index into `lines`
/// @return true if a line was erased, false if `nline` is not a valid index
bool del_line(size_t nline)
{
  const bool in_range = nline < lines.size();
  if ( in_range )
    lines.erase(lines.begin() + nline);
  return in_range;
}
/// Insert a copy of `line` in front of the existing line at index `nline`.
/// Indices at or beyond the current number of lines are rejected, so this
/// call never appends at the end.
/// @return true if the line was inserted, false if `nline` is out of range
bool insert_line(size_t nline, simpleline_t &line)
{
  if ( nline < lines.size() )
  {
    lines.insert(lines.begin() + nline, line);
    return true;
  }
  return false;
}
/** Function generate_super_sketches * * Input: a list of read-pair sequences * Output: super sketches for each fragment */ void generate_super_sketches (std::vector<sketch_t>& super_sketches, const strvec_t& pairs, xny::sketch_list& slistgen, xny::super_sketch ssgen, jaz::murmur264& hashfunc) { bool debug = false; int num_frag = pairs.size()/2; int fragID = super_sketches.size(); std::vector<sketch_t> batch_sketches (num_frag); #pragma omp parallel for for (int i = 0; i < num_frag; ++ i) { std::string frag = pairs[i] + pairs[i + num_frag]; if (frag.size()) { if (std::isupper(frag.at(0))) { std::replace (frag.begin(), frag.end(), 'N', 'A'); } else std::replace (frag.begin(), frag.end(), 'n', 'a'); std::vector<sketch_t> slist = slistgen (frag, hashfunc); std::sort (slist.begin(), slist.end(), xny::cmp_sketch()); ssgen (batch_sketches[i], slist, hashfunc); batch_sketches[i].second = fragID + i; } } super_sketches.insert (super_sketches.end(), batch_sketches.begin(), batch_sketches.end()); } //generate_super_sketches
/// Replace the line at index `nline` with a copy of `sl`.
/// @return true on success, false if `nline` is not a valid index
bool set_line(size_t nline, simpleline_t &sl)
{
  const bool in_range = nline < lines.size();
  if ( in_range )
    lines[nline] = sl;
  return in_range;
}
/** Function generate_seeds () * * Given [pairs] which stores 2*n number of reads, where pairs[i] and * pairs[i + n] for 0 <= i <= n - 1 form a read pair * * Output: [list_seeds] the non-overlapping seeds for each read pair */ void generate_seeds (ii64vec_t& list_seeds, const strvec_t& pairs, int seed_len) { bool debug = false; int num_pairs = pairs.size()/2; int fragID = list_seeds.size(); ii64vec_t batch_seeds (num_pairs); int debug_cnt = 0; #pragma omp parallel for for (int i = 0; i < num_pairs; ++ i) { std::string frag = pairs[i] + pairs[i + num_pairs]; if (frag.size()) { if (std::isupper(frag.at(0))) { std::replace (frag.begin(), frag.end(), 'N', 'A'); } else std::replace (frag.begin(), frag.end(), 'n', 'a'); } get_seeds_per_fragment (batch_seeds[i], frag, seed_len); if (debug) { // print out frag and its seeds std::cout << frag << "\n"; int tmp_last_seed_len = frag.length() - seed_len * (frag.length()/seed_len); std::cout << "fraglen, seed_len, last_seed_len = " << frag.length() << ", " << seed_len << ", " << tmp_last_seed_len << "\n"; for (auto& x : batch_seeds[i]) { std::string d_fwd = xny::ID2Str<int64_t>(x, seed_len); std::cout << d_fwd << "\t" << xny::get_rvc_str(d_fwd) << "\n"; } if (tmp_last_seed_len != 0) { std::cout << "last str: "; std::string d_fwd = xny::ID2Str<int64_t>(batch_seeds[i].back(), tmp_last_seed_len); std::cout << d_fwd << "\t" << xny::get_rvc_str(d_fwd) << "\n"; } debug_cnt ++; if (debug_cnt > 2) exit(1); } // append to the seed list the fragment ID batch_seeds[i].push_back(fragID + i); if (debug) { // debug print std::cout << frag << "\n"; for (int j = 0; j < batch_seeds[i].size(); ++ j) { std::cout << batch_seeds[i][j] << " "; } std::cout << "\n\n"; ++ debug_cnt; if (debug_cnt > 50) exit(1); } } list_seeds.insert (list_seeds.end(), batch_seeds.begin(), batch_seeds.end()); } // generate_seeds
bool patch_line(size_t nline, size_t offs, int value) { if ( nline >= lines.size() ) return false; qstring &L = lines[nline].line; L[offs] = (uchar) value & 0xFF; return true; }
/// Re-join tokens that were split around a "." separator.
///
/// Scans `tokens_in` left to right. Whenever a "." token sits between two
/// non-empty tokens, the three are concatenated into one output token and
/// scanning resumes after them; otherwise the current token is copied
/// through unchanged. Results are appended to `tokens_out` (existing
/// contents are kept).
void merge_floats(strvec_t const & tokens_in, strvec_t & tokens_out)
{
    ptrdiff_t const total = tokens_in.size();
    ptrdiff_t pos = 0;

    // A merge needs three tokens, so stop while fewer than three remain.
    while (pos + 2 < total)
    {
        string const & left  = tokens_in[pos];
        string const & dot   = tokens_in[pos + 1];
        string const & right = tokens_in[pos + 2];

        bool const mergeable = dot == "." && !left.empty() && !right.empty();
        if (mergeable)
        {
            tokens_out.push_back(left + dot + right);
            pos += 3;
        }
        else
        {
            tokens_out.push_back(left);
            pos += 1;
        }
    }

    // Copy the (at most two) trailing tokens that cannot start a merge.
    while (pos < total)
    {
        tokens_out.push_back(tokens_in[pos]);
        ++pos;
    }
}
/// Set the first/last line numbers stored in `pl_min` / `pl_max`.
/// Called with the default arguments, the range covers all of `lines`
/// (0 .. size-1, or 0 .. 0 when there are no lines). Any other argument
/// pair is stored verbatim.
void set_minmax(size_t start=0, size_t end=size_t(-1))
{
  const bool whole_range = start == 0 && end == size_t(-1);
  if ( !whole_range )
  {
    pl_min.n = start;
    pl_max.n = end;
    return;
  }

  const size_t total = lines.size();
  pl_min.n = 0;
  pl_max.n = total == 0 ? 0 : total - 1;
}
/** Function duplicate_removal () * * Input: a list of paired fastq files * Output: a list of paired fastq files with duplicate removed. If the * num of output files is equal to the input, then dupl removal is applied * to each input pair; otherwise, 2 paired fastq output files should be * specified, and the output are combined in these two files. * * Randomly mutate 'N's in each read into 'A'. This will * result in reads containing a small number of Ns being considered for * clustering whereas reads containing a lot of Ns will not be considered * anyway. */ void duplicate_removal (const strvec_t& ifqs, const drm_t& drm, int w, int w2, xny::low_complexity& lc, int batch, bool silent) { // sanity check if ((drm.op.size() != ifqs.size()) && (drm.op.size () != 2)) { abording ("duplicate_removal ofqs.size() != ifqs.size() and" "ofqs.size() != 2"); } // first obtain the length of a read, double it to be fragment length std::ifstream fh_tmp; xny::openfile<std::ifstream>(fh_tmp, ifqs[0]); bio::fastq_input_iterator<> fq(fh_tmp); int frag_len = 2 * (std::get<1>(*fq)).length(); xny::closefile(fh_tmp); // calculate the upper bound of mismatches can be tolerated, // number of seeds & seed length; max seed length will be bounded by 31 // int ub_mismatch = std::min (drm.max_mismatch, frag_len * (100 - drm.perc_sim) /100), int ub_mismatch = frag_len * (100 - drm.perc_sim) /100, //num_seed = ub_mismatch + 1, seed_len = std::min(frag_len / (ub_mismatch + 1), 31); //num_seed = std::max(num_seed, frag_len/seed_len); //std::cout << max_mismatch << ", " << frag_len << ", " << perc_sim << "\n"; //std::cout << "ub_mismatch = " << ub_mismatch << "\n"; // process every pair of files int num_file_pairs = ifqs.size()/2; xny::sketch_list slistgen (w, false); xny::super_sketch ssgen (w2); std::ofstream ofhfq, ofhfq2; if (drm.op.size() == 2) { xny::openfile<std::ofstream> (ofhfq, drm.op[0]); xny::openfile<std::ofstream> (ofhfq2, drm.op[1]); } for (int i = 0; i < num_file_pairs; ++ i) { int fID 
= 2*i; if (! silent) { std::cout << "\tprocess files: " << ifqs[fID] << " and " << ifqs[fID + 1] << "\n\n"; } // --------------- generate seeds for each fragment ------------- // ----------- a compressed form to represent fragments --------- if (! silent) std::cout << "\tgenerate seeds...\n"; ii64vec_t list_seeds; // stores the list of seeds per fragment, // the last element stores the fragment ID get_seeds (list_seeds, ifqs[fID], ifqs[fID + 1], seed_len, batch, silent); // ---- initialize the global union-find structure ---- ivec_t uf_clst (list_seeds.size()); for (unsigned int j = 0; j < list_seeds.size(); ++ j) uf_clst[j] = j; // ------------------ clustering via ss ------------------------- if (! silent) std::cout << "\tclustering via super sketches ...\n"; clustering_via_ss (uf_clst, list_seeds, ifqs[fID], ifqs[fID + 1], batch, slistgen, ssgen, ub_mismatch, silent); // --------------- clustering via seeds ----------------- if (!silent) std::cout << "\tclustering via seeds ...\n"; clustering_via_seeds (uf_clst, list_seeds, frag_len/seed_len, ub_mismatch, silent); // -------- generate final union find clusters -------------- iivec_t clusters; uf_generate_cls (clusters, uf_clst); // ---- generate the duplicated fragment IDs ---------------- iset_t duplIDs; // duplicate fragIDs int debug_counter = 0; for (int j = 0; j < (int) clusters.size(); ++ j) { duplIDs.insert(clusters[j].begin() + 1, clusters[j].end()); /*{ // debug code print out clusters if (clusters[j].size() > 100) { debug_print_fragments (clusters[j], ifqs[fID], ifqs[fID+1]); ++ debug_counter; if (debug_counter > 10) exit(1); } }*/ } clusters.clear(); if (!silent) std::cout << "\n\t\tnum duplicate frags: " << duplIDs.size() << "(" << 100 * duplIDs.size()/ list_seeds.size() << "% total)" << "\n\n"; if (!silent) std::cout << "\toutput non-redundant read-pairs...\n"; // ----- output non-redundant read-pairs --------- if (drm.op.size() > 2) { xny::openfile<std::ofstream> (ofhfq, drm.op[fID]); 
xny::openfile<std::ofstream> (ofhfq2, drm.op[fID + 1]); } clean_dupl_frag (ifqs[fID], ifqs[fID+1], ofhfq, ofhfq2, duplIDs, lc, batch); if (drm.op.size() > 2) { xny::closefile(ofhfq); xny::closefile(ofhfq2); } } // for (int i = 0 } // duplicate_removal
void add_line(const char *str) { lines.push_back(simpleline_t(str)); }
/// Append a copy of `line` to the end of `lines`.
void add_line(simpleline_t &line)
{
  lines.push_back(line);
}
/// Drop every stored line and reset the min/max positions.
void clear_lines()
{
  lines.clear();
  // Must run after the clear: set_minmax() derives the range from lines.size().
  set_minmax();
}
/// @return the number of lines currently stored in `lines`
const size_t count() const
{
  const size_t total = lines.size();
  return total;
}
/// Look up a line by index.
/// @return pointer to the line stored at `nline`, or NULL if `nline` is
///         not a valid index
simpleline_t *get_line(size_t nline)
{
  if ( nline < lines.size() )
    return &lines[nline];
  return NULL;
}