Example #1
0
/**	Function generate_super_sketches
 *
 * Input: [pairs] stores 2*n reads; pairs[i] and pairs[i + n] are
 * concatenated into fragment i (0 <= i <= n - 1). [slistgen] is invoked on
 * each non-empty fragment to obtain its sketch list, [ssgen] is invoked on
 * the sorted sketch list to fill the fragment's entry, and [hashfunc] is
 * handed to both.
 * Output: n entries are appended to [super_sketches]. For a non-empty
 * fragment the .second field of its entry is the fragment ID, numbered
 * consecutively starting from the size of [super_sketches] on entry.
 * Entries of empty fragments are appended value-initialized.
 */
void generate_super_sketches (std::vector<sketch_t>& super_sketches,
		const strvec_t& pairs, xny::sketch_list& slistgen,
		xny::super_sketch ssgen, jaz::murmur264& hashfunc) {

	int num_frag = pairs.size()/2;
	int fragID = super_sketches.size();

	// every iteration writes only to its own slot, the batch is appended
	// to the caller's list after the parallel loop
	std::vector<sketch_t> batch_sketches (num_frag);

	#pragma omp parallel for
	for (int i = 0; i < num_frag; ++ i) {
		std::string frag = pairs[i] + pairs[i + num_frag];
		if (frag.size()) {
			// the case of the first base selects which spelling of the
			// ambiguous base is rewritten; std::isupper is undefined for
			// negative char values, hence the cast to unsigned char
			if (std::isupper(static_cast<unsigned char>(frag.at(0)))) {
				std::replace (frag.begin(), frag.end(), 'N', 'A');
			} else std::replace (frag.begin(), frag.end(), 'n', 'a');

			std::vector<sketch_t> slist = slistgen (frag, hashfunc);
			std::sort (slist.begin(), slist.end(), xny::cmp_sketch());
			ssgen (batch_sketches[i], slist, hashfunc);
			batch_sketches[i].second = fragID + i;
		}
	}

	super_sketches.insert (super_sketches.end(),
			batch_sketches.begin(), batch_sketches.end());
} //generate_super_sketches
Example #2
0
 // Inserts a copy of `line` in front of the existing line number `nline`.
 // Returns false, leaving the container untouched, when `nline` does not
 // refer to an existing line; returns true after a successful insert.
 bool insert_line(size_t nline, simpleline_t &line)
 {
   const bool is_valid = nline < lines.size();
   if ( is_valid )
     lines.insert(lines.begin() + nline, line);
   return is_valid;
 }
Example #3
0
 // Removes the line number `nline`.
 // Returns true when a line was removed, false when `nline` does not
 // refer to an existing line.
 bool del_line(size_t nline)
 {
   if ( nline < lines.size() )
   {
     lines.erase(lines.begin() + nline);
     return true;
   }
   return false;
 }
Example #4
0
 // Overwrites the existing line number `nline` with a copy of `sl`.
 // Returns false without modifying anything when `nline` does not refer
 // to an existing line; returns true otherwise.
 bool set_line(size_t nline, simpleline_t &sl)
 {
   const bool exists = nline < lines.size();
   if ( exists )
     lines[nline] = sl;
   return exists;
 }
Example #5
0
/** Function generate_seeds ()
 *
 * Given [pairs] which stores 2*n number of reads, where pairs[i] and
 * pairs[i + n] for 0 <= i <= n - 1 form a read pair, concatenate each
 * pair into one fragment and compute its seeds of length [seed_len] via
 * get_seeds_per_fragment ().
 *
 * Output: n entries are appended to [list_seeds]; each entry holds the
 * seeds of one read pair followed, as its last element, by the fragment
 * ID. Fragment IDs are numbered consecutively starting from the size of
 * [list_seeds] on entry.
 */
void generate_seeds (ii64vec_t& list_seeds, const strvec_t& pairs,
		int seed_len) {
	const bool debug = false;

	int num_pairs = pairs.size()/2;
	int fragID = list_seeds.size();

	// every iteration writes only to its own slot, the batch is appended
	// to the caller's list after the parallel loop
	ii64vec_t batch_seeds (num_pairs);

	int debug_cnt = 0; // shared by all threads, only touched when debugging

	#pragma omp parallel for
	for (int i = 0; i < num_pairs; ++ i) {
		std::string frag = pairs[i] + pairs[i + num_pairs];
		if (frag.size()) {
			// the case of the first base selects which spelling of the
			// ambiguous base is rewritten; std::isupper is undefined for
			// negative char values, hence the cast to unsigned char
			if (std::isupper(static_cast<unsigned char>(frag.at(0)))) {
				std::replace (frag.begin(), frag.end(), 'N', 'A');
			} else std::replace (frag.begin(), frag.end(), 'n', 'a');
		}
		get_seeds_per_fragment (batch_seeds[i], frag, seed_len);


		if (debug) { // print out frag and its seeds
			// debug_cnt and std::cout are shared between the threads of
			// the parallel loop, so the debug output is serialized
			#pragma omp critical (generate_seeds_debug)
			{
				std::cout << frag << "\n";
				int tmp_last_seed_len = frag.length() - seed_len * (frag.length()/seed_len);
				std::cout << "fraglen, seed_len, last_seed_len = " << frag.length() << ", "
						<< seed_len << ", " << tmp_last_seed_len << "\n";
				for (auto& x : batch_seeds[i]) {
					std::string d_fwd = xny::ID2Str<int64_t>(x, seed_len);
					std::cout << d_fwd << "\t" << xny::get_rvc_str(d_fwd) << "\n";
				}
				if (tmp_last_seed_len != 0) {
					std::cout << "last str: ";
					std::string d_fwd = xny::ID2Str<int64_t>(batch_seeds[i].back(),
							tmp_last_seed_len);
					std::cout << d_fwd << "\t" << xny::get_rvc_str(d_fwd) <<  "\n";
				}
				debug_cnt ++;
				if (debug_cnt > 2) exit(1);
			}
		}

		// append to the seed list the fragment ID
		batch_seeds[i].push_back(fragID + i);

		if (debug) { // debug print
			#pragma omp critical (generate_seeds_debug)
			{
				std::cout << frag << "\n";
				for (const auto& seed : batch_seeds[i]) {
					std::cout << seed << "  ";
				}
				std::cout << "\n\n";
				++ debug_cnt;
				if (debug_cnt > 50) exit(1);
			}
		}
	}

	list_seeds.insert (list_seeds.end(),
			batch_seeds.begin(), batch_seeds.end());
} // generate_seeds
Example #6
0
 bool patch_line(size_t nline, size_t offs, int value)
 {
   if ( nline >= lines.size() )
     return false;
   qstring &L = lines[nline].line;
   L[offs] = (uchar) value & 0xFF;
   return true;
 }
Example #7
0
 // Sets the line numbers stored in pl_min and pl_max.
 // Called with the default arguments (start == 0 and end == size_t(-1)),
 // the range spans all stored lines: pl_min.n becomes 0 and pl_max.n the
 // index of the last line (0 when there are no lines).
 // With any other arguments, `start` and `end` are stored unchanged.
 void set_minmax(size_t start=0, size_t end=size_t(-1))
 {
   const bool whole_range = start == 0 && end == size_t(-1);
   if ( !whole_range )
   {
     pl_min.n = start;
     pl_max.n = end;
     return;
   }

   const size_t total = lines.size();
   pl_min.n = 0;
   pl_max.n = total == 0 ? 0 : total - 1;
 }
Example #8
0
// Copies tokens_in to the end of tokens_out, gluing every triple
// <non-empty token> "." <non-empty token> into one token such as "3.14".
// The scan runs left to right and a token consumed by a merge is never
// reused, so "1" "." "2" "." "3" yields "1.2" "." "3".
void merge_floats(strvec_t const & tokens_in, strvec_t & tokens_out) {
  ptrdiff_t const total = tokens_in.size();
  ptrdiff_t pos = 0;

  // As long as a full window of three tokens is left, try to merge it.
  while(pos + 2 < total) {
    string const & left = tokens_in[pos];
    string const & mid = tokens_in[pos+1];
    string const & right = tokens_in[pos+2];

    bool const is_float = mid == "." && !left.empty() && !right.empty();
    if(is_float) {
      tokens_out.push_back(left + mid + right);
      pos += 3;
    } else {
      tokens_out.push_back(left);
      pos += 1;
    }
  }

  // Fewer than three tokens remain; they are copied verbatim.
  while(pos < total) {
    tokens_out.push_back(tokens_in[pos]);
    ++pos;
  }
}
Example #9
0
/** Function duplicate_removal ()
 *
 * Input: a list of paired fastq files
 * Output: a list of paired fastq files with duplicate removed. If the
 * num of output files is equal to the input, then dupl removal is applied
 * to each input pair; otherwise, 2 paired fastq output files should be
 * specified, and the output are combined in these two files.
 *
 * Randomly mutate 'N's in each read into 'A'. This will
 * result in reads containing a small number of Ns being considered for
 * clustering whereas reads containing a lot of Ns will not be considered
 * anyway.
 */
void duplicate_removal (const strvec_t& ifqs, const drm_t& drm, int w,
		int w2, xny::low_complexity& lc, int batch, bool silent) {


	// sanity check
	if ((drm.op.size() != ifqs.size()) && (drm.op.size () != 2)) {
		abording ("duplicate_removal ofqs.size() != ifqs.size() and"
				"ofqs.size() != 2");
	}
	// first obtain the length of a read, double it to be fragment length
	std::ifstream fh_tmp;
	xny::openfile<std::ifstream>(fh_tmp, ifqs[0]);
	bio::fastq_input_iterator<> fq(fh_tmp);
	int frag_len = 2 * (std::get<1>(*fq)).length();
	xny::closefile(fh_tmp);

	// calculate the upper bound of mismatches can be tolerated,
	// number of seeds & seed length; max seed length will be bounded by 31
//	int ub_mismatch = std::min (drm.max_mismatch, frag_len * (100 - drm.perc_sim) /100),
	int ub_mismatch = frag_len * (100 - drm.perc_sim) /100,
		//num_seed = ub_mismatch + 1,
		seed_len = std::min(frag_len / (ub_mismatch + 1), 31);
		//num_seed = std::max(num_seed, frag_len/seed_len);

	//std::cout << max_mismatch << ", " << frag_len << ", " << perc_sim << "\n";
	//std::cout << "ub_mismatch = " << ub_mismatch << "\n";

	// process every pair of files
	int num_file_pairs = ifqs.size()/2;

	xny::sketch_list slistgen (w, false);
	xny::super_sketch ssgen (w2);

	std::ofstream ofhfq, ofhfq2;
	if (drm.op.size() == 2) {
		xny::openfile<std::ofstream> (ofhfq, drm.op[0]);
		xny::openfile<std::ofstream> (ofhfq2, drm.op[1]);
	}

	for (int i = 0; i < num_file_pairs; ++ i) {

		int fID = 2*i;

		if (! silent) {
			std::cout << "\tprocess files: " << ifqs[fID] << " and "
					<< ifqs[fID + 1] << "\n\n";
		}

		// --------------- generate seeds for each fragment -------------
		// ----------- a compressed form to represent fragments ---------
		if (! silent) std::cout << "\tgenerate seeds...\n";
		ii64vec_t list_seeds; // stores the list of seeds per fragment,
							  // the last element stores the fragment ID
		get_seeds (list_seeds, ifqs[fID], ifqs[fID + 1], seed_len,
				batch, silent);

		// ---- initialize the global union-find structure ----
		ivec_t uf_clst (list_seeds.size());
		for (unsigned int j = 0; j < list_seeds.size(); ++ j) uf_clst[j] = j;

		// ------------------ clustering via ss -------------------------

		if (! silent) std::cout << "\tclustering via super sketches ...\n";

		clustering_via_ss (uf_clst, list_seeds, ifqs[fID], ifqs[fID + 1],
				batch, slistgen, ssgen, ub_mismatch, silent);

		// --------------- clustering via seeds -----------------
		if (!silent) std::cout << "\tclustering via seeds ...\n";
		clustering_via_seeds (uf_clst, list_seeds, frag_len/seed_len,
				ub_mismatch, silent);


		// -------- generate final union find clusters --------------
		iivec_t clusters;
		uf_generate_cls (clusters, uf_clst);

		// ---- generate the duplicated fragment IDs ----------------
		iset_t duplIDs; // duplicate fragIDs
		int debug_counter = 0;
		for (int j = 0; j < (int) clusters.size(); ++ j) {
			duplIDs.insert(clusters[j].begin() + 1, clusters[j].end());

			/*{ // debug code print out clusters
				if (clusters[j].size() > 100) {
					debug_print_fragments (clusters[j], ifqs[fID], ifqs[fID+1]);
					++ debug_counter;
					if (debug_counter > 10) exit(1);
				}
			}*/
		}
		clusters.clear();

		if (!silent) std::cout << "\n\t\tnum duplicate frags: " << duplIDs.size()
				<< "(" << 100 * duplIDs.size()/ list_seeds.size() << "% total)" << "\n\n";

		if (!silent) std::cout << "\toutput non-redundant read-pairs...\n";

		// ----- output non-redundant read-pairs ---------
		if (drm.op.size() > 2) {

			xny::openfile<std::ofstream> (ofhfq, drm.op[fID]);
			xny::openfile<std::ofstream> (ofhfq2, drm.op[fID + 1]);
		}

		clean_dupl_frag (ifqs[fID], ifqs[fID+1], ofhfq, ofhfq2,
				 duplIDs, lc, batch);

		if (drm.op.size() > 2) {
			xny::closefile(ofhfq);
			xny::closefile(ofhfq2);
		}

	} // for (int i = 0

} // duplicate_removal
Example #10
0
 const size_t count() const
 {
   return lines.size();
 }
Example #11
0
 // Returns a pointer to the stored line number `nline`, or NULL when
 // `nline` does not refer to an existing line.
 simpleline_t *get_line(size_t nline)
 {
   if ( nline < lines.size() )
     return &lines[nline];
   return NULL;
 }