Beispiel #1
0
/* --- .Call ENTRY POINT --- */
SEXP read_XStringSet_from_fastq(SEXP filexp_list, SEXP nrec, SEXP skip,
		SEXP seek_first_rec,
		SEXP use_names, SEXP elementType, SEXP lkup)
{
	int nrec0, skip0, seek_rec0, load_seqids, ans_length, i, recno;
	SEXP filexp, ans_geom, ans_width, ans, ans_names;
	const char *element_type;
	char classname[40];  /* longest string should be "DNAStringSet" */
	FASTQ_loaderExt loader_ext;
	FASTQloader loader;

	nrec0 = INTEGER(nrec)[0];
	skip0 = INTEGER(skip)[0];
	seek_rec0 = LOGICAL(seek_first_rec)[0];
	load_seqids = LOGICAL(use_names)[0];
	PROTECT(ans_geom = fastq_geometry(filexp_list, nrec, skip,
					  seek_first_rec));
	ans_length = INTEGER(ans_geom)[0];
	PROTECT(ans_width = NEW_INTEGER(ans_length));
	if (ans_length != 0) {
		if (INTEGER(ans_geom)[1] == NA_INTEGER) {
			UNPROTECT(2);
			error("read_XStringSet_from_fastq(): FASTQ files with "
			      "variable sequence lengths are not supported yet");
		}
		for (recno = 0; recno < ans_length; recno++)
			INTEGER(ans_width)[recno] = INTEGER(ans_geom)[1];
	}
	element_type = CHAR(STRING_ELT(elementType, 0));
	if (snprintf(classname, sizeof(classname), "%sSet", element_type)
	    >= sizeof(classname))
	{
		UNPROTECT(2);
		error("Biostrings internal error in "
		      "read_XStringSet_from_fastq(): "
		      "'classname' buffer too small");
	}
	PROTECT(ans = alloc_XRawList(classname, element_type, ans_width));
	loader_ext = new_FASTQ_loaderExt(ans, lkup);
	loader = new_FASTQ_loader(load_seqids, &loader_ext);
	recno = 0;
	for (i = 0; i < LENGTH(filexp_list); i++) {
		filexp = VECTOR_ELT(filexp_list, i);
		filexp_rewind(filexp);
		parse_FASTQ_file(filexp, nrec0, skip0, seek_rec0,
				 &loader, &recno);
	}
	if (load_seqids) {
		PROTECT(ans_names =
		    new_CHARACTER_from_CharAEAE(loader_ext.ans_names_buf));
		_set_XStringSet_names(ans, ans_names);
		UNPROTECT(1);
	}
	UNPROTECT(3);
	return ans;
}
Beispiel #2
0
/* --- .Call ENTRY POINT ---
 * Args:
 *   x:   An XStringSetList object.
 *   sep: An XString object of the same seqtype as 'x'.
 * Returns an XStringSet object parallel to and of the same seqtype as 'x'.
 */
SEXP XStringSetList_unstrsplit(SEXP x, SEXP sep, SEXP seqtype)
{
	XStringSetList_holder x_holder;
	XStringSet_holder x_elt_holder, ans_holder;
	Chars_holder sep_holder, ans_elt_holder;
	int x_len, sep_len, i;
	const char *seqtype0;
	char ans_elt_type[37];  /* longest string should be "DNAString" */
	char ans_classname[40];  /* longest string should be "DNAStringSet" */
	SEXP ans, ans_width, ans_names;

	x_holder = _hold_XStringSetList(x);
	x_len = _get_length_from_XStringSetList_holder(&x_holder);
	sep_holder = hold_XRaw(sep);
	sep_len = sep_holder.length;
	seqtype0 = CHAR(STRING_ELT(seqtype, 0));
	if (snprintf(ans_elt_type, sizeof(ans_elt_type),
		     "%sString", seqtype0) >= sizeof(ans_elt_type))
		error("Biostrings internal error in "
		      "XStringSetList_unstrsplit(): "
		      "'ans_elt_type' buffer too small");
	if (snprintf(ans_classname, sizeof(ans_classname),
		     "%sSet", ans_elt_type) >= sizeof(ans_classname))
		error("Biostrings internal error in "
		      "XStringSetList_unstrsplit(): "
		      "'ans_classname' buffer too small");

	/* 1st pass: compute 'ans_width' */
	PROTECT(ans_width = NEW_INTEGER(x_len));
	for (i = 0; i < x_len; i++) {
		x_elt_holder = _get_elt_from_XStringSetList_holder(
					&x_holder, i);
		INTEGER(ans_width)[i] = compute_joined_strings_length(
					&x_elt_holder, sep_len);
	}

	/* Allocate 'ans' */
	PROTECT(ans = alloc_XRawList(ans_classname, ans_elt_type, ans_width));

	/* 2nd pass: fill 'ans' */
	ans_holder = _hold_XStringSet(ans);
	for (i = 0; i < x_len; i++) {
		x_elt_holder = _get_elt_from_XStringSetList_holder(
					&x_holder, i);
		ans_elt_holder = _get_elt_from_XStringSet_holder(
					&ans_holder, i);
		join_strings_in_buf((char *) ans_elt_holder.ptr, &x_elt_holder,
					sep_holder.ptr, sep_holder.length);
	}

	PROTECT(ans_names = duplicate(get_CompressedList_names(x)));
	_set_XStringSet_names(ans, ans_names);
	UNPROTECT(3);
	return ans;
}
/*
 * --- .Call ENTRY POINT ---
 * Return an AAStringSet object.
 */
SEXP DNAStringSet_translate(SEXP x, SEXP base_codes, SEXP lkup, SEXP skipcode)
{
	cachedXStringSet X, Y;
	cachedCharSeq X_elt, Y_elt;
	char skipcode0;
	int ans_length, i, errcode;
	SEXP ans, width, ans_width;
	TwobitEncodingBuffer teb;

	X = _cache_XStringSet(x);
	skipcode0 = (unsigned char) INTEGER(skipcode)[0];
	ans_length = _get_cachedXStringSet_length(&X);
	PROTECT(width = NEW_INTEGER(ans_length));
	for (i = 0; i < ans_length; i++) {
		X_elt = _get_cachedXStringSet_elt(&X, i);
		INTEGER(width)[i] = X_elt.length / 3;
	}
	PROTECT(ans = alloc_XRawList("AAStringSet", "AAString", width));
	Y = _cache_XStringSet(ans);
	teb = _new_TwobitEncodingBuffer(base_codes, 3, 0);
	ans_width = _get_XStringSet_width(ans);
	for (i = 0; i < ans_length; i++) {
		X_elt = _get_cachedXStringSet_elt(&X, i);
		Y_elt = _get_cachedXStringSet_elt(&Y, i);
		errcode = translate(&X_elt, &Y_elt, &teb, lkup, skipcode0);
		if (errcode == -1) {
			UNPROTECT(2);
			if (ans_length == 1)
				error("%s", errmsg_buf);
			else
				error("in 'x[[%d]]': %s", i + 1, errmsg_buf);
		}
		if (errcode == 1) {
			if (ans_length == 1)
				warning("%s", errmsg_buf);
			else
				warning("in 'x[[%d]]': %s", i + 1, errmsg_buf);
		}
		INTEGER(ans_width)[i] = Y_elt.length;
	}
	UNPROTECT(2);
	return ans;
}
Beispiel #4
0
/* --- .Call ENTRY POINT ---
 * "FASTA blocks" are groups of consecutive FASTA records.
 * Args:
 *   seqlength:   Integer vector with 1 element per record to read. The
 *                elements must be placed in the order that the records are
 *                going to be read.
 *   filexp_list: A list of N "File External Pointers" (see src/io_utils.c in
 *                the XVector package) with 1 element per input file. Files are
 *                going to be accessed from first to last in the list.
 *   nrec_list:   A list of N integer vectors (1 element per input file).
 *                Each integer vector has 1 value per FASTA block, which is the
 *                number of records in the block. IMPORTANT: Even if not
 *                required, the blocks in each integer vector should preferably
 *                be placed in the same order as in the file. This ensures that
 *                the calls to filexp_seek() are always moving forward
 *                in the file and are therefore efficient (moving backward on a
 *                compressed file can be *extremely* slow).
 *   offset_list: A list of N numeric vectors (1 element per input file) with
 *                the same shape as 'nrec_list', i.e. each numeric vector has 1
 *                value per FASTA block. This value is the offset of the block
 *                (i.e. the offset of its first record) relative to the start
 *                of the file. Measured in bytes.
 *   elementType: The elementType of the XStringSet to return (its class is
 *                inferred from this).
 *   lkup:        Lookup table for encoding the incoming sequence bytes.
 */
SEXP read_XStringSet_from_fasta_blocks(SEXP seqlength,
		SEXP filexp_list, SEXP nrec_list, SEXP offset_list,
		SEXP elementType, SEXP lkup)
{
	const char *element_type;
	char classname[40];  /* longest string should be "DNAStringSet" */
	SEXP ans, filexp, nrec, offset;
	FASTA_loaderExt loader_ext;
	FASTAloader loader;
	int i, j, nrec_j, recno;
	long long int offset_j, ninvalid;

	element_type = CHAR(STRING_ELT(elementType, 0));
	if (snprintf(classname, sizeof(classname), "%sSet", element_type)
	    >= sizeof(classname))
	{
		error("Biostrings internal error in "
		      "read_XStringSet_from_fasta_blocks(): "
		      "'classname' buffer too small");
	}
	PROTECT(ans = alloc_XRawList(classname, element_type, seqlength));
	loader_ext = new_FASTA_loaderExt(ans);
	loader = new_FASTA_loader(lkup, &loader_ext);
	for (i = 0; i < LENGTH(filexp_list); i++) {
		filexp = VECTOR_ELT(filexp_list, i);
		nrec = VECTOR_ELT(nrec_list, i);
		offset = VECTOR_ELT(offset_list, i);
		for (j = 0; j < LENGTH(nrec); j++) {
			nrec_j = INTEGER(nrec)[j];
			offset_j = llround(REAL(offset)[j]);
			filexp_seek(filexp, offset_j, SEEK_SET);
			recno = 0;
			ninvalid = 0LL;
			parse_FASTA_file(filexp, nrec_j, 0, 0,
					 &loader, &recno, &offset_j, &ninvalid);
		}
	}
	UNPROTECT(1);
	return ans;
}
SEXP extract_transcripts(SEXP classname, SEXP x,
		SEXP exonStarts, SEXP exonEnds, SEXP strand,
		SEXP decreasing_rank_on_minus_strand, SEXP lkup)
{
	cachedCharSeq X, Y;
	SEXP ans_width, ans, starts, ends;
	cachedXVectorList cached_ans;
	int decreasing_rank_on_minus_strand0, ans_length,
	    i, on_minus_strand;

	X = cache_XRaw(x);
	decreasing_rank_on_minus_strand0 =
		LOGICAL(decreasing_rank_on_minus_strand)[0];
	PROTECT(ans_width = mk_transcript_widths(exonStarts,
					exonEnds, X.length));
	PROTECT(ans = alloc_XRawList(CHAR(STRING_ELT(classname, 0)),
					get_classname(x), ans_width));
	cached_ans = cache_XVectorList(ans);
	ans_length = get_cachedXVectorList_length(&cached_ans);
	for (i = 0; i < ans_length; i++) {
		starts = VECTOR_ELT(exonStarts, i);
		if (starts == R_NilValue || LENGTH(starts) == 0)
			continue;
		ends = VECTOR_ELT(exonEnds, i);
		on_minus_strand = strand_is_minus(strand, i);
		if (on_minus_strand == -1) {
			UNPROTECT(2);
			error("%s", errmsg_buf);
		}
		Y = get_cachedXRawList_elt(&cached_ans, i);
		/* Y.seq is a const char * so we need to cast it to
		   char * before we can write to it */
		copy_transcript((char *) Y.seq, &X,
			starts, ends,
			on_minus_strand, decreasing_rank_on_minus_strand0,
			lkup);
	}
	UNPROTECT(2);
	return ans;
}