/* --- .Call ENTRY POINT --- */ SEXP read_XStringSet_from_fastq(SEXP filexp_list, SEXP nrec, SEXP skip, SEXP seek_first_rec, SEXP use_names, SEXP elementType, SEXP lkup) { int nrec0, skip0, seek_rec0, load_seqids, ans_length, i, recno; SEXP filexp, ans_geom, ans_width, ans, ans_names; const char *element_type; char classname[40]; /* longest string should be "DNAStringSet" */ FASTQ_loaderExt loader_ext; FASTQloader loader; nrec0 = INTEGER(nrec)[0]; skip0 = INTEGER(skip)[0]; seek_rec0 = LOGICAL(seek_first_rec)[0]; load_seqids = LOGICAL(use_names)[0]; PROTECT(ans_geom = fastq_geometry(filexp_list, nrec, skip, seek_first_rec)); ans_length = INTEGER(ans_geom)[0]; PROTECT(ans_width = NEW_INTEGER(ans_length)); if (ans_length != 0) { if (INTEGER(ans_geom)[1] == NA_INTEGER) { UNPROTECT(2); error("read_XStringSet_from_fastq(): FASTQ files with " "variable sequence lengths are not supported yet"); } for (recno = 0; recno < ans_length; recno++) INTEGER(ans_width)[recno] = INTEGER(ans_geom)[1]; } element_type = CHAR(STRING_ELT(elementType, 0)); if (snprintf(classname, sizeof(classname), "%sSet", element_type) >= sizeof(classname)) { UNPROTECT(2); error("Biostrings internal error in " "read_XStringSet_from_fastq(): " "'classname' buffer too small"); } PROTECT(ans = alloc_XRawList(classname, element_type, ans_width)); loader_ext = new_FASTQ_loaderExt(ans, lkup); loader = new_FASTQ_loader(load_seqids, &loader_ext); recno = 0; for (i = 0; i < LENGTH(filexp_list); i++) { filexp = VECTOR_ELT(filexp_list, i); filexp_rewind(filexp); parse_FASTQ_file(filexp, nrec0, skip0, seek_rec0, &loader, &recno); } if (load_seqids) { PROTECT(ans_names = new_CHARACTER_from_CharAEAE(loader_ext.ans_names_buf)); _set_XStringSet_names(ans, ans_names); UNPROTECT(1); } UNPROTECT(3); return ans; }
/* --- .Call ENTRY POINT --- * Args: * x: An XStringSetList object. * sep: An XString object of the same seqtype as 'x'. * Returns an XStringSet object parallel to and of the same seqtype as 'x'. */ SEXP XStringSetList_unstrsplit(SEXP x, SEXP sep, SEXP seqtype) { XStringSetList_holder x_holder; XStringSet_holder x_elt_holder, ans_holder; Chars_holder sep_holder, ans_elt_holder; int x_len, sep_len, i; const char *seqtype0; char ans_elt_type[37]; /* longest string should be "DNAString" */ char ans_classname[40]; /* longest string should be "DNAStringSet" */ SEXP ans, ans_width, ans_names; x_holder = _hold_XStringSetList(x); x_len = _get_length_from_XStringSetList_holder(&x_holder); sep_holder = hold_XRaw(sep); sep_len = sep_holder.length; seqtype0 = CHAR(STRING_ELT(seqtype, 0)); if (snprintf(ans_elt_type, sizeof(ans_elt_type), "%sString", seqtype0) >= sizeof(ans_elt_type)) error("Biostrings internal error in " "XStringSetList_unstrsplit(): " "'ans_elt_type' buffer too small"); if (snprintf(ans_classname, sizeof(ans_classname), "%sSet", ans_elt_type) >= sizeof(ans_classname)) error("Biostrings internal error in " "XStringSetList_unstrsplit(): " "'ans_classname' buffer too small"); /* 1st pass: compute 'ans_width' */ PROTECT(ans_width = NEW_INTEGER(x_len)); for (i = 0; i < x_len; i++) { x_elt_holder = _get_elt_from_XStringSetList_holder( &x_holder, i); INTEGER(ans_width)[i] = compute_joined_strings_length( &x_elt_holder, sep_len); } /* Allocate 'ans' */ PROTECT(ans = alloc_XRawList(ans_classname, ans_elt_type, ans_width)); /* 2nd pass: fill 'ans' */ ans_holder = _hold_XStringSet(ans); for (i = 0; i < x_len; i++) { x_elt_holder = _get_elt_from_XStringSetList_holder( &x_holder, i); ans_elt_holder = _get_elt_from_XStringSet_holder( &ans_holder, i); join_strings_in_buf((char *) ans_elt_holder.ptr, &x_elt_holder, sep_holder.ptr, sep_holder.length); } PROTECT(ans_names = duplicate(get_CompressedList_names(x))); _set_XStringSet_names(ans, ans_names); UNPROTECT(3); return ans; }
/* * --- .Call ENTRY POINT --- * Return an AAStringSet object. */ SEXP DNAStringSet_translate(SEXP x, SEXP base_codes, SEXP lkup, SEXP skipcode) { cachedXStringSet X, Y; cachedCharSeq X_elt, Y_elt; char skipcode0; int ans_length, i, errcode; SEXP ans, width, ans_width; TwobitEncodingBuffer teb; X = _cache_XStringSet(x); skipcode0 = (unsigned char) INTEGER(skipcode)[0]; ans_length = _get_cachedXStringSet_length(&X); PROTECT(width = NEW_INTEGER(ans_length)); for (i = 0; i < ans_length; i++) { X_elt = _get_cachedXStringSet_elt(&X, i); INTEGER(width)[i] = X_elt.length / 3; } PROTECT(ans = alloc_XRawList("AAStringSet", "AAString", width)); Y = _cache_XStringSet(ans); teb = _new_TwobitEncodingBuffer(base_codes, 3, 0); ans_width = _get_XStringSet_width(ans); for (i = 0; i < ans_length; i++) { X_elt = _get_cachedXStringSet_elt(&X, i); Y_elt = _get_cachedXStringSet_elt(&Y, i); errcode = translate(&X_elt, &Y_elt, &teb, lkup, skipcode0); if (errcode == -1) { UNPROTECT(2); if (ans_length == 1) error("%s", errmsg_buf); else error("in 'x[[%d]]': %s", i + 1, errmsg_buf); } if (errcode == 1) { if (ans_length == 1) warning("%s", errmsg_buf); else warning("in 'x[[%d]]': %s", i + 1, errmsg_buf); } INTEGER(ans_width)[i] = Y_elt.length; } UNPROTECT(2); return ans; }
/* --- .Call ENTRY POINT --- * "FASTA blocks" are groups of consecutive FASTA records. * Args: * seqlength: Integer vector with 1 element per record to read. The * elements must be placed in the order that the records are * going to be read. * filexp_list: A list of N "File External Pointers" (see src/io_utils.c in * the XVector package) with 1 element per input file. Files are * going to be accessed from first to last in the list. * nrec_list: A list of N integer vectors (1 element per input file). * Each integer vector has 1 value per FASTA block, which is the * number of records in the block. IMPORTANT: Even if not * required, the blocks in each integer vector should preferably * be placed in the same order as in the file. This ensures that * the calls to filexp_seek() are always moving forward * in the file and are therefore efficient (moving backward on a * compressed file can be *extremely* slow). * offset_list: A list of N numeric vectors (1 element per input file) with * the same shape as 'nrec_list', i.e. each numeric vector has 1 * value per FASTA block. This value is the offset of the block * (i.e. the offset of its first record) relative to the start * of the file. Measured in bytes. * elementType: The elementType of the XStringSet to return (its class is * inferred from this). * lkup: Lookup table for encoding the incoming sequence bytes. */ SEXP read_XStringSet_from_fasta_blocks(SEXP seqlength, SEXP filexp_list, SEXP nrec_list, SEXP offset_list, SEXP elementType, SEXP lkup) { const char *element_type; char classname[40]; /* longest string should be "DNAStringSet" */ SEXP ans, filexp, nrec, offset; FASTA_loaderExt loader_ext; FASTAloader loader; int i, j, nrec_j, recno; long long int offset_j, ninvalid; element_type = CHAR(STRING_ELT(elementType, 0)); if (snprintf(classname, sizeof(classname), "%sSet", element_type) >= sizeof(classname)) { error("Biostrings internal error in " "read_XStringSet_from_fasta_blocks(): " "'classname' buffer too small"); } PROTECT(ans = alloc_XRawList(classname, element_type, seqlength)); loader_ext = new_FASTA_loaderExt(ans); loader = new_FASTA_loader(lkup, &loader_ext); for (i = 0; i < LENGTH(filexp_list); i++) { filexp = VECTOR_ELT(filexp_list, i); nrec = VECTOR_ELT(nrec_list, i); offset = VECTOR_ELT(offset_list, i); for (j = 0; j < LENGTH(nrec); j++) { nrec_j = INTEGER(nrec)[j]; offset_j = llround(REAL(offset)[j]); filexp_seek(filexp, offset_j, SEEK_SET); recno = 0; ninvalid = 0LL; parse_FASTA_file(filexp, nrec_j, 0, 0, &loader, &recno, &offset_j, &ninvalid); } } UNPROTECT(1); return ans; }
SEXP extract_transcripts(SEXP classname, SEXP x, SEXP exonStarts, SEXP exonEnds, SEXP strand, SEXP decreasing_rank_on_minus_strand, SEXP lkup) { cachedCharSeq X, Y; SEXP ans_width, ans, starts, ends; cachedXVectorList cached_ans; int decreasing_rank_on_minus_strand0, ans_length, i, on_minus_strand; X = cache_XRaw(x); decreasing_rank_on_minus_strand0 = LOGICAL(decreasing_rank_on_minus_strand)[0]; PROTECT(ans_width = mk_transcript_widths(exonStarts, exonEnds, X.length)); PROTECT(ans = alloc_XRawList(CHAR(STRING_ELT(classname, 0)), get_classname(x), ans_width)); cached_ans = cache_XVectorList(ans); ans_length = get_cachedXVectorList_length(&cached_ans); for (i = 0; i < ans_length; i++) { starts = VECTOR_ELT(exonStarts, i); if (starts == R_NilValue || LENGTH(starts) == 0) continue; ends = VECTOR_ELT(exonEnds, i); on_minus_strand = strand_is_minus(strand, i); if (on_minus_strand == -1) { UNPROTECT(2); error("%s", errmsg_buf); } Y = get_cachedXRawList_elt(&cached_ans, i); /* Y.seq is a const char * so we need to cast it to char * before we can write to it */ copy_transcript((char *) Y.seq, &X, starts, ends, on_minus_strand, decreasing_rank_on_minus_strand0, lkup); } UNPROTECT(2); return ans; }