static int encseq_lua_has_description_support(lua_State *L) { GtEncseq **encseq; encseq = check_encseq(L, 1); lua_pushboolean(L, gt_encseq_has_description_support(*encseq)); return 1; }
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq) { GtEncseqReader *esr; unsigned long startpos, len, desclen = 0; const char *desc = NULL; unsigned long i; startpos = gt_encseq_seqstartpos(encseq, seqnum); len = gt_encseq_seqlength(encseq, seqnum); gt_xfputc(GT_FASTA_SEPARATOR, stdout); if (gt_encseq_has_description_support(encseq)) { desc = gt_encseq_description(encseq, &desclen, seqnum); gt_xfwrite(desc, (size_t)1, (size_t)desclen, stdout); } gt_xfputc('\n', stdout); esr = gt_encseq_create_reader_with_readmode(encseq, GT_READMODE_FORWARD, startpos); for (i = 0; i < len; i++) { gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout); } gt_encseq_reader_delete(esr); gt_xfputc('\n', stdout); }
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args, const char *filename, GtError *err) { GtUword i, j, sfrom, sto; int had_err = 0; bool has_desc; GtEncseqReader *esr; gt_assert(encseq); if (!(has_desc = gt_encseq_has_description_support(encseq))) gt_warning("Missing description support for file %s", filename); if (strcmp(gt_str_get(args->mode), "fasta") == 0) { /* specify a single sequence to extract */ if (args->seq != GT_UNDEF_UWORD) { if (args->seq >= gt_encseq_num_of_sequences(encseq)) { gt_error_set(err, "requested sequence "GT_WU" exceeds number of sequences " "("GT_WU")", args->seq, gt_encseq_num_of_sequences(encseq)); return -1; } sfrom = args->seq; sto = args->seq + 1; } else if (args->seqrng.start != GT_UNDEF_UWORD && args->seqrng.end != GT_UNDEF_UWORD) { /* specify a sequence range to extract */ if (args->seqrng.start >= gt_encseq_num_of_sequences(encseq) || args->seqrng.end >= gt_encseq_num_of_sequences(encseq)) { gt_error_set(err, "range "GT_WU"-"GT_WU" includes a sequence number " "exceeding the total number of sequences ("GT_WU")", args->seqrng.start, args->seqrng.end, gt_encseq_num_of_sequences(encseq)); return -1; } sfrom = args->seqrng.start; sto = args->seqrng.end + 1; } else { /* extract all sequences */ sfrom = 0; sto = gt_encseq_num_of_sequences(encseq); } for (i = sfrom; i < sto; i++) { GtUword desclen, startpos, len; char buf[BUFSIZ]; const char *desc = NULL; /* XXX: maybe make this distinction in the functions via readmode? */ if (!GT_ISDIRREVERSE(args->rm)) { startpos = gt_encseq_seqstartpos(encseq, i); len = gt_encseq_seqlength(encseq, i); if (has_desc) { desc = gt_encseq_description(encseq, &desclen, i); } else { (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i); desclen = strlen(buf); desc = buf; } } else { startpos = gt_encseq_seqstartpos(encseq, i); len = gt_encseq_seqlength(encseq, gt_encseq_num_of_sequences(encseq)-1-i); startpos = gt_encseq_total_length(encseq) - (gt_encseq_seqstartpos(encseq, gt_encseq_num_of_sequences( encseq)-1-i) + len); if (has_desc) { desc = gt_encseq_description(encseq, &desclen, gt_encseq_num_of_sequences(encseq)-1-i); } else { (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i); desclen = strlen(buf); desc = buf; } } gt_assert(desc); /* output description */ gt_xfputc(GT_FASTA_SEPARATOR, stdout); gt_xfwrite(desc, 1, desclen, stdout); gt_xfputc('\n', stdout); /* XXX: make this more efficient by writing in a buffer first and then showing the result */ if (args->singlechars) { for (j = 0; j < len; j++) { gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + j, args->rm), stdout); } } else { esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, startpos); for (j = 0; j < len; j++) { gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout); } gt_encseq_reader_delete(esr); } gt_xfputc('\n', stdout); } } if (strcmp(gt_str_get(args->mode), "concat") == 0) { GtUword from = 0, to = gt_encseq_total_length(encseq) - 1; if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) { if (args->rng.end > to) { had_err = -1; gt_error_set(err, "end of range ("GT_WU") exceeds encoded sequence length " "("GT_WU")", args->rng.end, to); } if (!had_err) { from = args->rng.start; to = args->rng.end; } } if (!had_err) { if (args->singlechars) { for (j = from; j <= to; j++) { char cc = gt_encseq_get_decoded_char(encseq, j, args->rm); if (cc == (char) SEPARATOR) cc = gt_str_get(args->sepchar)[0]; gt_xfputc(cc, stdout); } } else { esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from); if (esr) { for (j = from; j <= to; j++) { char cc = gt_encseq_reader_next_decoded_char(esr); if (cc == (char) SEPARATOR) cc = gt_str_get(args->sepchar)[0]; gt_xfputc(cc, stdout); } gt_encseq_reader_delete(esr); } } gt_xfputc('\n', stdout); } } return had_err; }
static int gt_seqorder_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtSeqorderArguments *arguments = tool_arguments; int had_err = 0; GtEncseq *encseq; GtEncseqLoader *loader; unsigned long i, nofseqs; gt_error_check(err); gt_assert(arguments != NULL); /* load encseq */ loader = gt_encseq_loader_new(); encseq = gt_encseq_loader_load(loader, argv[parsed_args], err); if (encseq == NULL) had_err = -1; if (had_err == 0 && !gt_encseq_has_description_support(encseq)) gt_warning("%s has no description support", argv[parsed_args]); if (!had_err) { nofseqs = gt_encseq_num_of_sequences(encseq); if (arguments->invert) { for (i = nofseqs; i > 0; i--) gt_seqorder_output(i - 1, encseq); } else if (arguments->shuffle) { unsigned long *seqnums; seqnums = gt_malloc(sizeof (unsigned long) * nofseqs); gt_seqorder_get_shuffled_seqnums(nofseqs, seqnums); for (i = 0; i < nofseqs; i++) gt_seqorder_output(seqnums[i], encseq); gt_free(seqnums); } else { GtSuffixsortspace *suffixsortspace; gt_assert(arguments->sort || arguments->revsort); suffixsortspace = gt_suffixsortspace_new(nofseqs, /* Use iterator over sequence separators: saves a lot of binary searches */ gt_encseq_seqstartpos(encseq, nofseqs-1), false,NULL); gt_seqorder_sort(suffixsortspace, encseq); if (arguments->sort) for (i = 0; i < nofseqs; i++) gt_seqorder_output(gt_encseq_seqnum(encseq, gt_suffixsortspace_getdirect(suffixsortspace, i)), encseq); else for (i = nofseqs; i > 0; i--) gt_seqorder_output(gt_encseq_seqnum(encseq, gt_suffixsortspace_getdirect(suffixsortspace, i - 1)), encseq); gt_suffixsortspace_delete(suffixsortspace, false); } } gt_encseq_loader_delete(loader); gt_encseq_delete(encseq); return had_err; }
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }