/* Return the number of files of the encoded sequence wrapped by <sc>.
   <sc> must cast successfully to a GtEncseqCol (asserted). */
static GtUword gt_encseq_col_num_of_files(const GtSeqCol *sc)
{
  const GtEncseqCol *esc = gt_encseq_col_cast(sc);

  gt_assert(esc);
  return gt_encseq_num_of_files(esc->encseq);
}
/* Allocate and initialise a bottom-up traversal state for <encseq>.

   If <gd_info> is non-NULL, its unit_info and shulensums pointers are stored
   in the new state as-is (no copy is made here). If <gd_info> is NULL, a new
   unit info object is created via gt_shu_unit_info_new() and a new
   shulength table via shulengthdist_new(), the latter sized by the number
   of files in <encseq>.

   Returns the newly allocated state. */
GtBUstate_shulen *gt_sfx_multiesashulengthdist_new(const GtEncseq *encseq,
                                                   GenomediffInfo *gd_info)
{
  GtBUstate_shulen *bustate = gt_malloc(sizeof (*bustate));

  bustate->encseq = encseq;
  bustate->previousbucketlastsuffix = ULONG_MAX;
  bustate->idxoffset = 0;
  bustate->firstedgefromroot = false;
#ifdef SHUDEBUG
  bustate->nextid = 0;
#endif

  /* unit info: borrowed from gd_info when available, else built here */
  if (gd_info != NULL)
  {
    bustate->unit_info = gd_info->unit_info;
  } else
  {
    bustate->unit_info = gt_shu_unit_info_new(encseq);
  }

  bustate->numofdbfiles = gt_encseq_num_of_files(encseq);
#ifdef GENOMEDIFF_PAPER_IMPL
  /* one leafdist entry per file */
  bustate->leafdist = gt_malloc(sizeof (*bustate->leafdist) *
                                bustate->numofdbfiles);
#endif
  bustate->file_to_genome_map = bustate->unit_info->map_files;

  /* result table: borrowed from gd_info when available, else built here */
  if (gd_info != NULL)
  {
    bustate->shulengthdist = gd_info->shulensums;
  } else
  {
    bustate->shulengthdist = shulengthdist_new(bustate->numofdbfiles);
  }

  bustate->stack = (void *) gt_GtArrayGtBUItvinfo_new_shulen();
  return bustate;
}
/* Run the bottom-up shulen traversal over <ssar> with a temporary state
   built for <encseq> and, if the traversal succeeds, print the resulting
   shulength table via shulengthdist_print().

   Returns 0 on success and -1 if gt_esa_bottomup_shulen() reported a
   failure (it is handed <err>). The temporary state and its table are
   released in both cases.

   NOTE(review): only numofdbfiles, encseq and shulengthdist (plus the
   #ifdef'd members) are assigned here; presumably every other member of the
   state is left uninitialised by gt_malloc() -- confirm that
   gt_esa_bottomup_shulen() does not read them. */
int gt_multiesa2shulengthdist_print(Sequentialsuffixarrayreader *ssar,
                                    const GtEncseq *encseq,
                                    GtError *err)
{
  GtBUstate_shulen *bu = gt_malloc(sizeof (*bu));
  int status = 0;

  bu->numofdbfiles = gt_encseq_num_of_files(encseq);
  bu->encseq = encseq;
#ifdef GENOMEDIFF_PAPER_IMPL
  bu->leafdist = gt_malloc(sizeof (*bu->leafdist) * bu->numofdbfiles);
#endif
#ifdef SHUDEBUG
  bu->nextid = 0;
#endif
  bu->shulengthdist = shulengthdist_new(bu->numofdbfiles);

  if (gt_esa_bottomup_shulen(ssar, bu, err) != 0)
  {
    status = -1;
  }
  if (status == 0)
  {
    shulengthdist_print(NULL,
                        (const uint64_t * const*) bu->shulengthdist,
                        bu->numofdbfiles);
  }

  gt_array2dim_delete(bu->shulengthdist);
#ifdef GENOMEDIFF_PAPER_IMPL
  gt_free(bu->leafdist);
#endif
  gt_free(bu);
  return status;
}
/* Lua binding: takes an encseq as argument 1 and pushes its number of
   files. Returns 1, the count of values left on the Lua stack. */
static int encseq_lua_num_of_files(lua_State *L)
{
  GtEncseq **encseq = check_encseq(L, 1);

  lua_pushnumber(L, gt_encseq_num_of_files(*encseq));
  return 1;
}
/* Return the length of sequence <seqnum> within file <filenum> of the
   encoded sequence wrapped by <sc>. <filenum> must be smaller than the
   number of files (asserted); <seqnum> is relative to the first sequence
   of that file. */
static GtUword gt_encseq_col_get_sequence_length(const GtSeqCol *sc,
                                                 GtUword filenum,
                                                 GtUword seqnum)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword first_in_file;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* translate the per-file sequence number into an encseq-wide one */
  first_in_file = gt_encseq_filenum_first_seqnum(esc->encseq, filenum);
  return gt_encseq_seqlength(esc->encseq, first_in_file + seqnum);
}
/* Look up the MD5 fingerprint of sequence <seqnum> within file <filenum>
   in the collection's md5_tab. <filenum> must be smaller than the number of
   files (asserted); <seqnum> is relative to the first sequence of that
   file. */
static const char* gt_encseq_col_get_md5_fingerprint(const GtSeqCol *sc,
                                                     GtUword filenum,
                                                     GtUword seqnum)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword first_in_file;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* translate the per-file sequence number into an encseq-wide one */
  first_in_file = gt_encseq_filenum_first_seqnum(esc->encseq, filenum);
  return gt_md5_tab_get(esc->md5_tab, first_in_file + seqnum);
}
/* Return the number of sequences contained in file <filenum> of the encoded
   sequence wrapped by <sc>. <filenum> must be smaller than the number of
   files (asserted).

   The count is derived from the first-sequence numbers of consecutive
   files; for the last file the total number of sequences serves as the
   upper bound. */
static GtUword gt_encseq_col_num_of_seqs(const GtSeqCol *sc, GtUword filenum)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword numfiles, this_first, next_first;

  /* XXX cache function evaluated values */
  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  numfiles = gt_encseq_num_of_files(esc->encseq);

  /* single-file collection: every sequence belongs to file 0 */
  if (numfiles == 1 && filenum == 0)
    return gt_encseq_num_of_sequences(esc->encseq);

  /* last file: it extends up to the final sequence */
  if (filenum == numfiles - 1) {
    return (gt_encseq_num_of_sequences(esc->encseq) -
            gt_encseq_filenum_first_seqnum(esc->encseq, filenum));
  }

  /* inner file: bounded by the first sequence of the following file */
  gt_assert(filenum < numfiles - 1);
  this_first = gt_encseq_filenum_first_seqnum(esc->encseq, filenum);
  next_first = gt_encseq_filenum_first_seqnum(esc->encseq, filenum + 1);
  return next_first - this_first;
}
/* Lua binding: takes an encseq (argument 1) and a zero-based file number
   (argument 2) and pushes the start position of that file.

   The file number arrives as a Lua number, i.e. a floating-point value
   supplied by the script. It is range-checked BEFORE being converted to the
   unsigned GtUword: converting a negative, NaN or too-large floating-point
   value to an unsigned integer is undefined behaviour in C, so the check
   must not be done on the already-converted value.

   Raises a Lua argument error if the file number is negative (or NaN) or
   not smaller than the number of files. Returns 1, the count of values left
   on the Lua stack. */
static int encseq_lua_filestartpos(lua_State *L)
{
  GtEncseq **encseq;
  double requested;
  GtUword fileno;

  encseq = check_encseq(L, 1);
  requested = luaL_checknumber(L, 2);
  /* NaN fails this comparison as well and is therefore rejected here */
  luaL_argcheck(L, requested >= 0, 2, "must not be negative");
  luaL_argcheck(L, requested < (double) gt_encseq_num_of_files(*encseq), 2,
                "cannot exceed number of files");
  /* safe now: 0 <= requested < number of files; fraction is truncated */
  fileno = (GtUword) requested;
  lua_pushnumber(L, gt_encseq_filestartpos(*encseq, fileno));
  return 1;
}
/* Return a duplicate (made with gt_cstr_dup_nt()) of the description of
   sequence <seqnum> within file <filenum>. <filenum> must be smaller than
   the number of files and the resulting encseq-wide sequence number must be
   valid (both asserted). The description is expected to be present and
   non-empty (asserted).
   NOTE(review): presumably the caller owns the returned string -- confirm
   against the GtSeqCol interface. */
static char* gt_encseq_col_get_description(const GtSeqCol *sc, GtUword filenum,
                                           GtUword seqnum)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  const char *text;
  GtUword text_len, global_seqnum;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* translate the per-file sequence number into an encseq-wide one */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                  + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(esc->encseq));

  text = gt_encseq_description(esc->encseq, &text_len, global_seqnum);
  gt_assert(text && text_len > 0);
  /* duplicate with the explicit length returned alongside the pointer */
  return gt_cstr_dup_nt(text, text_len);
}
/* Return a newly allocated buffer holding the decoded characters of
   sequence <seqnum> within file <filenum>, for the range <start>..<end>
   given relative to the start of that sequence. Requires <start> <= <end>,
   <filenum> smaller than the number of files, and a valid resulting
   encseq-wide sequence number (all asserted).

   The buffer has exactly end - start + 1 bytes.
   NOTE(review): presumably gt_encseq_extract_decoded() treats both bounds
   as inclusive and fills the whole buffer, in which case the result is NOT
   NUL-terminated -- confirm before using it as a C string. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc, GtUword filenum,
                                        GtUword seqnum, GtUword start,
                                        GtUword end)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword global_seqnum, seq_offset, nof_bytes;
  char *buffer;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* translate the per-file sequence number into an encseq-wide one */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                  + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  gt_assert(start <= end);

  seq_offset = gt_encseq_seqstartpos(esc->encseq, global_seqnum);
  nof_bytes = end - start + 1;
  buffer = gt_calloc(nof_bytes, sizeof (char));
  /* shift the sequence-relative range to absolute encseq positions */
  gt_encseq_extract_decoded(esc->encseq, buffer, seq_offset + start,
                            seq_offset + end);
  return buffer;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }