/* Parse a contained-reads list in text format from <infp>.

   The stream must start with a header of the form "[n: <number>]", whose
   number is stored in <*nofreads> and must be non-zero. It is followed by
   read numbers, one per line, up to end of file. For every read number the
   corresponding bit is set in <*cntlist>.

   If <alloc_cntlist> is true, <*cntlist> is first set up with
   GT_INITBITTAB for <*nofreads> bits; otherwise the bits are added to the
   table the caller passed in (assumed to hold at least <*nofreads> bits --
   this cannot be verified here).

   Read numbers not smaller than <*nofreads> are rejected, so that a
   malformed file cannot cause a write outside the bit table.

   Returns 0 on success; on a format error returns -1 and sets <err>. */
static int gt_cntlist_parse_ascii(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int n;
  GtUword seqnum;

  gt_assert(infp != NULL && nofreads != NULL && cntlist != NULL);
  /*@i1@*/ gt_error_check(err);
  n = fscanf(infp, "[n: "GT_WU"]\n", nofreads);
  if (n != 1 || *nofreads == 0)
  {
    gt_error_set(err, "contained reads file: unrecognized format");
    return -1;
  }
  if (alloc_cntlist)
  {
    GT_INITBITTAB(*cntlist, *nofreads);
  }
  while (true)
  {
    n = fscanf(infp, ""GT_WU"\n", &seqnum);
    if (n == EOF)
      break;
    /* a non-numeric token or a read number beyond the announced count both
       mean that the file is not a valid contained reads list */
    if (n != 1 || seqnum >= *nofreads)
    {
      gt_error_set(err, "contained reads file: unrecognized format");
      return -1;
    }
    GT_SETIBIT(*cntlist, seqnum);
  }
  return 0;
}
/* Parse a contained-reads list in binary format from <infp>.

   The header is handled by gt_cntlist_parse_bin_or_bit_header(), which is
   expected to store the number of reads in <*nofreads>. The rest of the
   stream is read as a sequence of raw GtUword read numbers up to end of
   file; for each of them the corresponding bit is set in <*cntlist>.

   If <alloc_cntlist> is true, <*cntlist> is first set up with
   GT_INITBITTAB for <*nofreads> bits; otherwise the bits are added to the
   table the caller passed in (assumed to hold at least <*nofreads> bits --
   this cannot be verified here).

   Read numbers not smaller than <*nofreads> are rejected, so that a
   malformed file cannot cause a write outside the bit table.

   Returns 0 on success; on error returns a non-zero value and <err> is
   set. */
static int gt_cntlist_parse_bin(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int had_err = gt_cntlist_parse_bin_or_bit_header(infp, nofreads, err);

  if (had_err == 0)
  {
    size_t n;
    GtUword seqnum;

    gt_assert(cntlist != NULL);
    if (alloc_cntlist)
    {
      GT_INITBITTAB(*cntlist, *nofreads);
    }
    while (true)
    {
      n = fread(&seqnum, sizeof (GtUword), (size_t) 1, infp);
      if (n != (size_t) 1)
      {
        /* a short read is only acceptable at end of file */
        if (!feof(infp))
        {
          gt_error_set(err, "contained reads file: unrecognized format");
          had_err = -1;
        }
        break;
      }
      if (seqnum >= *nofreads)
      {
        /* read number beyond the announced count: refuse to index the
           bit table with it */
        gt_error_set(err, "contained reads file: unrecognized format");
        had_err = -1;
        break;
      }
      GT_SETIBIT(*cntlist, seqnum);
    }
  }
  return had_err;
}
/* Prepare <storematch> for use with <encseq>: remember the encoded sequence
   and set up the hasmatch bit table with GT_INITBITTAB, sized by the number
   of sequences in <encseq>. */
void gt_initstorematch(Storematchinfo *storematch, const GtEncseq *encseq)
{
  unsigned long nofseqs;

  storematch->encseq = encseq;
  nofseqs = gt_encseq_num_of_sequences(encseq);
  GT_INITBITTAB(storematch->hasmatch, nofseqs);
}
/* prepare sspbittab and determine length of shortest sequence

   A bit table with <totallength> + 1 bits is set up in <state->sspbittab>.
   For each sequence i >= 1 the bit at position seqstartpos(i) - 1 is set,
   and finally the bit at position <totallength>. While walking over these
   positions, the distance between consecutive ones is measured and the
   minimum is stored in <state->shortest> (initialised to <totallength>). */
static void prepare_sspbittab_and_shortest(unsigned long totallength,
                                           ContfindBUstate *state)
{
  unsigned long length, lastseqstart = 0, i, ssp;

  GT_INITBITTAB(state->sspbittab, totallength + 1);
  state->shortest = totallength;
  /* "i < nofsequences" instead of "i <= nofsequences - 1": identical for
     any non-zero count, but does not wrap around if the count is 0 */
  for (i = 1UL; i < state->nofsequences; i++)
  {
    ssp = gt_encseq_seqstartpos(state->encseq, i) - 1;
    GT_SETIBIT(state->sspbittab, ssp);
    length = ssp - lastseqstart;
    lastseqstart = ssp + 1;
    if (length < state->shortest)
      state->shortest = length;
  }
  /* the last sequence ends at totallength */
  GT_SETIBIT(state->sspbittab, totallength);
  length = totallength - lastseqstart;
  if (length < state->shortest)
    state->shortest = length;
}
/* Parse a contained-reads list stored as a raw bit table from <infp>.

   The header is handled by gt_cntlist_parse_bin_or_bit_header(), which is
   expected to store the number of reads in <*nofreads>. The payload
   consists of GT_NUMOFINTSFORBITS(<*nofreads>) GtBitsequence words.

   If <alloc_cntlist> is true, <*cntlist> is set up with GT_INITBITTAB and
   the payload is read directly into it. Otherwise each word of the payload
   is OR-ed into the corresponding word of the existing table <*cntlist>
   (assumed to hold at least that many words -- this cannot be verified
   here).

   Returns 0 on success; on error (including a truncated payload) returns a
   non-zero value and <err> is set. */
static int gt_cntlist_parse_bit(FILE *infp, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  int had_err = gt_cntlist_parse_bin_or_bit_header(infp, nofreads, err);

  if (had_err == 0)
  {
    size_t n;
    const size_t nofwords = GT_NUMOFINTSFORBITS(*nofreads);

    gt_assert(cntlist != NULL);
    if (alloc_cntlist)
    {
      GT_INITBITTAB(*cntlist, *nofreads);
      n = fread(*cntlist, sizeof (GtBitsequence), nofwords, infp);
      if (n != nofwords)
      {
        gt_error_set(err, "contained reads file: unrecognized format");
        had_err = -1;
      }
    }
    else
    {
      /* combine using OR with existing data */
      size_t i;

      for (i = 0; i < nofwords; i++)
      {
        GtBitsequence value;

        n = fread(&value, sizeof (GtBitsequence), (size_t) 1, infp);
        if (n != (size_t) 1)
        {
          gt_error_set(err, "contained reads file: unrecognized format");
          had_err = -1;
          break;
        }
        /* parentheses are required: [] binds tighter than unary *, so
           "*cntlist[i]" would index the pointer-to-pointer instead of the
           bit table it points to */
        (*cntlist)[i] |= value;
      }
    }
  }
  return had_err;
}
static inline void rdj_pairwise_generic(bool use_dp, GtOvlfindMode m, GtEncseq *encseq, bool revcompl, bool show_progressbar, bool use_kmp, double max_error, GtUword min_length, bool find_nonmaximal, GtSpmproc proc, GtSpmprocA proc_a, void* procdata, bool cntfilter, GtBitsequence *cntreads_in, GtBitsequence **cntreads_out, GtUword *nofreads) { GtContfind containment_status; GtBitsequence *cntreads = NULL; GtUint64 progress = 0; GtUword i, j, startpos, v_seqnum, nofsequences, n; struct Read u, v; struct Data d; gt_kmp_t** kmp_values = NULL; GT_RDJ_PAIRWISE_INIT_STRUCT_DATA(d, proc, proc_a, procdata, &u, &v, 0); gt_assert(encseq != NULL); d.mode = m; if ((m == GT_OVLFIND_ALL) && cntfilter) d.mode = GT_OVLFIND_PROPER_SPM; n = gt_encseq_num_of_sequences(encseq); if (use_kmp) kmp_values = prepare_kmp_values(encseq, n); nofsequences = n; if (revcompl) n = n >> 1; if (cntreads_in != NULL) cntreads = cntreads_in; else if (m != GT_OVLFIND_SPM) GT_INITBITTAB(cntreads, n); if (show_progressbar) gt_progressbar_start(&progress, (GtUint64)n * ((GtUint64)n - 1ULL) / 2ULL); for (i = 0; i < n; i++) { u.seqnum = i; u.direct = true; u.len = gt_encseq_seqlength(encseq, i); u.seq = gt_malloc(sizeof (char) * (u.len + 1)); startpos = gt_encseq_seqstartpos(encseq, i); gt_encseq_extract_decoded(encseq, u.seq, startpos, startpos + u.len - 1); u.seq[u.len] = '\0'; if (use_kmp) { gt_assert(kmp_values != NULL); u.pi = kmp_values[i]; } for (j = i; j < n; j++) { if (cntfilter) { gt_assert(cntreads != NULL); if ((bool)GT_ISIBITSET(cntreads, i)) break; if ((bool)GT_ISIBITSET(cntreads, j)) continue; } v.seqnum = j; /* find overlaps using direct v */ v.direct = true; v.len = gt_encseq_seqlength(encseq, j); v.seq = gt_malloc(sizeof (char) * (v.len + 1)); startpos = gt_encseq_seqstartpos(encseq, j); gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1); v.seq[v.len] = '\0'; if (use_kmp) { gt_assert(kmp_values != NULL); v.pi = kmp_values[j]; } containment_status = use_dp ? 
find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal); if (m != GT_OVLFIND_SPM) mark_contained(containment_status, u.seqnum, v.seqnum, cntreads); /* find overlaps using reverse complement of v */ if (revcompl) { v_seqnum = nofsequences - j - 1; v.direct = false; gt_assert(gt_encseq_seqlength(encseq, j) == gt_encseq_seqlength(encseq, v_seqnum)); startpos = gt_encseq_seqstartpos(encseq, v_seqnum); gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1); if (use_kmp) { gt_assert(kmp_values != NULL); v.pi = kmp_values[v_seqnum]; } containment_status = use_dp ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal); if (m != GT_OVLFIND_SPM) mark_contained(containment_status, u.seqnum, v.seqnum, cntreads); } gt_free(v.seq); progress++; } gt_free(u.seq); } if (cntreads_out != NULL) *cntreads_out = cntreads; else if (cntreads_in == NULL) gt_free(cntreads); if (nofreads != NULL) *nofreads = n; if (use_kmp) free_kmp_values(kmp_values, revcompl ? n << 1 : n); if (show_progressbar) gt_progressbar_stop(); }
/* Runner of the cnttest tool. Depending on <arguments->test>, a bit table
   of contained reads is obtained in one of three ways and then printed
   with gt_cntlist_show():

   - SHOWLIST: parsed from the file <readset> + GT_READJOINER_SUFFIX_CNTLIST
     using gt_cntlist_parse();
   - BRUTEFORCE / KMP: computed by gt_rdj_pairwise_exact() on the encoded
     sequence <readset> (mirrored unless <singlestrand> is set);
   - ESA: computed by gt_contfind_bottomup() on the suffix array <readset>.

   Returns 0 on success; if an earlier step failed, -1 is returned and the
   table is not shown. */
static int gt_readjoiner_cnttest_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GT_UNUSED GtError *err)
{
  GtReadjoinerCnttestArguments *arguments = tool_arguments;
  GtBitsequence *bits = NULL;
  GtUword nofreads;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->test == GT_READJOINER_CNTTEST_SHOWLIST)
  {
    GtStr *cntlist_fn = gt_str_clone(arguments->readset);

    gt_str_append_cstr(cntlist_fn, GT_READJOINER_SUFFIX_CNTLIST);
    had_err = gt_cntlist_parse(gt_str_get(cntlist_fn), true, &bits,
                               &nofreads, err);
    gt_str_delete(cntlist_fn);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_BRUTEFORCE ||
           arguments->test == GT_READJOINER_CNTTEST_KMP)
  {
    GtEncseqLoader *loader = gt_encseq_loader_new();
    GtEncseq *reads;

    gt_encseq_loader_drop_description_support(loader);
    gt_encseq_loader_disable_autosupport(loader);
    if (!arguments->singlestrand)
    {
      gt_encseq_loader_mirror(loader);
    }
    reads = gt_encseq_loader_load(loader, gt_str_get(arguments->readset),
                                  err);
    if (reads != NULL)
    {
      gt_rdj_pairwise_exact(GT_OVLFIND_CNT, reads, !arguments->singlestrand,
                            false,
                            arguments->test == GT_READJOINER_CNTTEST_KMP,
                            1UL, true, NULL, NULL, false, NULL, &bits,
                            &nofreads);
    }
    else
    {
      had_err = -1;
    }
    gt_encseq_delete(reads);
    gt_encseq_loader_delete(loader);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_ESA)
  {
    GtUword readlength = 0, firstrevcompl = 0;
    GtLogger *verbose_logger = gt_logger_new(arguments->verbose,
                                             GT_LOGGER_DEFLT_PREFIX, stderr);
    Sequentialsuffixarrayreader *ssar =
      gt_newSequentialsuffixarrayreaderfromfile(
                                  gt_str_get(arguments->readset),
                                  SARR_LCPTAB | SARR_SUFTAB | SARR_SSPTAB,
                                  true, verbose_logger, err);

    if (!gt_error_is_set(err))
    {
      nofreads = gt_encseq_num_of_sequences(ssar->encseq);
      if (!arguments->singlestrand)
      {
        /* second half of the sequences are the reverse complements */
        nofreads = GT_DIV2(nofreads);
        firstrevcompl = nofreads;
      }
      GT_INITBITTAB(bits, nofreads);
      if (!arguments->singlestrand &&
          gt_encseq_accesstype_get(ssar->encseq) ==
            GT_ACCESS_TYPE_EQUALLENGTH)
      {
        readlength = gt_encseq_seqlength(ssar->encseq, 0);
      }
      /* firstrevcompl is still 0 in the single strand case */
      (void) gt_contfind_bottomup(ssar, false, bits, firstrevcompl,
                                  readlength);
    }
    else
    {
      had_err = -1;
    }
    if (ssar != NULL)
    {
      gt_freeSequentialsuffixarrayreader(&ssar);
    }
    gt_logger_delete(verbose_logger);
  }
  else
  {
    gt_assert(false);
  }

  if (!had_err)
  {
    had_err = gt_cntlist_show(bits, nofreads, NULL, false, err);
  }
  gt_free(bits);
  return had_err;
}