GtQuerysubstringmatchiterator *gt_querysubstringmatchiterator_new( const GtEncseq *dbencseq, GtUword totallength, const ESASuffixptr *suftabpart, GtReadmode db_readmode, GtUword numberofsuffixes, const GtStrArray *query_files, const GtEncseq *query_encseq, GtReadmode query_readmode, unsigned int userdefinedleastlength, GtError *err) { GtQuerysubstringmatchiterator *qsmi = gt_malloc(sizeof *qsmi); qsmi->dbencseq = dbencseq; qsmi->suftabpart = suftabpart; qsmi->db_readmode = db_readmode; qsmi->numberofsuffixes = numberofsuffixes; qsmi->totallength = totallength; qsmi->userdefinedleastlength = (GtUword) userdefinedleastlength; qsmi->queryunitnum = 0; qsmi->desc = NULL; qsmi->query_for_seqit = NULL; qsmi->query_seqlen = 0; qsmi->queryrep.sequence = NULL; qsmi->queryrep.encseq = query_encseq; qsmi->queryrep.readmode = query_readmode; qsmi->queryrep.startpos = 0; qsmi->dbstart = 0; qsmi->matchlength = 0; qsmi->querysubstring.queryrep = &qsmi->queryrep; qsmi->mmsi = gt_mmsearchiterator_new_empty(); qsmi->mmsi_defined = false; if (query_files == NULL || gt_str_array_size(query_files) == 0) { gt_assert(query_encseq != NULL); qsmi->seqit = NULL; qsmi->query_encseq_numofsequences = (uint64_t) gt_encseq_num_of_sequences(query_encseq); } else { gt_assert(query_encseq == NULL); qsmi->seqit = gt_seq_iterator_sequence_buffer_new(query_files, err); if (qsmi->seqit == NULL) { gt_querysubstringmatchiterator_delete(qsmi); return NULL; } gt_seq_iterator_set_symbolmap(qsmi->seqit, gt_alphabet_symbolmap(gt_encseq_alphabet(dbencseq))); } return qsmi; }
/*
 * Resets the FASTA header iterator behind <cstr_iterator>: the current
 * sequence iterator is deleted and replaced by a new one over the iterator's
 * file name table, with sequence output switched off.
 *
 * Returns 0 on success and -1 if the new sequence iterator could not be
 * created (its member is then left as NULL).
 */
static int gt_fasta_header_iterator_reset(GtCstrIterator *cstr_iterator,
                                          GtError *err)
{
  GtFastaHeaderIterator *header_iter;
  GtSeqIterator *fresh;

  header_iter = gt_fasta_header_iterator_cast(cstr_iterator);
  gt_error_check(err);

  /* drop the old iterator before opening the replacement */
  gt_seq_iterator_delete(header_iter->seq_iter);
  fresh = gt_seq_iterator_sequence_buffer_new(header_iter->filenametab, err);
  header_iter->seq_iter = fresh;

  if (fresh != NULL)
  {
    gt_seq_iterator_set_sequence_output(fresh, false);
    return 0;
  }
  return -1;
}
/*
 * Iterates over all sequences of the query file named in <queryfilenames>
 * (exactly one file is expected) and adds the result of
 * gt_esa2shulengthquery() for each of them to *<totalgmatchlength>.
 * The sequences are read using the symbol map of the alphabet of the
 * encoded sequence in <suffixarray>.
 *
 * *<totalgmatchlength> is only incremented, never reset, by this function.
 *
 * Returns 0 on success and -1 if the sequence iterator could not be created
 * or reported an error while reading.
 */
int gt_esa2shulengthqueryfiles(unsigned long *totalgmatchlength,
                               const Suffixarray *suffixarray,
                               const GtStrArray *queryfilenames,
                               GtError *err)
{
  GtSeqIterator *seqit;
  GtAlphabet *alphabet;
  const GtUchar *query;
  unsigned long querylen;
  char *desc = NULL;
  int retval,
      status = 0;

  gt_error_check(err);
  alphabet = gt_encseq_alphabet(suffixarray->encseq);
  gt_assert(gt_str_array_size(queryfilenames) == 1UL);

  seqit = gt_seq_iterator_sequence_buffer_new(queryfilenames, err);
  if (seqit == NULL)
  {
    return -1;
  }
  gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alphabet));

  /* a positive return value means that another query was delivered */
  while ((retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc,
                                        err)) > 0)
  {
    *totalgmatchlength += gt_esa2shulengthquery(suffixarray, query, querylen);
  }
  if (retval < 0)
  {
    status = -1;
  }
  gt_seq_iterator_delete(seqit);
  return status;
}
/*
 * Runs the local alignment search configured by <idxlocalioptions> for every
 * sequence in the query files.
 *
 * Depending on the options, each query is processed
 *   - online (doonline): Smith-Waterman against the encoded sequence loaded
 *     from <indexname>,
 *   - offline (!doonline): index based search on a generic index loaded from
 *     <indexname>,
 *   - or both (docompare): matches of both methods are stored and compared
 *     per query via gt_checkandresetstorematch().
 * Without docompare, matches are reported through showmatch().
 *
 * Returns 0 on success and -1 if loading the index/encoded sequence, opening
 * the query files or reading a query failed; <err> is passed on to the
 * failing call.
 */
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);

  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;

    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el,
                                   gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }

  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* collect the matches of both methods separately for comparison */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                        idxlocalioptions->mismatchscore,
                                        idxlocalioptions->gapextend,
                                        idxlocalioptions->threshold,
                                        idxlocalioptions->showalignment,
                                        processmatch,
                                        processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                              true,
                                              0,
                                              0,    /* maxpathlength */
                                              true, /* keepexpandedonstack */
                                              processmatch,
                                              processmatchinfooffline,
                                              NULL, /* processresult */
                                              NULL, /* processresult info */
                                              dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                                err);
    if (seqit == NULL)
    {
      haserr = true;
    } else
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* queryunit doubles as the running number of the current query */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
               PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                              idxlocalioptions->matchscore,
                              idxlocalioptions->mismatchscore,
                              idxlocalioptions->gapstart,
                              idxlocalioptions->gapextend,
                              idxlocalioptions->threshold,
                              query,
                              querylen,
                              dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                     &storeonline,&storeoffline);
        }
      }
    }
    /* The search resources were allocated before the query files were
       opened, so they have to be released on every path, including the one
       where the sequence iterator could not be created. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (seqit != NULL)
    {
      gt_seq_iterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }

  /* Without a generic index the encoded sequence was loaded here directly
     and is deleted here; otherwise it was obtained from the generic index
     and only the index is deleted. */
  if (genericindex == NULL)
  {
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
/*
 * Runner of the sequniq tool: reads all sequences from the files given in
 * argv[parsed_args..argc-1], writes every sequence that gt_md5set_add_sequence()
 * reports as not yet seen to arguments->outfp in FASTA format, and counts the
 * others as duplicates. Finally a summary line is written to stderr.
 *
 * Two input paths exist:
 *   - !arguments->seqit: each file is loaded as a GtBioseq,
 *   - arguments->seqit:  all files are streamed through one GtSeqIterator
 *     (optionally with a progress bar if arguments->verbose is set).
 *
 * Returns 0 on success and -1 if a file could not be opened, a sequence
 * could not be read, or the MD5 set reported an error.
 */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;

        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);

          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int next_rval = gt_seq_iterator_next(seqit, &sequence, &len, &desc,
                                             err);

        /* A negative value signals a read error; it must be propagated
           instead of being treated like the regular end of input. */
        if (next_rval < 0)
          had_err = -1;
        if (next_rval != 1)
          break;
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* avoid 0/0 (NaN) in the summary when no sequence was read at all */
    double removed_percent = 0.0;

    if (num_of_sequences > 0) {
      removed_percent = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    }
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences, removed_percent);
  }
  gt_md5set_delete(md5set);
  return had_err;
}
static int gt_callenumquerymatches_withindex( GtQuerysubstringmatchfunc findquerymatches, const Suffixarray *suffixarray, const GtStrArray *queryfiles, bool forwardstrand, bool reversestrand, unsigned int userdefinedleastlength, GtProcessquerybeforematching processquerybeforematching, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtError *err) { GtSeqIterator *seqit; bool haserr = false; seqit = gt_seq_iterator_sequence_buffer_new(queryfiles, err); if (seqit == NULL) { haserr = true; } else { GtQuerymatch *querymatchspaceptr = gt_querymatch_new(); const GtUchar *query; unsigned long querylen; int retval; uint64_t queryunitnum; GtUchar *queryreverse = NULL; unsigned long queryreverse_length = 0; char *desc = NULL; int mode; gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(gt_encseq_alphabet( suffixarray->encseq))); for (queryunitnum = 0; /* Nothing */; queryunitnum++) { retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err); if (retval < 0) { haserr = true; break; } if (retval == 0) { break; } if (querylen >= (unsigned long) userdefinedleastlength) { GtQueryrep queryrep; queryrep.encseq = NULL; queryrep.readmode = GT_READMODE_FORWARD; queryrep.startpos = 0; queryrep.length = querylen; for (mode = 0; mode <= 1; mode++) { if (mode == 0 && forwardstrand) { queryrep.sequence = query; queryrep.reversecopy = false; if (processquerybeforematching != NULL) { processquerybeforematching(processquerymatchinfo,desc,query, querylen,true); } } else { if (mode == 1 && reversestrand) { if (querylen > queryreverse_length) { queryreverse = gt_realloc(queryreverse, sizeof (*queryreverse) * querylen); queryreverse_length = querylen; } gt_copy_reversecomplement(queryreverse,query,querylen); queryrep.sequence = queryreverse; queryrep.reversecopy = true; if (processquerybeforematching != NULL) { processquerybeforematching(processquerymatchinfo,desc, queryreverse,querylen,false); } } else { queryrep.sequence = NULL; queryrep.reversecopy = false; 
} } if (queryrep.sequence != NULL) { int ret = findquerymatches(false, suffixarray, queryunitnum, &queryrep, (unsigned long) userdefinedleastlength, processquerymatch, processquerymatchinfo, querymatchspaceptr, err); if (ret != 0) { haserr = true; break; } } } } } gt_seq_iterator_delete(seqit); gt_free(queryreverse); gt_querymatch_delete(querymatchspaceptr); } return haserr ? -1 : 0; }