/*
  Load the encoded sequence stored under <indexname>.

  The loader is configured so that the des, ssp and sds tables are not
  required, and it reports via <logger>.

  Returns whatever gt_encseq_loader_load() returns; <err> is handed to the
  loader for error reporting. The temporary loader object is always deleted
  before returning.
*/
static GtEncseq *mapbwtencoding(const char *indexname,
                                GtLogger *logger,
                                GtError *err)
{
  GtEncseqLoader *loader;
  GtEncseq *encseq;

  gt_error_check(err);

  loader = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(loader);
  gt_encseq_loader_do_not_require_ssp_tab(loader);
  gt_encseq_loader_do_not_require_sds_tab(loader);
  gt_encseq_loader_set_logger(loader, logger);

  encseq = gt_encseq_loader_load(loader, indexname, err);
  gt_encseq_loader_delete(loader);

  return encseq;
}
/*
  Fill <suffixarray> with the encoded sequence of index <indexname> and with
  those tables that are selected by the bit mask <demand> (SARR_DESTAB,
  SARR_SDSTAB, SARR_SSPTAB, SARR_SUFTAB, SARR_LCPTAB, SARR_BWTTAB,
  SARR_BCKTAB).

  If <map> is true, suftab/lcptab/llvtab/bwttab are obtained through
  gt_fa_mmap_check_size_with_suffix(); otherwise they are opened as buffered
  streams via INITBufferedfile. The bucket table is obtained via
  gt_bcktab_map() in both cases.

  Returns 0 on success and -1 on failure. On failure gt_freesuffixarray()
  is called on <suffixarray> before returning.

  NOTE(review): INITBufferedfile is a macro defined outside this function;
  the `!haserr` test that follows one of its uses suggests that it sets the
  local <haserr> (and presumably uses <err>) on failure -- confirm against
  the macro definition before renaming these locals.
*/
static int inputsuffixarray(bool map,
                            Suffixarray *suffixarray,
                            unsigned int demand,
                            const char *indexname,
                            GtLogger *logger,
                            GtError *err)
{
  bool haserr = false;
  GtEncseqLoader *el;
  GtUword totallength = 0;

  gt_error_check(err);
  initsuffixarray(suffixarray);
  el = gt_encseq_loader_new();
  /* translate the demand bits into loader requirements */
  if (!(demand & SARR_DESTAB))
    gt_encseq_loader_do_not_require_des_tab(el);
  else
    gt_encseq_loader_require_des_tab(el);
  if (!(demand & SARR_SDSTAB))
    gt_encseq_loader_do_not_require_sds_tab(el);
  else
    gt_encseq_loader_require_sds_tab(el);
  if (!(demand & SARR_SSPTAB))
    gt_encseq_loader_do_not_require_ssp_tab(el);
  else
    gt_encseq_loader_require_ssp_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  suffixarray->encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (suffixarray->encseq == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    /* NOTE(review): presumably reads the project file and fills fields such
       as numberofallsortedsuffixes, longest, numoflargelcpvalues,
       prefixlength and mirroredencseq which are used below -- confirm */
    haserr = scanprjfileuintkeys(suffixarray,indexname,logger,err);
  }
  /* mirror the sequence only if it is flagged as mirrored but the loaded
     encseq is not yet mirrored */
  if (!haserr
      && suffixarray->mirroredencseq
      && !gt_encseq_is_mirrored(suffixarray->encseq))
  {
    if (gt_encseq_mirror(suffixarray->encseq, err) != 0)
      haserr = true;
  }
  if (!haserr)
  {
    totallength = gt_encseq_total_length(suffixarray->encseq);
  }
  /* suffix table */
  if (!haserr && (demand & SARR_SUFTAB))
  {
    if (map)
    {
      /* nothing is mapped when there are no sorted suffixes */
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->suftab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                              GT_SUFTABSUFFIX,
                                              suffixarray->numberofallsortedsuffixes,
                                              sizeof (*suffixarray->suftab),
                                              err);
        if (suffixarray->suftab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
#if defined (_LP64) || defined (_WIN64)
      /* the file size decides whether the entries are read as uint32_t or
         as GtUword values */
      off_t filesize = gt_file_size_with_suffix(indexname,GT_SUFTABSUFFIX);
      if (filesize == (off_t) sizeof (uint32_t) *
                      suffixarray->numberofallsortedsuffixes)
      {
        gt_logger_log(logger,"read suftab in units of 4 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_uint32_t,uint32_t,
                         GT_SUFTABSUFFIX);
      } else
      {
        gt_logger_log(logger,"read suftab in units of 8 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                         GT_SUFTABSUFFIX);
      }
#else
      /* NOTE(review): the message says 4 bytes while the stream element type
         is GtUword; presumably GtUword is 4 bytes wide in this
         configuration -- confirm */
      gt_logger_log(logger,"read suftab in units of 4 bytes");
      INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                       GT_SUFTABSUFFIX);
#endif
    }
    /* the suffix table is only accepted if the value "longest" is defined */
    if (!haserr && !suffixarray->longest.defined)
    {
      gt_error_set(err,"longest not defined");
      haserr = true;
    }
  }
  /* lcp table and, if present, the table of large lcp values */
  if (!haserr && (demand & SARR_LCPTAB))
  {
    if (map)
    {
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->lcptab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                              GT_LCPTABSUFFIX,
                                              suffixarray->numberofallsortedsuffixes,
                                              sizeof (*suffixarray->lcptab),
                                              err);
        if (suffixarray->lcptab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->lcptabstream,GtUchar,
                       GT_LCPTABSUFFIX);
      /* position the stream behind the first GtUchar of the file */
      if (!haserr &&
          fseek(suffixarray->lcptabstream.fp,
                (GtWord) sizeof (GtUchar),SEEK_SET))
      {
        gt_error_set(err,"fseek(esastream) failed: %s",strerror(errno));
        haserr = true;
      }
    }
    if (!haserr && !suffixarray->numoflargelcpvalues.defined)
    {
      gt_error_set(err,"numoflargelcpvalues not defined");
      haserr = true;
    }
    /* the llv table is only read if there is at least one large lcp value */
    if (!haserr && suffixarray->numoflargelcpvalues.valueunsignedlong > 0)
    {
      if (map)
      {
        suffixarray->llvtab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                              GT_LARGELCPTABSUFFIX,
                                              (GtUword)
                                              suffixarray->numoflargelcpvalues.
                                              valueunsignedlong,
                                              sizeof (*suffixarray->llvtab),
                                              err);
        if (suffixarray->llvtab == NULL)
        {
          haserr = true;
        }
      } else
      {
        INITBufferedfile(indexname,&suffixarray->llvtabstream,Largelcpvalue,
                         GT_LARGELCPTABSUFFIX);
      }
    }
  }
  /* bwt table; the mapped variant is checked against totallength+1 entries */
  if (!haserr && (demand & SARR_BWTTAB))
  {
    if (map)
    {
      suffixarray->bwttab
        = gt_fa_mmap_check_size_with_suffix(indexname,
                                            GT_BWTTABSUFFIX,
                                            totallength+1,
                                            sizeof (*suffixarray->bwttab),
                                            err);
      if (suffixarray->bwttab == NULL)
      {
        haserr = true;
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->bwttabstream,GtUchar,
                       GT_BWTTABSUFFIX);
    }
  }
  /* bucket table; obtained via gt_bcktab_map() regardless of <map> */
  if (!haserr && (demand & SARR_BCKTAB))
  {
    suffixarray->bcktab
      = gt_bcktab_map(indexname,
                      gt_encseq_alphabetnumofchars(suffixarray->encseq),
                      suffixarray->prefixlength,
                      totallength+1,
                      true,
                      err);
    if (suffixarray->bcktab == NULL)
    {
      haserr = true;
    }
  }
  /* on any failure release everything that was set up so far */
  if (haserr)
  {
    gt_freesuffixarray(suffixarray);
  }
  return haserr ? -1 : 0;
}
/*
  Compute local alignments of all query sequences from
  <idxlocalioptions->queryfiles> against the sequences of the index
  <idxlocalioptions->indexname>.

  - If <doonline> is set, only the encoded sequence is loaded and each query
    is processed by gt_multiapplysmithwaterman().
  - Otherwise a Genericindex is created and each query is processed by
    gt_indexbasedlocali().
  - If <docompare> is set, both methods are run, their matches are stored
    and gt_checkandresetstorematch() is called for each query.

  Returns 0 on success and -1 on failure; failures of the called functions
  are reported via <err>.

  All resources created here (logger, encseq or genericindex, the
  Smith-Waterman and limdfs resources, the sequence iterator and the match
  stores) are released on every path, including the case that the sequence
  iterator cannot be created.
*/
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);

  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;

    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el,
                                   gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }

  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* store the matches of both methods separately for later comparison */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      /* matches are shown directly; both methods share the same info */
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                        idxlocalioptions->mismatchscore,
                                        idxlocalioptions->gapextend,
                                        idxlocalioptions->threshold,
                                        idxlocalioptions->showalignment,
                                        processmatch,
                                        processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                              true,
                                              0,
                                              0,    /* maxpathlength */
                                              true, /* keepexpandedonstack */
                                              processmatch,
                                              processmatchinfooffline,
                                              NULL, /* processresult */
                                              NULL, /* processresult info */
                                              dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                                err);
    if (!seqit)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* showmatchinfo.queryunit serves as the running query number */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit,
                                      &query,
                                      &querylen,
                                      &desc,
                                      err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
               PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                              idxlocalioptions->matchscore,
                              idxlocalioptions->mismatchscore,
                              idxlocalioptions->gapstart,
                              idxlocalioptions->gapextend,
                              idxlocalioptions->threshold,
                              query,
                              querylen,
                              dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                     &storeonline,&storeoffline);
        }
      }
      gt_seq_iterator_delete(seqit);
    }
    /* Release the search resources independently of whether the sequence
       iterator could be created; previously they were leaked when
       gt_seq_iterator_sequence_buffer_new() failed. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }

  /* without a genericindex the encseq was loaded here and must be deleted
     here; otherwise only the genericindex is deleted */
  if (genericindex == NULL)
  {
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
/*
  Self test for the computation of maximal substring matches.

  The encoded sequence of index <indexname> is loaded. Then <samples> times
  two substrings (a database and a query sequence) of length at most
  <substringlength> are sampled from it. <substringlength> is reduced to
  half of the total length if it exceeds that value. For each pair the
  matches of length at least <minlength> are computed by
  gt_sarrquerysubstringmatch() and by sarrselfsubstringmatch(), sorted and
  compared. If the two result sets differ, both are printed together with
  the sequences in FASTA format and the program exits with
  GT_EXIT_PROGRAMMING_ERROR.

  Returns 0 on success and -1 if loading the index or one of the match
  computations fails (reported via <err>).

  The per-sample result arrays and the mark positions are released in every
  iteration, also when a match computation fails.
*/
int gt_testmaxpairs(const char *indexname,
                    GtUword samples,
                    unsigned int minlength,
                    GtUword substringlength,
                    GtLogger *logger,
                    GtError *err)
{
  GtEncseq *encseq;
  GtUword totallength = 0, dblen, querylen;
  GtUchar *dbseq = NULL, *query = NULL;
  bool haserr = false;
  GtUword s;
  GtArray *tabmaxquerymatches;
  Maxmatchselfinfo maxmatchselfinfo;
  GtEncseqLoader *el;

  gt_logger_log(logger,"draw "GT_WU" samples",samples);

  el = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(el);
  gt_encseq_loader_do_not_require_ssp_tab(el);
  gt_encseq_loader_do_not_require_sds_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (encseq == NULL)
  {
    haserr = true;
  } else
  {
    totallength = gt_encseq_total_length(encseq);
  }
  if (!haserr)
  {
    if (substringlength > totallength/2)
    {
      substringlength = totallength/2;
    }
    dbseq = gt_malloc(sizeof *dbseq * substringlength);
    query = gt_malloc(sizeof *query * substringlength);
  }
  for (s=0; s<samples && !haserr; s++)
  {
    dblen = samplesubstring(dbseq,encseq,substringlength);
    querylen = samplesubstring(query,encseq,substringlength);
    gt_logger_log(logger,"run query match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                         dblen,
                         querylen,
                         minlength);
    tabmaxquerymatches = gt_array_new(sizeof (Substringmatch));
    /* not yet allocated; allows unconditional cleanup at the loop end */
    maxmatchselfinfo.results = NULL;
    maxmatchselfinfo.querymarkpos = NULL;
    if (gt_sarrquerysubstringmatch(dbseq,
                                   dblen,
                                   query,
                                   (GtUword) querylen,
                                   minlength,
                                   gt_encseq_alphabet(encseq),
                                   storemaxmatchquery,
                                   tabmaxquerymatches,
                                   logger,
                                   err) != 0)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_logger_log(logger,"run self match for dblen="GT_WU""
                           ",querylen= "GT_WU", minlength=%u",
                           dblen,
                           querylen,
                           minlength);
      maxmatchselfinfo.results = gt_array_new(sizeof (Substringmatch));
      maxmatchselfinfo.dblen = dblen;
      maxmatchselfinfo.querylen = querylen;
      maxmatchselfinfo.querymarkpos
        = sequence2markpositions(&maxmatchselfinfo.numofquerysequences,
                                 query,querylen);
      if (sarrselfsubstringmatch(dbseq,
                                 dblen,
                                 query,
                                 (GtUword) querylen,
                                 minlength,
                                 gt_encseq_alphabet(encseq),
                                 storemaxmatchself,
                                 &maxmatchselfinfo,
                                 logger,
                                 err) != 0)
      {
        haserr = true;
      }
    }
    if (!haserr)
    {
      gt_array_sort(tabmaxquerymatches,orderSubstringmatch);
      gt_array_sort(maxmatchselfinfo.results,orderSubstringmatch);
      if (!gt_array_equal(tabmaxquerymatches,maxmatchselfinfo.results,
                          orderSubstringmatch))
      {
        const GtUword width = 60UL;

        /* both methods must deliver the same matches; show everything
           needed to reproduce the difference and stop */
        printf("failure for query of length "GT_WU"\n",(GtUword) querylen);
        printf("querymatches\n");
        (void) gt_array_iterate(tabmaxquerymatches,showSubstringmatch,NULL,
                                err);
        printf("dbmatches\n");
        (void) gt_array_iterate(maxmatchselfinfo.results,showSubstringmatch,
                                NULL,err);
        gt_symbolstring2fasta(stdout,"dbseq",
                              gt_encseq_alphabet(encseq),
                              dbseq,
                              (GtUword) dblen,
                              width);
        gt_symbolstring2fasta(stdout,"queryseq",
                              gt_encseq_alphabet(encseq),
                              query,
                              (GtUword) querylen,
                              width);
        exit(GT_EXIT_PROGRAMMING_ERROR);
      }
      printf("# numberofmatches="GT_WU"\n",gt_array_size(tabmaxquerymatches));
    }
    /* Release the per-sample data on success and on failure; previously the
       error paths left the loop without freeing them. */
    gt_free(maxmatchselfinfo.querymarkpos);
    gt_array_delete(tabmaxquerymatches);
    if (maxmatchselfinfo.results != NULL)
    {
      gt_array_delete(maxmatchselfinfo.results);
    }
  }
  gt_free(dbseq);
  gt_free(query);
  gt_encseq_delete(encseq);
  encseq = NULL;
  return haserr ? -1 : 0;
}
/*
  Extract a key from every line of the description file of index
  <indexname> (suffix GT_DESTABFILESUFFIX) by means of desc2key().

  All keys must be non-empty and of the same length, and the length must not
  exceed CHAR_MAX.

  - If <sortkeys> is false, the common key length (as one character)
    followed by all keys, each terminated by '\0', is written to the file
    with suffix GT_KEYSTABFILESUFFIX. In this mode each key must be
    lexicographically larger than its predecessor.
  - If <sortkeys> is true, the key length must additionally not exceed
    MAXFIXEDKEYSIZE. The encoded sequence is loaded, the keys are stored
    together with their line number (used as sequence number), sorted with
    compareFixedkeys, and the sequences are written to stdout in the
    sorted order by giextract_encodedseq2fasta().

  Returns 0 on success and -1 on failure (reported via <err>).
*/
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0;
       !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                     (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;

        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                       (int) keylen,(int) keylen,keyptr,keylen,
                       MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        /* one table entry per sequence of the index */
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                     (int) keylen,(int) keylen,keyptr,keylen,
                     constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                     previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* keytab has exactly numofentries slots and one slot is filled per
         line, so a description file with more lines than the index has
         sequences would write behind the end of keytab */
      if (linenum >= numofentries)
      {
        gt_error_set(err,"number of keys exceeds the number of sequences "
                         "(%lu) in index \"%s\"",
                     numofentries,indexname);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* remember the current key for the order check of the next line;
       strncpy does not terminate here, hence the explicit '\0' */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                  constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr != NULL);
    gt_assert(numofentries > 0);
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    gt_assert(keytabptr != NULL);
    /* output the sequences in the order of their sorted keys */
    for (keytabptr = keytab;
         !haserr && keytabptr < keytab + numofentries;
         keytabptr++)
    {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
/*
  Tool runner for ltrdigest.

  <argv[parsed_args]> is the GFF3 input file and <argv[parsed_args+1]> the
  name of the index holding the encoded sequence; <tool_arguments> points to
  the GtLTRdigestOptions filled by the option parser.

  The function loads the encoded sequence, determines the set of tests to
  run (PPT always, PBS if a tRNA library is given, and with HAVE_HMMER
  protein domains if HMM files are given), and then builds the stream chain
    GFF3 input -> ltrdigest [-> tabular file output] -> GFF3 output
  which is pulled until the input is exhausted. The tabular file output is
  only attached if an output prefix was given.

  Returns 0 on success and a non-zero value on failure (reported via <err>).
  All streams, the loader, the encoded sequence, the tRNA library and the
  logger are released before returning.
*/
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream   = NULL,
               *gff3_out_stream  = NULL,
               *ltrdigest_stream = NULL,
               *tab_out_stream   = NULL,
               *last_stream      = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg          = parsed_args;
  const char *indexname = argv[arg+1];
  GtLogger *logger;
  GtEncseqLoader *el;
  GtEncseq *encseq;

  gt_error_check(err);
  gt_assert(arguments);

  /* created only after the assertion so that <arguments> is checked before
     it is dereferenced for the first time */
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);

  /* Set sequence encoder options. Defaults are ok. */
  el = gt_encseq_loader_new();
  gt_encseq_loader_set_logger(el, logger);

  /* Open sequence file */
  encseq = gt_encseq_loader_load(el, indexname, err);
  if (!encseq)
    had_err = -1;

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->pbs_opts.trna_lib
      = gt_bioseq_new(gt_str_get(arguments->trna_lib), err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

#ifdef HAVE_HMMER
  /* Open HMMER files if given. */
  if (!had_err && gt_str_array_size(arguments->pdom_opts.hmm_files) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    /* translate the textual cutoff option into the corresponding constant */
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE"))
    {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_NONE;
    } else
    {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }
#endif

  if (!had_err)
  {
    /* set up stream flow
     * ------------------*/
    last_stream = gff3_in_stream = gt_gff3_in_stream_new_sorted(argv[arg]);

    last_stream = ltrdigest_stream
                = gt_ltrdigest_stream_new(last_stream,
                                          tests_to_run,
                                          encseq,
                                          &arguments->pbs_opts,
                                          &arguments->ppt_opts,
#ifdef HAVE_HMMER
                                          &arguments->pdom_opts,
#endif
                                          err);
    if (!ltrdigest_stream)
      had_err = -1;
  }

  if (!had_err)
  {
    /* attach tabular output stream, if requested */
    if (gt_str_length(arguments->prefix) > 0)
    {
      last_stream = tab_out_stream
                  = gt_ltr_fileout_stream_new(last_stream,
                                              tests_to_run,
                                              encseq,
                                              gt_str_get(arguments->prefix),
                                              &arguments->ppt_opts,
                                              &arguments->pbs_opts,
#ifdef HAVE_HMMER
                                              &arguments->pdom_opts,
#endif
                                              gt_str_get(arguments->trna_lib),
                                              argv[arg+1],
                                              argv[arg],
                                              arguments->seqnamelen,
                                              err);
#ifdef HAVE_HMMER
      /* Test the option values. The previous code tested the addresses of
         the option fields, which are never NULL, so both outputs were
         enabled unconditionally. */
      if (arguments->pdom_opts.write_alignments)
        gt_ltr_fileout_stream_enable_pdom_alignment_output(tab_out_stream);
      if (arguments->pdom_opts.write_aaseqs)
        gt_ltr_fileout_stream_enable_aa_sequence_output(tab_out_stream);
#endif
    }

    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  /* the streams may be NULL here if an earlier step failed */
  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ltrdigest_stream);
  gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);

  gt_encseq_loader_delete(el);
  gt_encseq_delete(encseq);
  encseq = NULL;
  gt_bioseq_delete(arguments->pbs_opts.trna_lib);
  gt_logger_delete(logger);

  return had_err;
}