GtQuerysubstringmatchiterator *gt_querysubstringmatchiterator_new( const GtEncseq *dbencseq, GtUword totallength, const ESASuffixptr *suftabpart, GtReadmode db_readmode, GtUword numberofsuffixes, const GtStrArray *query_files, const GtEncseq *query_encseq, GtReadmode query_readmode, unsigned int userdefinedleastlength, GtError *err) { GtQuerysubstringmatchiterator *qsmi = gt_malloc(sizeof *qsmi); qsmi->dbencseq = dbencseq; qsmi->suftabpart = suftabpart; qsmi->db_readmode = db_readmode; qsmi->numberofsuffixes = numberofsuffixes; qsmi->totallength = totallength; qsmi->userdefinedleastlength = (GtUword) userdefinedleastlength; qsmi->queryunitnum = 0; qsmi->desc = NULL; qsmi->query_for_seqit = NULL; qsmi->query_seqlen = 0; qsmi->queryrep.sequence = NULL; qsmi->queryrep.encseq = query_encseq; qsmi->queryrep.readmode = query_readmode; qsmi->queryrep.startpos = 0; qsmi->dbstart = 0; qsmi->matchlength = 0; qsmi->querysubstring.queryrep = &qsmi->queryrep; qsmi->mmsi = gt_mmsearchiterator_new_empty(); qsmi->mmsi_defined = false; if (query_files == NULL || gt_str_array_size(query_files) == 0) { gt_assert(query_encseq != NULL); qsmi->seqit = NULL; qsmi->query_encseq_numofsequences = (uint64_t) gt_encseq_num_of_sequences(query_encseq); } else { gt_assert(query_encseq == NULL); qsmi->seqit = gt_seq_iterator_sequence_buffer_new(query_files, err); if (qsmi->seqit == NULL) { gt_querysubstringmatchiterator_delete(qsmi); return NULL; } gt_seq_iterator_set_symbolmap(qsmi->seqit, gt_alphabet_symbolmap(gt_encseq_alphabet(dbencseq))); } return qsmi; }
/*
 * Read every sequence from the single file named in <queryfilenames> and add
 * the value delivered by gt_esa2shulengthquery() for it to
 * <*totalgmatchlength>. The accumulator is only added to, never reset, so
 * the caller decides its initial value.
 *
 * The sequences are read using the symbol map of the alphabet of
 * <suffixarray->encseq>.
 *
 * Returns 0 on success and -1 if the sequence iterator could not be created
 * or reported an error while reading.
 */
int gt_esa2shulengthqueryfiles(unsigned long *totalgmatchlength,
                               const Suffixarray *suffixarray,
                               const GtStrArray *queryfilenames,
                               GtError *err)
{
  GtSeqIterator *query_iter;
  GtAlphabet *dbalphabet;
  const GtUchar *queryseq;
  unsigned long queryseqlen;
  char *querydesc = NULL;
  int status;

  gt_error_check(err);
  dbalphabet = gt_encseq_alphabet(suffixarray->encseq);
  gt_assert(gt_str_array_size(queryfilenames) == 1UL);

  query_iter = gt_seq_iterator_sequence_buffer_new(queryfilenames, err);
  if (query_iter == NULL)
  {
    return -1;
  }
  gt_seq_iterator_set_symbolmap(query_iter,
                                gt_alphabet_symbolmap(dbalphabet));

  /* a positive status means a sequence was delivered, 0 ends the input,
     a negative status is an error */
  while ((status = gt_seq_iterator_next(query_iter,
                                        &queryseq,
                                        &queryseqlen,
                                        &querydesc,
                                        err)) > 0)
  {
    *totalgmatchlength += gt_esa2shulengthquery(suffixarray,
                                                queryseq,
                                                queryseqlen);
  }
  gt_seq_iterator_delete(query_iter);
  return status < 0 ? -1 : 0;
}
/*
 * Encode all reads of <hcr_enc->files> and write them to <fp>, starting at
 * file position <hcr_enc->seq_encoder->start_of_encoding>. While writing,
 * sampling positions are registered in <hcr_enc->seq_encoder->sampling>;
 * after the last read the position of the sampling table is stored in
 * <hcr_enc->seq_encoder->startofsamplingtab> and the table is written.
 *
 * <hcr_enc->seq_encoder->sampling> must have been set up by the caller.
 *
 * Returns 0 on success and a negative value on failure, in which case <err>
 * is set.
 */
static int hcr_write_seqs(FILE *fp, GtHcrEncoder *hcr_enc, GtError *err)
{
  int had_err = 0, seqit_err = 0;
  GtUword bits_to_write = 0,
          len,
          read_counter = 0,
          page_counter = 0,
          bits_left_in_page,
          cur_read = 0;
  GtWord filepos;
  GtSeqIterator *seqit;
  const GtUchar *seq,
                *qual;
  char *desc;
  GtBitOutStream *bitstream;

  gt_error_check(err);
  gt_assert(hcr_enc->seq_encoder->sampling);

  gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));

  gt_xfseek(fp, hcr_enc->seq_encoder->start_of_encoding, SEEK_SET);
  bitstream = gt_bitoutstream_new(fp);

  seqit = gt_seq_iterator_fastq_new(hcr_enc->files, err);
  if (!seqit) {
    gt_assert(gt_error_is_set(err));
    had_err = -1;
  }

  if (!had_err) {
    gt_seq_iterator_set_quality_buffer(seqit, &qual);
    gt_seq_iterator_set_symbolmap(seqit,
                        gt_alphabet_symbolmap(hcr_enc->seq_encoder->alpha));
    hcr_enc->seq_encoder->total_num_of_symbols = 0;

    while (!had_err &&
           (seqit_err = gt_seq_iterator_next(seqit,
                                             &seq,
                                             &len,
                                             &desc, err)) == 1) {

      /* count the bits */
      bits_to_write = hcr_write_seq(hcr_enc->seq_encoder, seq, qual, len,
                                    bitstream, true);

      /* check if a new sample has to be added */
      if (gt_sampling_is_next_element_sample(
                                          hcr_enc->seq_encoder->sampling,
                                          page_counter,
                                          read_counter,
                                          bits_to_write,
                                          bits_left_in_page)) {
        gt_bitoutstream_flush_advance(bitstream);

        filepos = gt_bitoutstream_pos(bitstream);
        if (filepos < 0) {
          had_err = -1;
          gt_error_set(err, "error by ftell: %s", strerror(errno));
        }
        else {
          gt_sampling_add_sample(hcr_enc->seq_encoder->sampling,
                                 (size_t) filepos,
                                 cur_read);

          /* a new sample starts a fresh page/read count */
          read_counter = 0;
          page_counter = 0;
          gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
        }
      }

      if (!had_err) {
        /* do the writing */
        bits_to_write = hcr_write_seq(hcr_enc->seq_encoder,
                                      seq, qual, len, bitstream, false);

        /* update counter for sampling */
        while (bits_left_in_page < bits_to_write) {
          page_counter++;
          bits_to_write -= bits_left_in_page;
          gt_safe_assign(bits_left_in_page, (hcr_enc->pagesize * 8));
        }
        bits_left_in_page -= bits_to_write;
        /* always set first page as written */
        if (page_counter == 0)
          page_counter++;
        read_counter++;
        hcr_enc->seq_encoder->total_num_of_symbols += len;
        cur_read++;
      }
    }

    /* propagate an error reported by the sequence iterator */
    if (!had_err && seqit_err) {
      had_err = seqit_err;
      gt_assert(gt_error_is_set(err));
    }

    /* The number of reads can only be expected to match after a complete,
       error-free pass; checking it after an aborted pass would turn a
       reportable error into an assertion failure. */
    if (!had_err) {
      gt_assert(hcr_enc->num_of_reads == cur_read);
    }
  }

  if (!had_err) {
    gt_bitoutstream_flush(bitstream);
    filepos = gt_bitoutstream_pos(bitstream);
    if (filepos < 0) {
      had_err = -1;
      gt_error_set(err, "error by ftell: %s", strerror(errno));
    }
    else {
      hcr_enc->seq_encoder->startofsamplingtab = filepos;
      gt_log_log("start of samplingtab: "GT_WU"",
                 hcr_enc->seq_encoder->startofsamplingtab);
      if (hcr_enc->seq_encoder->sampling != NULL)
        gt_sampling_write(hcr_enc->seq_encoder->sampling, fp);
    }
  }

  gt_bitoutstream_delete(bitstream);
  gt_seq_iterator_delete(seqit);
  return had_err;
}
GtHcrEncoder *gt_hcr_encoder_new(GtStrArray *files, GtAlphabet *alpha, bool descs, GtQualRange qrange, GtTimer *timer, GtError *err) { GtBaseQualDistr *bqd; GtHcrEncoder *hcr_enc; GtSeqIterator *seqit; GtStrArray *file; int had_err = 0, status; GtUword len1, len2, i, num_of_reads = 0; const GtUchar *seq, *qual; char *desc; gt_error_check(err); gt_assert(alpha && files); if (timer != NULL) gt_timer_show_progress(timer, "get <base,qual> distr", stdout); if (qrange.start != GT_UNDEF_UINT) if (qrange.start == qrange.end) { gt_error_set(err, "qrange.start must unequal qrange.end"); return NULL; } hcr_enc = gt_malloc(sizeof (GtHcrEncoder)); hcr_enc->files = files; hcr_enc->num_of_files = gt_str_array_size(files); hcr_enc->num_of_reads = 0; hcr_enc->page_sampling = false; hcr_enc->regular_sampling = false; hcr_enc->sampling_rate = 0; hcr_enc->pagesize = gt_pagesize(); if (descs) { hcr_enc->encdesc_encoder = gt_encdesc_encoder_new(); if (timer != NULL) gt_encdesc_encoder_set_timer(hcr_enc->encdesc_encoder, timer); } else hcr_enc->encdesc_encoder = NULL; hcr_enc->seq_encoder = gt_malloc(sizeof (GtHcrSeqEncoder)); hcr_enc->seq_encoder->alpha = alpha; hcr_enc->seq_encoder->sampling = NULL; hcr_enc->seq_encoder->fileinfos = gt_calloc((size_t) hcr_enc->num_of_files, sizeof (*(hcr_enc->seq_encoder->fileinfos))); hcr_enc->seq_encoder->qrange = qrange; bqd = hcr_base_qual_distr_new(alpha, qrange); /* check if reads in the same file are of same length and get <base, quality> pair distribution */ for (i = 0; i < hcr_enc->num_of_files; i++) { file = gt_str_array_new(); gt_str_array_add(file, gt_str_array_get_str(files, i)); seqit = gt_seq_iterator_fastq_new(file, err); if (!seqit) { gt_error_set(err, "cannot initialize GtSeqIteratorFastQ object"); had_err = -1; } if (!had_err) { gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(alpha)); gt_seq_iterator_set_quality_buffer(seqit, &qual); status = gt_seq_iterator_next(seqit, &seq, &len1, &desc, err); if (status == 1) { 
num_of_reads = 1UL; while (!had_err) { status = gt_seq_iterator_next(seqit, &seq, &len2, &desc, err); if (status == -1) had_err = -1; if (status != 1) break; if (len2 != len1) { gt_error_set(err, "reads have to be of equal length"); had_err = -1; break; } if (hcr_base_qual_distr_add(bqd, qual, seq, len1) != 0) had_err = -1; len1 = len2; num_of_reads++; } } else if (status == -1) had_err = -1; if (!had_err) { if (i == 0) hcr_enc->seq_encoder->fileinfos[i].readnum = num_of_reads; else hcr_enc->seq_encoder->fileinfos[i].readnum = hcr_enc->seq_encoder->fileinfos[i - 1].readnum + num_of_reads; hcr_enc->seq_encoder->fileinfos[i].readlength = len1; } } hcr_enc->num_of_reads += num_of_reads; gt_str_array_delete(file); gt_seq_iterator_delete(seqit); } if (!had_err) hcr_base_qual_distr_trim(bqd); if (!had_err) { if (timer != NULL) gt_timer_show_progress(timer, "build huffman tree for sequences and" " qualities", stdout); hcr_enc->seq_encoder->huffman = gt_huffman_new(bqd, hcr_base_qual_distr_func, (GtUword) bqd->ncols * bqd->nrows); } if (!had_err) { hcr_enc->seq_encoder->qual_offset = bqd->qual_offset; hcr_base_qual_distr_delete(bqd); return hcr_enc; } return NULL; }
/*
 * Run the local alignment search described by <idxlocalioptions>.
 *
 * Depending on the options the queries from <idxlocalioptions->queryfiles>
 * are processed
 *   - online (doonline): Smith-Waterman on the loaded encoded sequence,
 *   - index based (!doonline): via the generic index, or
 *   - both (docompare): the matches of both methods are stored and compared
 *     for each query.
 *
 * Returns 0 on success and -1 if loading the sequence/index, creating the
 * query iterator or reading a query failed. All resources acquired here are
 * released on every path.
 */
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);

  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;

    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el,
                                   gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }

  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    if (idxlocalioptions->docompare)
    {
      /* matches of both methods are stored separately to compare them */
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                        idxlocalioptions->mismatchscore,
                                        idxlocalioptions->gapextend,
                                        idxlocalioptions->threshold,
                                        idxlocalioptions->showalignment,
                                        processmatch,
                                        processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                              true,
                                              0,
                                              0,    /* maxpathlength */
                                              true, /* keepexpandedonstack */
                                              processmatch,
                                              processmatchinfooffline,
                                              NULL, /* processresult */
                                              NULL, /* processresult info */
                                              dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                                err);
    if (!seqit)
    {
      haserr = true;
    }
    if (!haserr)
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* showmatchinfo.queryunit serves as the running query number */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit,
                                      &query,
                                      &querylen,
                                      &desc,
                                      err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
               PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                              idxlocalioptions->matchscore,
                              idxlocalioptions->mismatchscore,
                              idxlocalioptions->gapstart,
                              idxlocalioptions->gapextend,
                              idxlocalioptions->threshold,
                              query,
                              querylen,
                              dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                     &storeonline,&storeoffline);
        }
      }
      gt_seq_iterator_delete(seqit);
    }
    /* The search resources are created before the query iterator, so they
       have to be released even if creating the iterator failed. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }

  if (genericindex == NULL)
  {
    /* no generic index was created, so the encoded sequence (if any) was
       loaded by this function and is deleted here */
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
static int gt_callenumquerymatches_withindex( GtQuerysubstringmatchfunc findquerymatches, const Suffixarray *suffixarray, const GtStrArray *queryfiles, bool forwardstrand, bool reversestrand, unsigned int userdefinedleastlength, GtProcessquerybeforematching processquerybeforematching, GtProcessquerymatch processquerymatch, void *processquerymatchinfo, GtError *err) { GtSeqIterator *seqit; bool haserr = false; seqit = gt_seq_iterator_sequence_buffer_new(queryfiles, err); if (seqit == NULL) { haserr = true; } else { GtQuerymatch *querymatchspaceptr = gt_querymatch_new(); const GtUchar *query; unsigned long querylen; int retval; uint64_t queryunitnum; GtUchar *queryreverse = NULL; unsigned long queryreverse_length = 0; char *desc = NULL; int mode; gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(gt_encseq_alphabet( suffixarray->encseq))); for (queryunitnum = 0; /* Nothing */; queryunitnum++) { retval = gt_seq_iterator_next(seqit, &query, &querylen, &desc, err); if (retval < 0) { haserr = true; break; } if (retval == 0) { break; } if (querylen >= (unsigned long) userdefinedleastlength) { GtQueryrep queryrep; queryrep.encseq = NULL; queryrep.readmode = GT_READMODE_FORWARD; queryrep.startpos = 0; queryrep.length = querylen; for (mode = 0; mode <= 1; mode++) { if (mode == 0 && forwardstrand) { queryrep.sequence = query; queryrep.reversecopy = false; if (processquerybeforematching != NULL) { processquerybeforematching(processquerymatchinfo,desc,query, querylen,true); } } else { if (mode == 1 && reversestrand) { if (querylen > queryreverse_length) { queryreverse = gt_realloc(queryreverse, sizeof (*queryreverse) * querylen); queryreverse_length = querylen; } gt_copy_reversecomplement(queryreverse,query,querylen); queryrep.sequence = queryreverse; queryrep.reversecopy = true; if (processquerybeforematching != NULL) { processquerybeforematching(processquerymatchinfo,desc, queryreverse,querylen,false); } } else { queryrep.sequence = NULL; queryrep.reversecopy = false; 
} } if (queryrep.sequence != NULL) { int ret = findquerymatches(false, suffixarray, queryunitnum, &queryrep, (unsigned long) userdefinedleastlength, processquerymatch, processquerymatchinfo, querymatchspaceptr, err); if (ret != 0) { haserr = true; break; } } } } } gt_seq_iterator_delete(seqit); gt_free(queryreverse); gt_querymatch_delete(querymatchspaceptr); } return haserr ? -1 : 0; }