コード例 #1
0
ファイル: gt_repfind.c プロジェクト: lparsons/genometools
/* Report a suffix-prefix match of length <matchlen> whose two instances
   start at the absolute positions <pos1> and <pos2> of the encoded sequence
   referenced by <genericencseq>. A line "<seqnum> <seqnum> <matchlen>" is
   printed to stdout only if one instance starts at the first position of its
   sequence while the other instance ends at the last position of its
   sequence; the sequence containing the suffix is printed first.
   <info> and <err> are not used. Always returns 0. */
static int gt_simplesuffixprefixmatchoutput(GT_UNUSED void *info,
                                            const GtGenericEncseq
                                              *genericencseq,
                                            GtUword matchlen,
                                            GtUword pos1,
                                            GtUword pos2,
                                            GT_UNUSED GtError *err)
{
  const GtEncseq *encseq;
  /* order the two positions so that lowpos <= highpos */
  const GtUword lowpos = pos1 > pos2 ? pos2 : pos1,
                highpos = pos1 > pos2 ? pos1 : pos2;
  GtUword lowseqnum, lowseqstart, lowrelpos,
          highseqnum, highseqstart, highrelpos;

  gt_assert(genericencseq != NULL && genericencseq->hasencseq);
  encseq = genericencseq->seqptr.encseq;

  /* translate both absolute positions into (sequence number, offset) */
  lowseqnum = gt_encseq_seqnum(encseq,lowpos);
  lowseqstart = gt_encseq_seqstartpos(encseq, lowseqnum);
  gt_assert(lowseqstart <= lowpos);
  lowrelpos = lowpos - lowseqstart;

  highseqnum = gt_encseq_seqnum(encseq,highpos);
  highseqstart = gt_encseq_seqstartpos(encseq, highseqnum);
  gt_assert(highseqstart <= highpos);
  highrelpos = highpos - highseqstart;

  if (lowrelpos == 0)
  {
    /* lower instance is a prefix; the higher one has to be a suffix */
    if (highrelpos + matchlen == gt_encseq_seqlength(encseq,highseqnum))
    {
      printf(""GT_WU" "GT_WU" "GT_WU"\n",highseqnum,lowseqnum,matchlen);
    }
    return 0;
  }
  /* higher instance is a prefix; the lower one has to be a suffix */
  if (highrelpos == 0 &&
      lowrelpos + matchlen == gt_encseq_seqlength(encseq,lowseqnum))
  {
    printf(""GT_WU" "GT_WU" "GT_WU"\n",lowseqnum,highseqnum,matchlen);
  }
  return 0;
}
コード例 #2
0
ファイル: idxlocali.c プロジェクト: oeigenbrod/genometools
static void showmatch(void *processinfo,const GtIdxMatch *match)
{
  Showmatchinfo *showmatchinfo = (Showmatchinfo *) processinfo;
  unsigned long seqnum;
  unsigned long relpos;

  if (match->dbabsolute)
  {
    unsigned long seqstartpos;
    seqnum = gt_encseq_seqnum(showmatchinfo->encseq, match->dbstartpos);
    seqstartpos = gt_encseq_seqstartpos(showmatchinfo->encseq, seqnum);
    gt_assert(seqstartpos <= match->dbstartpos);
    relpos = match->dbstartpos - seqstartpos;
  } else
  {
    relpos = match->dbstartpos;
    seqnum = match->dbseqnum;
  }
  printf("%lu\t%lu\t",seqnum,relpos);
  printf("%lu\t",match->dblen);
  printf("\t" Formatuint64_t "\t%lu\t%lu\t%lu\n",
              PRINTuint64_tcast(showmatchinfo->queryunit),
              match->querystartpos,
              match->querylen,
              match->distance);
  if (showmatchinfo->showalignment)
  {
    gt_alignment_show_with_mapped_chars(
                (const GtAlignment *) match->alignment,
                showmatchinfo->characters,
                showmatchinfo->wildcardshow,
                stdout);
  }
}
コード例 #3
0
/* Copy the encoded characters of unique element <id> of <cs> that correspond
   to the original positions <frompos>..<topos> into <buffer>. <frompos> has
   to lie inside the element; if the element ends before <topos>, extraction
   stops at the last character of the element. Returns the length of the
   extracted range. */
static GtUword condenseq_unique_extract_encoded(const GtCondenseq *cs,
                                                GtUword id,
                                                GtUchar *buffer,
                                                GtUword frompos,
                                                GtUword topos)
{
  const GtCondenseqUnique unique = cs->uniques[id];
  GtUword offset, available, wanted, first, last;

  gt_assert(unique.orig_startpos <= frompos);
  offset = frompos - unique.orig_startpos;
  gt_assert(offset < unique.len);

  /* first position inside the encseq holding the unique sequences */
  first = gt_encseq_seqstartpos(cs->unique_es, id) + offset;
  available = unique.len - offset;
  wanted = topos - frompos + 1;
  /* never read beyond the end of this unique element */
  last = first + (available < wanted ? available : wanted) - 1;

  gt_encseq_extract_encoded(cs->unique_es, buffer, first, last);
  return last - first + 1;
}
コード例 #4
0
ファイル: gt_seqorder.c プロジェクト: oeigenbrod/genometools
/* Write sequence <seqnum> of <encseq> to stdout as a FASTA entry: the FASTA
   separator, the description (only if <encseq> has description support), a
   newline, the decoded characters of the sequence and a final newline. */
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq)
{
  const unsigned long seqstart = gt_encseq_seqstartpos(encseq, seqnum),
                      seqlen = gt_encseq_seqlength(encseq, seqnum);
  unsigned long remaining;
  GtEncseqReader *reader;

  /* header line */
  gt_xfputc(GT_FASTA_SEPARATOR, stdout);
  if (gt_encseq_has_description_support(encseq))
  {
    unsigned long desclen = 0;
    const char *desc = gt_encseq_description(encseq, &desclen, seqnum);

    gt_xfwrite(desc, (size_t)1, (size_t)desclen, stdout);
  }
  gt_xfputc('\n', stdout);

  /* sequence line, decoded character by character */
  reader = gt_encseq_create_reader_with_readmode(encseq,
                                                 GT_READMODE_FORWARD,
                                                 seqstart);
  for (remaining = seqlen; remaining > 0; remaining--)
  {
    gt_xfputc(gt_encseq_reader_next_decoded_char(reader), stdout);
  }
  gt_encseq_reader_delete(reader);
  gt_xfputc('\n', stdout);
}
コード例 #5
0
ファイル: gt_maxpairs.c プロジェクト: simongog/genometools
/* Process an exact self match of length <len> whose instances start at the
   absolute positions <pos1> and <pos2> of <encseq>. The smaller position is
   passed on as absolute position, the larger one as a position relative to
   the start of the sequence containing it. The GtQuerymatch passed via
   <info> is filled accordingly and handed to gt_querymatch_output(), whose
   return value is returned. */
static int gt_simpleexactselfmatchoutput(void *info,
                                         const GtEncseq *encseq,
                                         unsigned long len,
                                         unsigned long pos1,
                                         unsigned long pos2,
                                         GT_UNUSED GtError *err)
{
  GtQuerymatch *querymatch = (GtQuerymatch *) info;
  /* the smaller position plays the database role, the larger one the query
     role */
  const unsigned long dbpos = pos1 > pos2 ? pos2 : pos1,
                      querypos = pos1 > pos2 ? pos1 : pos2;
  unsigned long queryseqnum, queryseqstart, queryseqlength;

  queryseqnum = gt_encseq_seqnum(encseq,querypos);
  queryseqstart = gt_encseq_seqstartpos(encseq, queryseqnum);
  queryseqlength = gt_encseq_seqlength(encseq, queryseqnum);
  gt_assert(querypos >= queryseqstart);
  gt_querymatch_fill(querymatch,
                     len,
                     dbpos,
                     GT_READMODE_FORWARD,
                     false,
                     0,
                     0,
                     true,
                     (uint64_t) queryseqnum,
                     len,
                     querypos - queryseqstart);
  return gt_querymatch_output(info, encseq, querymatch, NULL, queryseqlength,
                              err);
}
コード例 #6
0
/* Reconstruct the encoded characters of link element <id> of <cs> that
   correspond to the original positions <frompos>..<topos> and store them in
   <buffer>. The characters are obtained by applying the editscript of the
   link to the unique sequence it refers to. <frompos> has to lie inside the
   link; if the link ends before <topos>, reconstruction stops at the last
   character of the link. Returns the number of characters reported as
   written by gt_editscript_get_sub_sequence_v(). */
static GtUword condenseq_link_extract_encoded(const GtCondenseq *cs,
                                              GtUword id,
                                              GtUchar *buffer,
                                              GtUword frompos,
                                              GtUword topos)
{
  const GtCondenseqLink link = cs->links[id];
  GtUword refstart, firstoffset, lastoffset, available, wanted, written;

  gt_assert(link.orig_startpos <= frompos);
  refstart = gt_encseq_seqstartpos(cs->unique_es, link.unique_id);
  firstoffset = frompos - link.orig_startpos;
  gt_assert(firstoffset < link.len);

  available = link.len - firstoffset;
  wanted = topos - frompos + 1;
  /* clip the requested range at the end of the link */
  lastoffset = (available < wanted) ? link.len - 1
                                    : firstoffset + wanted - 1;

  written = gt_editscript_get_sub_sequence_v(link.editscript, cs->unique_es,
                                             refstart + link.unique_offset,
                                             GT_READMODE_FORWARD,
                                             firstoffset, lastoffset,
                                             buffer);
  gt_assert(written == lastoffset - firstoffset + 1);
  return written;
}
コード例 #7
0
/* Build the table of internal separator positions of <orig_es> for
   <condenseq>: for every sequence except the first one, the position
   directly preceding its start is added to a new GtIntset. Returns NULL if
   <condenseq> holds at most one original sequence. */
static GtIntset *condenseq_fill_tab(GtCondenseq *condenseq,
                                    const GtEncseq *orig_es)
{
  const GtUword numseqs = condenseq->orig_num_seq;
  GtIntset *ssptab;
  GtUword laststart, seqidx;

  if (numseqs <= (GtUword) 1)
    return NULL;

  laststart = gt_encseq_seqstartpos(orig_es, numseqs - 1);
  /* only the internal separators are stored, the end is explicit */
  ssptab = gt_intset_best_new(laststart - 1, numseqs - 1);
  for (seqidx = (GtUword) 1; seqidx < numseqs; ++seqidx) {
    GtUword seppos = gt_encseq_seqstartpos(orig_es, seqidx) - 1;
    gt_assert(seppos != 0);
    gt_intset_add(ssptab, seppos);
  }
  return ssptab;
}
コード例 #8
0
ファイル: esa-mmsearch.c プロジェクト: simongog/genometools
/* Map the suffix array <indexname> and use every sequence of its encoded
   sequence with a length of at least <userdefinedleastlength> as a query,
   read in <queryreadmode> (which must not be GT_READMODE_FORWARD), in a call
   to gt_querysubstringmatch() against that same index. Matches are passed to
   <processquerymatch> together with <processquerymatchinfo>. Processing
   stops at the first failing query. Returns 0 on success and -1 if mapping
   the index or matching a query failed. */
int gt_callenumselfmatches(const char *indexname,
                           GtReadmode queryreadmode,
                           unsigned int userdefinedleastlength,
                           GtProcessquerymatch processquerymatch,
                           void *processquerymatchinfo,
                           GtLogger *logger,
                           GtError *err)
{
  Suffixarray suffixarray;
  bool haserr = false;

  gt_assert(queryreadmode != GT_READMODE_FORWARD);
  if (gt_mapsuffixarray(&suffixarray,
                        SARR_ESQTAB | SARR_SUFTAB | SARR_SSPTAB,
                        indexname,
                        logger,
                        err) == 0)
  {
    GtQuerymatch *querymatchspace = gt_querymatch_new();
    const unsigned long numofsequences
      = gt_encseq_num_of_sequences(suffixarray.encseq),
                        minlength = (unsigned long) userdefinedleastlength;
    unsigned long seqnum;
    GtQueryrep queryrep;

    /* the query is taken from the indexed sequence itself */
    queryrep.sequence = NULL;
    queryrep.reversecopy = false;
    queryrep.encseq = suffixarray.encseq;
    queryrep.readmode = queryreadmode;
    for (seqnum = 0; !haserr && seqnum < numofsequences; seqnum++)
    {
      const unsigned long seqstartpos
        = gt_encseq_seqstartpos(suffixarray.encseq, seqnum),
                          seqlength
        = gt_encseq_seqlength(suffixarray.encseq, seqnum);

      if (seqlength < minlength)
      {
        continue; /* too short to contain a match of the required length */
      }
      queryrep.startpos = seqstartpos;
      queryrep.length = seqlength;
      haserr = gt_querysubstringmatch(true,
                                      &suffixarray,
                                      (uint64_t) seqnum,
                                      &queryrep,
                                      minlength,
                                      processquerymatch,
                                      processquerymatchinfo,
                                      querymatchspace,
                                      err) != 0;
    }
    gt_querymatch_delete(querymatchspace);
  } else
  {
    haserr = true;
  }
  gt_freesuffixarray(&suffixarray);
  return haserr ? -1 : 0;
}
コード例 #9
0
ファイル: gt_maxpairs.c プロジェクト: simongog/genometools
/* Print a suffix-prefix match between two sequences of <encseq> to stdout.
   <pos1> and <pos2> are the absolute start positions of the two instances of
   a match of length <matchlen>. Output ("<seqnum> <seqnum> <matchlen>", the
   sequence containing the suffix first) is produced only if one instance
   begins at the start of its sequence and the other instance ends at the end
   of its sequence. <info> and <err> are not used. Always returns 0. */
static int gt_simplesuffixprefixmatchoutput(GT_UNUSED void *info,
                                            const GtEncseq *encseq,
                                            unsigned long matchlen,
                                            unsigned long pos1,
                                            unsigned long pos2,
                                            GT_UNUSED GtError *err)
{
  /* firstpos is the smaller, secondpos the larger of the two positions */
  const unsigned long firstpos = pos1 > pos2 ? pos2 : pos1,
                      secondpos = pos1 > pos2 ? pos1 : pos2;
  unsigned long firstseq, firstoffset, secondseq, secondoffset, seqstart;

  firstseq = gt_encseq_seqnum(encseq,firstpos);
  seqstart = gt_encseq_seqstartpos(encseq, firstseq);
  gt_assert(seqstart <= firstpos);
  firstoffset = firstpos - seqstart;

  secondseq = gt_encseq_seqnum(encseq,secondpos);
  seqstart = gt_encseq_seqstartpos(encseq, secondseq);
  gt_assert(seqstart <= secondpos);
  secondoffset = secondpos - seqstart;

  if (firstoffset == 0)
  {
    /* prefix of the first sequence, check for suffix of the second one */
    if (secondoffset + matchlen == gt_encseq_seqlength(encseq,secondseq))
    {
      printf("%lu %lu %lu\n",secondseq,firstseq,matchlen);
    }
    return 0;
  }
  /* prefix of the second sequence, check for suffix of the first one */
  if (secondoffset == 0 &&
      firstoffset + matchlen == gt_encseq_seqlength(encseq,firstseq))
  {
    printf("%lu %lu %lu\n",firstseq,secondseq,matchlen);
  }
  return 0;
}
コード例 #10
0
ファイル: bioseq.c プロジェクト: ggonnella/genometools
/* Return the encoded character at offset <position> of sequence number
   <index> in <bs>, read in forward direction. <index> must be smaller than
   the number of sequences. */
GtUchar gt_bioseq_get_encoded_char(const GtBioseq *bs, GtUword index,
                                   GtUword position)
{
  GtUword abspos;
  gt_assert(bs);
  gt_assert(index < gt_encseq_num_of_sequences(bs->encseq));
  /* translate the sequence-relative offset into an absolute position */
  abspos = gt_encseq_seqstartpos(bs->encseq, index) + position;
  return gt_encseq_get_encoded_char(bs->encseq, abspos, GT_READMODE_FORWARD);
}
コード例 #11
0
ファイル: bioseq.c プロジェクト: ggonnella/genometools
/* Store the encoded characters at the offsets <start>..<end> (both
   inclusive) of sequence number <idx> in <bs> in <out>. <idx> must be a
   valid sequence number and <end> must not be smaller than <start>. */
void gt_bioseq_get_encoded_sequence_range(const GtBioseq *bs, GtUchar *out,
                                          GtUword idx,
                                          GtUword start,
                                          GtUword end)
{
  GtUword seqoffset;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);
  /* shift the sequence-relative range to absolute positions */
  seqoffset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_encoded(bs->encseq, out,
                            seqoffset + start,
                            seqoffset + end);
}
コード例 #12
0
ファイル: bioseq.c プロジェクト: ggonnella/genometools
/* Store all encoded characters of sequence number <idx> in <bs> in <out>.
   <idx> must be smaller than the number of sequences. */
void gt_bioseq_get_encoded_sequence(const GtBioseq *bs, GtUchar *out,
                                    GtUword idx)
{
  GtUword first, last;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  /* last position of the sequence (inclusive) */
  last = first + gt_encseq_seqlength(bs->encseq, idx) - 1;
  gt_encseq_extract_encoded(bs->encseq, out, first, last);
}
コード例 #13
0
ファイル: encseq_lua.c プロジェクト: kowsky/genometools
/* Lua binding: takes an encseq (argument 1) and a sequence number
   (argument 2) and pushes the start position of that sequence. An argument
   error is raised if the sequence number is not smaller than the number of
   sequences. Returns the number of pushed results (1). */
static int encseq_lua_seqstartpos(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtUword seqnum = luaL_checknumber(L, 2);

  luaL_argcheck(L, seqnum < gt_encseq_num_of_sequences(*encseq_ud), 2,
                "cannot exceed number of sequences");
  lua_pushnumber(L, gt_encseq_seqstartpos(*encseq_ud, seqnum));
  return 1;
}
コード例 #14
0
ファイル: bioseq.c プロジェクト: ggonnella/genometools
/* Return a newly allocated buffer holding the decoded characters at the
   offsets <start>..<end> (both inclusive) of sequence number <idx> in <bs>.
   The buffer has exactly end - start + 1 bytes, i.e. no extra byte for a
   terminator is allocated. <idx> must be a valid sequence number and <end>
   must not be smaller than <start>. */
char* gt_bioseq_get_sequence_range(const GtBioseq *bs, GtUword idx,
                                   GtUword start, GtUword end)
{
  GtUword rangelen, seqoffset;
  char *decoded;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq) && end >= start);
  rangelen = end - start + 1;
  decoded = gt_malloc(rangelen * sizeof (char));
  /* shift the sequence-relative range to absolute positions */
  seqoffset = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, decoded,
                            seqoffset + start,
                            seqoffset + end);
  return decoded;
}
コード例 #15
0
ファイル: bioseq.c プロジェクト: ggonnella/genometools
/* Return a newly allocated buffer holding all decoded characters of
   sequence number <idx> in <bs>. The buffer has exactly as many bytes as the
   sequence has characters, i.e. no extra byte for a terminator is allocated.
   <idx> must be smaller than the number of sequences.
   NOTE(review): for a sequence of length 0 the end position passed to
   gt_encseq_extract_decoded() is startpos - 1 -- confirm that empty
   sequences cannot occur here. */
char* gt_bioseq_get_sequence(const GtBioseq *bs, GtUword idx)
{
  GtUword seqlen, first;
  char *decoded;
  gt_assert(bs);
  gt_assert(idx < gt_encseq_num_of_sequences(bs->encseq));
  seqlen = gt_encseq_seqlength(bs->encseq, idx);
  decoded = gt_calloc(seqlen, sizeof (char));
  first = gt_encseq_seqstartpos(bs->encseq, idx);
  gt_encseq_extract_decoded(bs->encseq, decoded, first, first + seqlen - 1);
  return decoded;
}
コード例 #16
0
ファイル: bioseq.c プロジェクト: AnnSeidel/genometools
/* Return true if at least one position of sequence number <idx> in <bioseq>
   is a wildcard (checked in forward direction), false otherwise. Scanning
   stops at the first wildcard found. */
bool gt_bioseq_seq_has_wildcards(const GtBioseq* bioseq,
                                 GtUword idx) {
  const GtUword seqlen = gt_encseq_seqlength(bioseq->encseq, idx),
                first = gt_encseq_seqstartpos(bioseq->encseq, idx);
  GtUword offset;
  for (offset = 0; offset < seqlen; ++offset) {
    if (gt_encseq_position_is_wildcard(bioseq->encseq,
                                       first + offset,
                                       GT_READMODE_FORWARD)) {
      return true;
    }
  }
  return false;
}
コード例 #17
0
ファイル: gt_seqorder.c プロジェクト: oeigenbrod/genometools
/* Store the start position of every sequence of <encseq> in
   <suffixsortspace> (entry i holds the start of sequence i) and sort these
   entries with gt_sortallsuffixesfromstart() using the default suffix
   sorting strategy and forward readmode. */
static void gt_seqorder_sort(GtSuffixsortspace *suffixsortspace,
    GtEncseq *encseq)
{
  const unsigned long numofsequences = gt_encseq_num_of_sequences(encseq);
  unsigned long seqnum;
  Sfxstrategy sfxstrategy;

  defaultsfxstrategy(&sfxstrategy, false);
  for (seqnum = 0; seqnum < numofsequences; seqnum++)
  {
    gt_suffixsortspace_setdirect(suffixsortspace, seqnum,
        gt_encseq_seqstartpos(encseq, seqnum));
  }
  gt_sortallsuffixesfromstart(suffixsortspace, numofsequences, encseq,
      GT_READMODE_FORWARD, NULL, 0, &sfxstrategy, NULL, NULL, NULL);
}
コード例 #18
0
ファイル: encseq_col.c プロジェクト: kowsky/genometools
/* Determine the file number and the file-relative sequence number of the
   first sequence in <esc> whose description is matched by the pattern
   <seqid> (via gt_grep()). The results are stored in <filenum> and <seqnum>
   and remembered in the grep cache of <esc>, which is created on first use
   and consulted before any description is inspected. Returns 0 on success;
   returns a non-zero value if gt_grep() fails, and -1 with <err> set if no
   description matches. */
static int gt_encseq_col_do_grep_desc(GtEncseqCol *esc, GtUword *filenum,
                                      GtUword *seqnum, GtStr *seqid,
                                      GtError *err)
{
  const GtSeqInfo *cached;
  GtSeqInfo found;
  GtUword idx = 0;
  bool match = false;
  int had_err = 0;
  gt_error_check(err);

  gt_assert(esc && filenum && seqnum && seqid);
  /* lazily create the cache */
  if (!esc->grep_cache)
    esc->grep_cache = gt_seq_info_cache_new();
  /* answer from the cache if this ID has been resolved before */
  cached = gt_seq_info_cache_get(esc->grep_cache, gt_str_get(seqid));
  if (cached) {
    *filenum = cached->filenum;
    *seqnum = cached->seqnum;
    return 0;
  }
  while (!had_err && idx < gt_encseq_num_of_sequences(esc->encseq)) {
    GtUword desc_len;
    const char *desc = gt_encseq_description(esc->encseq, &desc_len, idx);
    /* descriptions come with an explicit length, so work on a
       \0-terminated copy */
    char *desc_copy = gt_calloc(desc_len + 1, sizeof (char));

    memcpy(desc_copy, desc, desc_len * sizeof (char));
    had_err = gt_grep(&match, gt_str_get(seqid), desc_copy, err);
    gt_free(desc_copy);
    if (!had_err && match) {
      found.filenum = gt_encseq_filenum(esc->encseq,
                                        gt_encseq_seqstartpos(esc->encseq,
                                                              idx));
      *filenum = found.filenum;
      /* sequence number relative to the first sequence of its file */
      found.seqnum = idx - gt_encseq_filenum_first_seqnum(esc->encseq,
                                                          *filenum);
      *seqnum = found.seqnum;
      gt_seq_info_cache_add(esc->grep_cache, gt_str_get(seqid), &found);
      break;
    }
    idx++;
  }
  if (!had_err && !match) {
    gt_error_set(err, "no description matched sequence ID '%s'",
                 gt_str_get(seqid));
    had_err = -1;
  }
  return had_err;
}
コード例 #19
0
ファイル: giextract.c プロジェクト: 9beckert/TIR
/* Write sequence <seqnum> of <encseq> to <fpout> in FASTA format with lines
   of width <linewidth>. If <fastakeyquery> is not NULL and does not denote a
   complete sequence (according to COMPLETE()), the header starts with the
   key and the 1-based range of the query, and only the characters beginning
   at offset frompos - 1 are written, topos - frompos + 1 of them; otherwise
   the whole sequence is written. The complete record, including the
   key/range prefix of the header, goes to <fpout>. <err> is not used.
   Always returns 0. */
static int giextract_encodedseq2fasta(FILE *fpout,
                                      const GtEncseq *encseq,
                                      unsigned long seqnum,
                                      const Fastakeyquery *fastakeyquery,
                                      unsigned long linewidth,
                                      GT_UNUSED GtError *err)
{
  const char *desc;
  unsigned long desclen, seqstartpos, seqlength, frompos, length;
  const bool partial = fastakeyquery != NULL && !COMPLETE(fastakeyquery);

  /* header line */
  desc = gt_encseq_description(encseq, &desclen, seqnum);
  gt_xfputc('>',fpout);
  if (partial)
  {
    /* must go to fpout like the rest of the record; printing to stdout
       would tear the header apart whenever fpout is not stdout */
    fprintf(fpout,"%s %lu %lu ",fastakeyquery->fastakey,
                                fastakeyquery->frompos,
                                fastakeyquery->topos);
  }
  gt_xfwrite(desc,sizeof *desc,(size_t) desclen,fpout);
  gt_xfputc('\n',fpout);

  /* sequence lines */
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum);
  seqlength = gt_encseq_seqlength(encseq, seqnum);
  if (partial)
  {
    /* the query range is 1-based and inclusive */
    frompos = fastakeyquery->frompos-1;
    length = fastakeyquery->topos - fastakeyquery->frompos + 1;
  } else
  {
    frompos = 0;
    length = seqlength;
  }
  gt_encseq2symbolstring(fpout,
                         encseq,
                         GT_READMODE_FORWARD,
                         seqstartpos + frompos,
                         length,
                         linewidth);
  return 0;
}
コード例 #20
0
ファイル: encseq_col.c プロジェクト: kowsky/genometools
/* Return a newly allocated buffer of end - start + 1 bytes holding the
   decoded characters at the offsets <start>..<end> (both inclusive) of
   sequence <seqnum> of file <filenum> in the encseq-based sequence
   collection <sc>. <seqnum> is counted relative to the first sequence of the
   file. */
static char* gt_encseq_col_get_sequence(const GtSeqCol *sc,
                                        GtUword filenum,
                                        GtUword seqnum,
                                        GtUword start,
                                        GtUword end)
{
  GtEncseqCol *esc = gt_encseq_col_cast(sc);
  GtUword global_seqnum, seqoffset;
  char *decoded;

  gt_assert(esc && filenum < gt_encseq_num_of_files(esc->encseq));
  /* convert the file-relative sequence number into a global one */
  global_seqnum = gt_encseq_filenum_first_seqnum(esc->encseq, filenum)
                    + seqnum;
  gt_assert(global_seqnum < gt_encseq_num_of_sequences(esc->encseq));
  gt_assert(start <= end);
  seqoffset = gt_encseq_seqstartpos(esc->encseq, global_seqnum);
  decoded = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, decoded,
                            seqoffset + start,
                            seqoffset + end);
  return decoded;
}
コード例 #21
0
/* Check by assertions that a match of length <len> in reverse readmode is
   correct: the <len> characters starting at the absolute position <pos1>
   must equal, read backwards, the <len> characters starting at offset <pos2>
   of sequence <seqnum2>, none of them may be special, and the match must not
   be extendable (the character following the first instance differs from
   the character preceding the second one, or is special). Nothing is
   checked for any other <readmode>. */
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword idx, totallength, querypos;
  GtUchar dbchar, querychar;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  /* make the query position absolute */
  querypos = pos2 + gt_encseq_seqstartpos(encseq, seqnum2);
  for (idx = 0; idx < len; idx++)
  {
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(querypos + len - 1 < totallength);
    dbchar = gt_encseq_get_encoded_char(encseq,pos1+idx,
                                        GT_READMODE_FORWARD);
    /* the second instance is traversed from its end to its start */
    querychar = gt_encseq_get_encoded_char(encseq,querypos+len-1-idx,
                                           GT_READMODE_FORWARD);
    gt_assert(dbchar == querychar && ISNOTSPECIAL(dbchar));
  }
  /* characters bordering the match; SEPARATOR stands in at the boundaries
     of the encoded sequence */
  dbchar = (pos1 + len < totallength)
             ? gt_encseq_get_encoded_char(encseq,pos1+len,
                                          GT_READMODE_FORWARD)
             : (GtUchar) SEPARATOR;
  querychar = (querypos > 0)
                ? gt_encseq_get_encoded_char(encseq,querypos-1,
                                             GT_READMODE_FORWARD)
                : (GtUchar) SEPARATOR;
  gt_assert(dbchar != querychar || ISSPECIAL(dbchar));
}
コード例 #22
0
ファイル: rdj-contfind-bottomup.c プロジェクト: 9beckert/TIR
/* Prepare state->sspbittab and determine the length of the shortest
   sequence. The bit table gets totallength + 1 bits; the bit of every
   position directly preceding the start of a sequence (except the first
   sequence) and the bit at <totallength> are set. state->shortest is set to
   the smallest distance between two consecutive marked positions, i.e. the
   length of the shortest sequence. */
static void prepare_sspbittab_and_shortest(unsigned long totallength,
    ContfindBUstate *state)
{
  unsigned long length, lastseqstart, i, ssp;

  GT_INITBITTAB(state->sspbittab, totallength + 1);
  lastseqstart = 0;
  state->shortest = totallength;
  /* i < nofsequences instead of i <= nofsequences - 1: the latter wraps
     around for nofsequences == 0 and would never terminate properly */
  for (i = 1UL; i < state->nofsequences; i++)
  {
    /* position of the separator in front of sequence i */
    ssp = gt_encseq_seqstartpos(state->encseq, i) - 1;
    GT_SETIBIT(state->sspbittab, ssp);
    length = ssp - lastseqstart;
    lastseqstart = ssp + 1;
    if (length < state->shortest)
      state->shortest = length;
  }
  /* the last sequence ends at totallength */
  GT_SETIBIT(state->sspbittab, totallength);
  length = totallength - lastseqstart;
  if (length < state->shortest)
    state->shortest = length;
}
コード例 #23
0
ファイル: encseq_col.c プロジェクト: kowsky/genometools
/* Look up the sequence identified by the MD5 sequence ID <md5_seqid> in the
   encseq-based sequence collection <sc> and store a newly allocated buffer
   of end - start + 1 bytes with the decoded characters at the offsets
   <start>..<end> (both inclusive) of that sequence in <seq>.
   <md5_seqid> must start with the MD5 prefix. If it is long enough to
   contain a separator after the hash, that separator must be present.
   Returns 0 on success. Returns -1 with <err> set, leaving <seq> untouched,
   if the ID is malformed or no sequence with that MD5 hash is known. */
static int gt_encseq_col_md5_to_seq(GtSeqCol *sc, char **seq,
                                    GtUword start, GtUword end,
                                    GtStr *md5_seqid, GtError *err)
{
  GtUword seqnum, idlen, startpos;
  char seqid[GT_MD5_SEQID_HASH_LEN + 1];
  const char *cstrseqid;
  GtEncseqCol *esc;
  esc = gt_encseq_col_cast(sc);
  gt_error_check(err);
  gt_assert(esc && seq && start <= end && md5_seqid && err);
  gt_assert(gt_md5_seqid_has_prefix(gt_str_get(md5_seqid)));
  cstrseqid = gt_str_get(md5_seqid);
  idlen = gt_str_length(md5_seqid);
  if (idlen >= GT_MD5_SEQID_TOTAL_LEN
        && cstrseqid[GT_MD5_SEQID_TOTAL_LEN-1] != GT_MD5_SEQID_SEPARATOR) {
    gt_error_set(err, "MD5 sequence id %s not terminated with '%c'",
                 gt_str_get(md5_seqid), GT_MD5_SEQID_SEPARATOR);
    /* stop here: a lookup would use an undefined hash and could set <err>
       a second time or allocate *seq although an error is returned */
    return -1;
  }
  /* Extract the hash following the prefix. This is done for short IDs (no
     separator, no trailing part) as well, so that <seqid> is always
     initialized; strncpy() stops at the terminating \0 of a short ID and
     pads the rest with \0.
     NOTE(review): assumes a bare "<prefix><hash>" ID is acceptable --
     confirm against the callers. */
  seqid[0] = '\0';
  if (idlen > GT_MD5_SEQID_PREFIX_LEN) {
    strncpy(seqid, cstrseqid + GT_MD5_SEQID_PREFIX_LEN,
            GT_MD5_SEQID_HASH_LEN);
  }
  seqid[GT_MD5_SEQID_HASH_LEN] = '\0';
  seqnum = gt_md5_tab_map(esc->md5_tab, seqid);
  if (seqnum == GT_UNDEF_UWORD) {
    gt_error_set(err, "sequence %s not found", gt_str_get(md5_seqid));
    return -1;
  }
  startpos = gt_encseq_seqstartpos(esc->encseq, seqnum);
  *seq = gt_calloc(end - start + 1, sizeof (char));
  gt_encseq_extract_decoded(esc->encseq, (char*) *seq, startpos + start,
                            startpos + end);
  return 0;
}
コード例 #24
0
ファイル: idxlocalisw.c プロジェクト: AnnSeidel/genometools
/* Call applysmithwaterman() once for every sequence of <encseq>, passing
   <dpresource>, the sequence number, the start position of the sequence,
   the position directly following its last character, and the query
   <query> of length <querylen>. */
void gt_multiapplysmithwaterman(SWdpresource *dpresource,
                             const GtEncseq *encseq,
                             const GtUchar *query,
                             GtUword querylen)
{
  const GtUword numofdbsequences = gt_encseq_num_of_sequences(encseq);
  GtUword seqnum;

  for (seqnum = 0; seqnum < numofdbsequences; seqnum++)
  {
    const GtUword dbstart = gt_encseq_seqstartpos(encseq, seqnum),
                  dbend = dbstart + gt_encseq_seqlength(encseq, seqnum);

    applysmithwaterman(dpresource, encseq, seqnum, dbstart, dbend, query,
                       querylen);
  }
}
コード例 #25
0
ファイル: gt_repfind.c プロジェクト: lparsons/genometools
/* Process an exact self match of length <len> whose instances start at the
   absolute positions <pos1> and <pos2> of the encoded sequence referenced by
   <genericencseq>. The smaller position is passed on as absolute position,
   the larger one as a position relative to the start of the sequence
   containing it. The GtQuerymatch passed via <info> is filled accordingly
   and handed to gt_querymatch_output(), whose return value is returned. */
static int gt_simpleexactselfmatchoutput(void *info,
                                         const GtGenericEncseq *genericencseq,
                                         GtUword len,
                                         GtUword pos1,
                                         GtUword pos2,
                                         GT_UNUSED GtError *err)
{
  GtQuerymatch *querymatch = (GtQuerymatch *) info;
  /* the smaller position plays the database role, the larger one the query
     role */
  const GtUword dbpos = pos1 > pos2 ? pos2 : pos1,
                querypos = pos1 > pos2 ? pos1 : pos2;
  GtUword queryseqnum, queryseqstart, queryseqlength;
  const GtEncseq *encseq;

  gt_assert(genericencseq != NULL && genericencseq->hasencseq);
  encseq = genericencseq->seqptr.encseq;
  queryseqnum = gt_encseq_seqnum(encseq,querypos);
  queryseqstart = gt_encseq_seqstartpos(encseq, queryseqnum);
  queryseqlength = gt_encseq_seqlength(encseq, queryseqnum);
  gt_assert(querypos >= queryseqstart);
  gt_querymatch_fill(querymatch,
                     len,
                     dbpos,
                     GT_READMODE_FORWARD,
                     false,
                     0,
                     0,
                     true,
                     (uint64_t) queryseqnum,
                     len,
                     querypos - queryseqstart);
  return gt_querymatch_output(info, encseq, querymatch, NULL, queryseqlength,
                              err);
}
コード例 #26
0
/* Write the content of <encseq> to stdout in the mode given by args->mode.
   "fasta":  write a single sequence (args->seq), a range of sequences
             (args->seqrng) or all sequences, each as a header line followed
             by the decoded sequence on one line. If <encseq> has no
             description support, a warning mentioning <filename> is issued
             and "sequence <number>" is used as header.
   "concat": write the positions args->rng.start..args->rng.end (or the whole
             encoded sequence if no range is given) on one line, replacing
             separators by the first character of args->sepchar.
   Characters are decoded in readmode args->rm, either one by one via random
   access (args->singlechars) or via a sequential reader.
   Returns 0 on success and -1 with <err> set if a requested sequence number
   or range end lies outside of <encseq>. */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;
  gt_assert(encseq);

  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    /* determine the half-open interval [sfrom, sto) of sequence numbers */
    /* specify a single sequence to extract */
    if (args->seq != GT_UNDEF_UWORD) {
      if (args->seq >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= gt_encseq_num_of_sequences(encseq)
            || args->seqrng.end >= gt_encseq_num_of_sequences(encseq)) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start,
                     args->seqrng.end,
                     gt_encseq_num_of_sequences(encseq));
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = gt_encseq_num_of_sequences(encseq);
    }
    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len;
      char buf[BUFSIZ];
      const char *desc = NULL;
      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        /* forward direction: sequence i is used as is */
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq, i);
        if (has_desc) {
          desc = gt_encseq_description(encseq, &desclen, i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      } else {
        /* reverse direction: the i-th sequence to output is the sequence
           with number (number of sequences - 1 - i), and its start position
           is computed from the end of the encoded sequence */
        /* NOTE(review): the value assigned to startpos here is overwritten
           below before it is used */
        startpos = gt_encseq_seqstartpos(encseq, i);
        len = gt_encseq_seqlength(encseq,
                                  gt_encseq_num_of_sequences(encseq)-1-i);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq,
                                              gt_encseq_num_of_sequences(
                                                encseq)-1-i) + len);
        if (has_desc) {
          desc = gt_encseq_description(encseq,
                                       &desclen,
                                       gt_encseq_num_of_sequences(encseq)-1-i);
        } else {
          (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
          desclen = strlen(buf);
          desc = buf;
        }
      }
      gt_assert(desc);
      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);
      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        /* random access to every single position */
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_get_decoded_char(encseq,
                                                startpos + j,
                                                args->rm),
                     stdout);
        }
      } else {
        /* sequential access via a reader positioned at startpos */
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, startpos);
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    /* default: the whole encoded sequence, positions 0..total length - 1 */
    GtUword from = 0,
                  to = gt_encseq_total_length(encseq) - 1;
    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      /* only the end of the requested range is validated here */
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          /* show separators as the user-defined separator character */
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            /* show separators as the user-defined separator character */
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
コード例 #27
0
ファイル: gt_repfind.c プロジェクト: lparsons/genometools
/* Match callback for a seed of length <len> found at the absolute positions
   <pos1> and <pos2> of the encoded sequence referenced by <genericencseq>.
   <info> is interpreted as a GtXdropmatchinfo. The seed is extended on both
   sides with gt_evalxdroparbitscoresextend() (results are stored in
   best_left and best_right), the extended match is written into the
   querymatch space of <info> and handed to gt_querymatch_output(), whose
   return value is returned. */
static int gt_simplexdropselfmatchoutput(void *info,
                                         const GtGenericEncseq *genericencseq,
                                         GtUword len,
                                         GtUword pos1,
                                         GtUword pos2,
                                         GtError *err)
{
  GtXdropmatchinfo *xdrop = (GtXdropmatchinfo *) info;
  const GtEncseq *encseq;
  GtUword totallength;
  GtUword dbseqnum, dbseqstart, dbseqlen;
  GtUword queryseqnum, queryseqstart, queryseqlen;
  GtUword dbstart, dblen, querystart, querylen;
  GtXdropscore score;

  gt_assert(genericencseq != NULL && genericencseq->hasencseq);
  encseq = genericencseq->seqptr.encseq;
  totallength = gt_encseq_total_length(encseq);

  /* from here on pos1 is the smaller of the two positions */
  if (pos2 < pos1)
  {
    const GtUword larger = pos1;

    pos1 = pos2;
    pos2 = larger;
  }

  dbseqnum = gt_encseq_seqnum(encseq, pos1);
  dbseqstart = gt_encseq_seqstartpos(encseq, dbseqnum);
  dbseqlen = gt_encseq_seqlength(encseq, dbseqnum);

  if (pos2 >= dbseqstart + dbseqlen)
  {
    /* pos2 lies beyond the sequence containing pos1: look up its sequence */
    queryseqnum = gt_encseq_seqnum(encseq, pos2);
    gt_assert(dbseqnum < queryseqnum);
    queryseqstart = gt_encseq_seqstartpos(encseq, queryseqnum);
    queryseqlen = gt_encseq_seqlength(encseq, queryseqnum);
  } else
  {
    /* both positions belong to the same sequence */
    queryseqnum = dbseqnum;
    queryseqstart = dbseqstart;
    queryseqlen = dbseqlen;
  }

  /* extension whose result is stored in best_left */
  if (pos1 == 0 || pos2 == 0)
  {
    xdrop->best_left.ivalue = 0;
    xdrop->best_left.jvalue = 0;
    xdrop->best_left.score = 0;
  } else
  {
    gt_assert(pos1 >= dbseqstart && pos2 >= queryseqstart);
    gt_seqabstract_reinit_encseq(xdrop->useq, encseq, pos1 - dbseqstart, 0);
    gt_seqabstract_reinit_encseq(xdrop->vseq, encseq, pos2 - queryseqstart, 0);
    gt_evalxdroparbitscoresextend(false,
                                  &xdrop->best_left,
                                  xdrop->res,
                                  xdrop->useq,
                                  xdrop->vseq,
                                  pos1,
                                  pos2,
                                  xdrop->belowscore);
  }

  /* extension whose result is stored in best_right */
  if (pos1 + len >= totallength || pos2 + len >= totallength)
  {
    xdrop->best_right.ivalue = 0;
    xdrop->best_right.jvalue = 0;
    xdrop->best_right.score = 0;
  } else
  {
    const GtUword dbseqend = dbseqstart + dbseqlen;
    const GtUword queryseqend = queryseqstart + queryseqlen;

    gt_assert(dbseqend >= pos1 + len && queryseqend >= pos2 + len);
    gt_seqabstract_reinit_encseq(xdrop->useq, encseq,
                                 dbseqend - (pos1 + len), 0);
    gt_seqabstract_reinit_encseq(xdrop->vseq, encseq,
                                 queryseqend - (pos2 + len), 0);
    gt_evalxdroparbitscoresextend(true,
                                  &xdrop->best_right,
                                  xdrop->res,
                                  xdrop->useq,
                                  xdrop->vseq,
                                  pos1 + len,
                                  pos2 + len,
                                  xdrop->belowscore);
  }

  /* combine the seed with both extensions */
  gt_assert(pos1 >= (GtUword) xdrop->best_left.ivalue &&
            pos2 >= (GtUword) xdrop->best_left.jvalue);
  querystart = pos2 - xdrop->best_left.jvalue;
  gt_assert(querystart >= queryseqstart);
  dbstart = pos1 - xdrop->best_left.ivalue;
  dblen = len + xdrop->best_left.ivalue
              + xdrop->best_right.ivalue;
  querylen = len + xdrop->best_left.jvalue
                 + xdrop->best_right.jvalue;
  score = (GtXdropscore) len * xdrop->arbitscores.mat +
          xdrop->best_left.score +
          xdrop->best_right.score;

  /* useq/vseq now describe the complete extended match on either side */
  gt_seqabstract_reinit_encseq(xdrop->useq, encseq, dblen, dbstart);
  gt_seqabstract_reinit_encseq(xdrop->vseq, encseq, querylen, querystart);
  gt_querymatch_fill(xdrop->querymatchspaceptr,
                     dblen,
                     dbstart,
                     GT_READMODE_FORWARD,
                     false,
                     score,
                     greedyunitedist(xdrop->frontresource,
                                     xdrop->useq, xdrop->vseq),
                     true,
                     (uint64_t) queryseqnum,
                     querylen,
                     querystart - queryseqstart);
  return gt_querymatch_output(info, encseq, xdrop->querymatchspaceptr,
                              NULL, gt_encseq_seqlength(encseq, queryseqnum),
                              err);
}
コード例 #28
0
/* For every offset of the query <queryrep> at which a substring of length
   <minmatchlength> still fits, search this substring in the suffix table
   <suftabpart> (with <numberofsuffixes> entries) of <dbencseq>. Each hit
   that is left maximal is extended to the right, stored in
   <querymatchspaceptr> via gt_querymatch_init() and reported by calling
   <processquerymatch> with <processquerymatchinfo>.
   <queryunitnum> is the number of the first query unit; the unit number is
   incremented and the offset inside the unit is reset to 0 each time the
   query character at the current offset is a SEPARATOR.
   A query shorter than <minmatchlength> yields no matches. */
static void gt_querysubstringmatch(bool selfmatch,
                                   const GtEncseq *dbencseq,
                                   const ESASuffixptr *suftabpart,
                                   GtReadmode readmode,
                                   GtUword numberofsuffixes,
                                   uint64_t queryunitnum,
                                   GtQueryrepresentation *queryrep,
                                   GtUword minmatchlength,
                                   GtProcessquerymatch processquerymatch,
                                   void *processquerymatchinfo,
                                   GtQuerymatch *querymatchspaceptr)
{
  GtMMsearchiterator *mmsi;
  GtUword totallength, localqueryoffset = 0;
  uint64_t localqueryunitnum = queryunitnum;
  GtQuerysubstring querysubstring;

  gt_assert(numberofsuffixes > 0);
  /* The loop bound below is computed as seqlen - minmatchlength; return
     early for short queries so that this subtraction cannot wrap around. */
  if (queryrep->seqlen < minmatchlength)
  {
    return;
  }
  totallength = gt_encseq_total_length(dbencseq);
  querysubstring.queryrep = queryrep;
  for (querysubstring.currentoffset = 0;
       querysubstring.currentoffset <= queryrep->seqlen - minmatchlength;
       querysubstring.currentoffset++)
  {
    GtUword dbstart;

    /* a fresh iterator over the whole suffix table for each query offset */
    mmsi = gt_mmsearchiterator_new(dbencseq,
                                   suftabpart,
                                   0, /* leftbound */
                                   numberofsuffixes - 1, /* rightbound */
                                   0, /* offset */
                                   readmode,
                                   &querysubstring,
                                   minmatchlength);
    while (gt_mmsearchiterator_next(&dbstart,mmsi))
    {
      if (gt_mmsearch_isleftmaximal(dbencseq,
                                    readmode,
                                    dbstart,
                                    &querysubstring))
      {
        GtUword dbseqnum, dbseqstartpos, dbseqlen, extend;

        /* number of additional matching characters behind the seed */
        extend = gt_mmsearch_extendright(dbencseq,
                                         mmsi->esr,
                                         readmode,
                                         totallength,
                                         dbstart + minmatchlength,
                                         &querysubstring,
                                         minmatchlength);

        if (gt_encseq_has_multiseq_support(dbencseq))
        {
          dbseqnum = gt_encseq_seqnum(dbencseq,dbstart);
          dbseqstartpos = gt_encseq_seqstartpos(dbencseq,dbseqnum);
          dbseqlen = gt_encseq_seqlength(dbencseq,dbseqnum);
        } else
        {
          /* no sequence boundaries available: report absolute coordinates */
          dbseqnum = dbseqstartpos = dbseqlen = 0;
        }
        gt_querymatch_init(querymatchspaceptr,
                           minmatchlength + extend,
                           dbstart,
                           dbseqnum,
                           dbstart - dbseqstartpos,
                           dbseqlen,
                           0, /* score */
                           0, /* edist */
                           selfmatch,
                           localqueryunitnum,
                           minmatchlength + extend,
                           localqueryoffset,
                           queryrep->seqlen);
        processquerymatch(processquerymatchinfo,querymatchspaceptr);
      }
    }
    gt_mmsearchiterator_delete(mmsi);
    mmsi = NULL;
    /* keep track of the query unit and of the offset relative to its start */
    if (gt_mmsearch_accessquery(queryrep,querysubstring.currentoffset)
        == (GtUchar) SEPARATOR)
    {
      localqueryunitnum++;
      localqueryoffset = 0;
    } else
    {
      localqueryoffset++;
    }
  }
}
コード例 #29
0
/* For every offset of the query <queryrep> at which <minmatchlength>
   characters still fit, call gt_suffixarrayfindmums() on <suffixarray> for
   the query suffix starting there. If a database position was delivered,
   the match has at least <minmatchlength> characters and
   gt_mum_isleftmaximal() holds, the match is stored in
   <querymatchspaceptr> and reported via <processquerymatch>.
   Must not be called with <selfmatch> set or with a query shorter than
   <minmatchlength> (both are asserted). */
void gt_queryuniquematch(bool selfmatch,
                        const Suffixarray *suffixarray,
                        uint64_t queryunitnum,
                        GtQueryrepresentation *queryrep,
                        GtUword minmatchlength,
                        GtProcessquerymatch processquerymatch,
                        void *processquerymatchinfo,
                        GtQuerymatch *querymatchspaceptr)
{
  GtUword querypos;
  GtUword totallength = gt_encseq_total_length(suffixarray->encseq);
  GtUword offsetinunit = 0;
  uint64_t unitnum = queryunitnum;

  gt_assert(!selfmatch && queryrep->seqlen >= minmatchlength);
  for (querypos = 0;
       querypos <= queryrep->seqlen - minmatchlength;
       querypos++)
  {
    GtUword dbstart;
    GtUword matchlen;

    matchlen = gt_suffixarrayfindmums (suffixarray,
                                       0,
                                       0, /* leftbound */
                                       totallength, /* rightbound */
                                       &dbstart,
                                       queryrep->sequence + querypos,
                                       queryrep->sequence + queryrep->seqlen);
    /* ULONG_MAX in dbstart is treated as "no match position" */
    if (dbstart != ULONG_MAX && matchlen >= minmatchlength)
    {
      if (gt_mum_isleftmaximal(suffixarray->encseq,
                               suffixarray->readmode,
                               dbstart,
                               querypos,
                               queryrep->sequence))
      {
        GtUword dbseqnum, dbseqstartpos, dbseqlen;

        dbseqnum = gt_encseq_seqnum(suffixarray->encseq,dbstart);
        dbseqstartpos = gt_encseq_seqstartpos(suffixarray->encseq,dbseqnum);
        dbseqlen = gt_encseq_seqlength(suffixarray->encseq,dbseqnum);

        gt_querymatch_init(querymatchspaceptr,
                           matchlen,
                           dbstart,
                           dbseqnum,
                           dbstart - dbseqstartpos,
                           dbseqlen,
                           0, /* score */
                           0, /* edist */
                           selfmatch,
                           unitnum,
                           matchlen,
                           offsetinunit,
                           queryrep->seqlen);
        processquerymatch(processquerymatchinfo,querymatchspaceptr);
      }
    }
    /* a SEPARATOR starts the next query unit, whose offsets begin at 0 */
    if (queryrep->sequence[querypos] != (GtUchar) SEPARATOR)
    {
      offsetinunit++;
    } else
    {
      unitnum++;
      offsetinunit = 0;
    }
  }
}
コード例 #30
0
/* Advance <qsmi> to the next left maximal match.
   Returns 0 if a match was found; in this case qsmi->dbstart and
   qsmi->matchlength have been set. Returns 1 if there are no more query
   sequences and -1 if the sequence iterator reported an error (see <err>).
   Queries come from qsmi->seqit if it is not NULL, otherwise from the
   sequences of qsmi->queryrep.encseq. */
int gt_querysubstringmatchiterator_next(GtQuerysubstringmatchiterator *qsmi,
                                        GtError *err)
{
  gt_assert(qsmi != NULL);
  for (;;)
  {
    /* A stored length below the threshold means that there is no current
       query: fetch the next one. */
    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      if (qsmi->seqit == NULL)
      {
        if (qsmi->queryunitnum == qsmi->query_encseq_numofsequences)
        {
          return 1;
        }
        qsmi->queryrep.startpos = gt_encseq_seqstartpos(qsmi->queryrep.encseq,
                                                        qsmi->queryunitnum);
        qsmi->query_seqlen = gt_encseq_seqlength(qsmi->queryrep.encseq,
                                                 qsmi->queryunitnum);
      } else
      {
        const int retval = gt_seq_iterator_next(qsmi->seqit,
                                                &qsmi->query_for_seqit,
                                                &qsmi->query_seqlen,
                                                &qsmi->desc,
                                                err);
        if (retval < 0)
        {
          return -1; /* error */
        }
        if (retval == 0)
        {
          return 1; /* no more sequences */
        }
        gt_assert(qsmi->query_seqlen > 0 && qsmi->query_for_seqit != NULL);
        qsmi->queryrep.sequence = qsmi->query_for_seqit;
      }
      gt_assert(qsmi->query_seqlen > 0);
      qsmi->queryrep.seqlen = qsmi->query_seqlen;
      qsmi->querysubstring.currentoffset = 0;
    }

    /* the query just fetched is too short: skip it */
    if (qsmi->query_seqlen < qsmi->userdefinedleastlength)
    {
      qsmi->query_seqlen = 0;
      qsmi->queryunitnum++;
      continue;
    }

    /* start a new search for the substring at the current offset */
    if (!qsmi->mmsi_defined)
    {
      gt_mmsearchiterator_reinit(qsmi->mmsi,
                                 qsmi->dbencseq,
                                 qsmi->suftabpart,
                                 0, /* l */
                                 qsmi->numberofsuffixes - 1, /* r */
                                 0, /* offset */
                                 qsmi->db_readmode,
                                 &qsmi->querysubstring,
                                 qsmi->userdefinedleastlength);
      qsmi->mmsi_defined = true;
      continue;
    }

    /* search exhausted: go to the next offset or give up this query */
    if (!gt_mmsearchiterator_next(&qsmi->dbstart,qsmi->mmsi))
    {
      qsmi->mmsi_defined = false;
      if (qsmi->querysubstring.currentoffset +
          qsmi->userdefinedleastlength < qsmi->query_seqlen)
      {
        qsmi->querysubstring.currentoffset++;
      } else
      {
        qsmi->query_seqlen = 0;
        qsmi->queryunitnum++;
      }
      continue;
    }

    /* only left maximal hits are reported, extended to the right */
    if (gt_mmsearch_isleftmaximal(qsmi->dbencseq,
                                  qsmi->db_readmode,
                                  qsmi->dbstart,
                                  &qsmi->querysubstring))
    {
      const GtUword extend
        = gt_mmsearch_extendright(qsmi->dbencseq,
                                  qsmi->mmsi->esr,
                                  qsmi->db_readmode,
                                  qsmi->totallength,
                                  qsmi->dbstart +
                                    qsmi->userdefinedleastlength,
                                  &qsmi->querysubstring,
                                  qsmi->userdefinedleastlength);

      qsmi->matchlength = qsmi->userdefinedleastlength + extend;
      return 0;
    }
  }
}