Beispiel #1
0
/* Return the range of sequence <seq_num> in <seq_con> shifted so that it
   starts at 0: the result has start == 0 and end == (end - start) of the range
   delivered by gth_seq_con_get_range(). <seq_con> must not be NULL and
   <seq_num> must be smaller than the number of sequences in <seq_con>. */
GtRange gth_seq_con_get_relative_range(GthSeqCon *seq_con,
                                       unsigned long seq_num)
{
  GtRange absolute, shifted;
  gt_assert(seq_con);
  gt_assert(seq_num < gth_seq_con_num_of_seqs(seq_con));
  absolute = gth_seq_con_get_range(seq_con, seq_num);
  /* move the range to the origin while keeping its extent */
  shifted.end = absolute.end - absolute.start;
  shifted.start = 0;
  return shifted;
}
Beispiel #2
0
/* Compute chains from <matches>, bucket by bucket.

   Unless chaining_info->directmatches is set, the matches are first passed to
   transform_refseq_positions(). They are then sorted and partitioned into
   buckets by sort_matches_and_calc_buckets(). For every bucket a fragment set
   is built with gthinitfragments() and chained with
   gt_globalchaining_coverage() (if the paralogs option is set) or
   gt_globalchaining_max() otherwise; both receive gth_save_chain and a
   GthSaveChainInfo record filled in here.

   If comment output is enabled, the unsorted, transformed and sorted matches
   as well as the buckets are written to the output file. If verbose output is
   enabled, show_chain_calc_status() is called once per bucket.

   <matches> must contain at least one match (asserted). */
static void calc_chains_from_matches(GthChainCollection *chain_collection,
                                     GtArray *matches,
                                     GthChainingInfo *chaining_info,
                                     GthSeqCon *gen_seq_con,
                                     GthSeqCon *ref_seq_con,
                                     GtUword rare,
                                     double fragweightfactor,
                                     GthJumpTableNew jump_table_new,
                                     GthJumpTableNewReverse
                                     jump_table_new_reverse,
                                     GthJumpTableDelete jump_table_delete)
{
  GtUword bucket_idx, chain_estimate = 0, fragment_count, longest_bucket = 0;
  GtRange seq_range;
  GtFile *outfp = chaining_info->call_info->out->outfp;
  GtFragment *fragment_space;
  GthSaveChainInfo save_info;
  GtArray *bucket_list;
  Bucket *cur_bucket;
  GthMatch *first_match;

  /* spot check that no equal matches exist: there is either exactly one
     match, or there are several and the first two of them differ */
  gt_assert(gt_array_size(matches) == 1 ||
            (gt_array_size(matches) > 1 &&
             !gth_matches_are_equal(gt_array_get(matches, 0),
                                    gt_array_get(matches, 1))));

  bucket_list = gt_array_new(sizeof (Bucket));

  /* show the matches as they came in */
  if (chaining_info->call_info->out->comments) {
    gt_file_xprintf(outfp, "%c output unsorted matches\n", COMMENTCHAR);
    showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp);
  }

  /* non-direct matches: move reference positions to the opposite strand */
  if (!chaining_info->directmatches) {
    if (chaining_info->call_info->out->comments) {
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
      gt_file_xprintf(outfp, "%c transform reference sequence positions "
                                "to opposite strand\n", COMMENTCHAR);
      gt_file_xprintf(outfp, "%c\n", COMMENTCHAR);
    }

    transform_refseq_positions(matches, ref_seq_con);

    /* show the matches after the transformation */
    if (chaining_info->call_info->out->comments) {
      gt_file_xprintf(outfp, "%c output transformed matches\n", COMMENTCHAR);
      showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp);
    }
  }

  /* sort the matches; this also fills the bucket list and reports the length
     of the longest bucket */
  sort_matches_and_calc_buckets(matches, bucket_list, &longest_bucket);

  /* show the sorted matches followed by the buckets */
  if (chaining_info->call_info->out->comments) {
    gt_file_xprintf(outfp, "%c output sorted matches\n", COMMENTCHAR);
    showmatches(gt_array_get_space(matches), gt_array_size(matches), outfp);
    gt_file_xprintf(outfp, "%c output buckets\n", COMMENTCHAR);
    outputbuckets(bucket_list, gt_array_get_space(matches), outfp);
  }

  /* one fragment buffer, sized for the longest bucket, is reused for all
     buckets */
  fragment_space = gt_malloc(longest_bucket * sizeof *fragment_space);

  /* bucket-independent part of the data handed to gth_save_chain */
  save_info.chain_collection       = chain_collection;
  save_info.outfp                  = outfp;
  save_info.jump_table_new         = jump_table_new;
  save_info.jump_table_new_reverse = jump_table_new_reverse;
  save_info.jump_table_delete      = jump_table_delete;
  /* ... taken from the chaining info itself */
  save_info.stat                   = chaining_info->stat;
  save_info.jtdebug                = chaining_info->jtdebug;
  save_info.directmatches          = chaining_info->directmatches;
  save_info.gen_file_num           = chaining_info->gen_file_num;
  save_info.ref_file_num           = chaining_info->ref_file_num;
  /* ... taken from the call info */
  save_info.gcmincoverage          = chaining_info->call_info->gcmincoverage;
  save_info.comments               = chaining_info->call_info->out->comments;
  save_info.stopafterchaining      = chaining_info->call_info->simfilterparam
                                     .stopafterchaining;
  save_info.paralogs               = chaining_info->call_info->simfilterparam
                                     .paralogs;
  save_info.enrichchains           = chaining_info->call_info->simfilterparam
                                     .enrichchains;
  save_info.jump_table             = chaining_info->call_info->simfilterparam
                                     .jump_table;

  /* every bucket is chained separately */
  for (bucket_idx = 0; bucket_idx < gt_array_size(bucket_list); bucket_idx++) {
    cur_bucket = gt_array_get(bucket_list, bucket_idx);

    /* the number of chains is only needed for the verbose status output */
    if (chaining_info->call_info->out->showverbose) {
      if (!chaining_info->refseqisindex ||
          chaining_info->call_info->simfilterparam.online) {
        /* the exact number is not known here, so use an upper bound ... */
        chain_estimate = chaining_info->bucketnum +
                         gth_seq_con_num_of_seqs(gen_seq_con) *
                         (gth_seq_con_num_of_seqs(ref_seq_con) -
                          cur_bucket->seqnum1);

        /* ... and let bound and stored maximum agree on the smaller value */
        if (chain_estimate < chaining_info->maxbucketnum)
          chaining_info->maxbucketnum = chain_estimate;
        else
          chain_estimate = chaining_info->maxbucketnum;
      }
      else {
        /* in this case the exact number of chains is known */
        chain_estimate = gt_array_size(bucket_list);
      }
    }

    /* turn the matches of this bucket into fragments */
    gthinitfragments(fragment_space, &fragment_count,
                     (GthMatch*) gt_array_get_space(matches) +
                     cur_bucket->startpos,
                     cur_bucket->length, rare, fragweightfactor);

    if (chaining_info->call_info->out->showverbose) {
      show_chain_calc_status (chaining_info->call_info->out->showverbose,
                              ++chaining_info->bucketnum, chain_estimate,
                              fragment_count, chaining_info->gen_file_num,
                              gth_input_num_of_gen_files(chaining_info->input),
                              chaining_info->ref_file_num,
                              gth_input_num_of_ref_files(chaining_info->input),
                              chaining_info->directmatches,
                              chaining_info->call_info->out->verboseseqs,
                              cur_bucket->seqnum2, cur_bucket->seqnum1);
    }

    /* the sequence numbers are read from the first match of the bucket */
    first_match = gt_array_get(matches, cur_bucket->startpos);
    save_info.gen_seq_num = first_match->Storeseqnumgenomic;
    save_info.ref_seq_num = first_match->Storeseqnumreference;

    /* length and offset of the genomic sequence */
    seq_range = gth_seq_con_get_range(gen_seq_con, save_info.gen_seq_num);
    save_info.gen_offset       = seq_range.start;
    save_info.gen_total_length = seq_range.end - seq_range.start + 1;

    /* length and offset of the reference sequence */
    seq_range = gth_seq_con_get_range(ref_seq_con, save_info.ref_seq_num);
    save_info.ref_offset       = seq_range.start;
    save_info.ref_total_length = seq_range.end - seq_range.start + 1;
    save_info.referencelength  = seq_range.end - seq_range.start + 1;

    /* number of buckets not yet finished (the current one included) */
    save_info.numofremainingbuckets = gt_array_size(bucket_list) - bucket_idx;

    if (!chaining_info->call_info->simfilterparam.paralogs) {
      gt_globalchaining_max(fragment_space, fragment_count,
                            chaining_info->call_info->gcmaxgapwidth,
                            gth_save_chain, &save_info);
    }
    else {
      /* gcmincoverage is divided by 100.0 before being passed on */
      gt_globalchaining_coverage(fragment_space, fragment_count,
                                 chaining_info->call_info->gcmaxgapwidth,
                                 save_info.referencelength,
                                 ((double)
                                  chaining_info->call_info->gcmincoverage) /
                                  100.0, gth_save_chain, &save_info);
    }
  }

  /* release the fragment buffer and the bucket list */
  gt_free(fragment_space);
  gt_array_delete(bucket_list);
}