Example #1
0
/* Tries to insert <sa> into the spliced alignment collection referenced by
   <data> (which must point to a SACollectionData). If the insertion is not
   successful, <sa> is deleted. <outputfilename> and <err> are not used.
   Always returns 0. */
static int store_in_sa_collection(void *data, GthSA *sa,
                                  GT_UNUSED const char *outputfilename,
                                  GT_UNUSED GtError *err)
{
  SACollectionData *info = data;

  if (gth_sa_collection_insert_sa(info->sa_collection, sa, info->sa_filter,
                                  info->stat)) {
    /* successful insertion; nothing left to do */
    return 0;
  }

  /* unsuccessful insertion; <sa> is disposed of here */
  gth_sa_delete(sa);
  return 0;
}
/* Tries to insert <sa> into <sa_collection> (using <sa_filter> and <stat>).
   On success, <match_info>->significant_match_found is set to true.
   Otherwise <sa> is deleted and <match_info>->call_number is decremented. */
static void save_sa(GthSACollection *sa_collection, GthSA *sa,
                    GthSAFilter *sa_filter, GthMatchInfo *match_info,
                    GthStat *stat)
{
  if (gth_sa_collection_insert_sa(sa_collection, sa, sa_filter, stat)) {
    /* successful insertion */
    match_info->significant_match_found = true;
    return;
  }

  /* unsuccessful insertion; discard sa and take back its call number */
  gth_sa_delete(sa);
  match_info->call_number--;
}
Example #3
0
/* Writes <sa> to one of several subset files and deletes it afterwards.
   <data> must point to a Store_in_subset_file_data.

   The alignment is first passed through the spliced alignment filter; if it
   is filtered out, it is deleted and 0 is returned. Otherwise the split mode
   (alignment score or coverage) yields a value which selects the subset file.
   The subset file is created on first use (its name is derived from
   <outputfilename>, a mode specific suffix, and the covered range) and the
   XML leader is written to it. Unless forcing is enabled, an already existing
   file of that name is reported via <err>.

   Returns 0 on success and -1 if an error was set. <sa> is deleted in both
   cases. */
static int store_in_subset_file(void *data, GthSA *sa,
                                const char *outputfilename, GtError *err)
{
  Store_in_subset_file_data *store_in_subset_file_data =
    (Store_in_subset_file_data*) data;
  double split_determing_percentage = 0.0;
  unsigned long filenum;
  /* zero-initialized, so that the suffix is a valid (empty) string even if
     the assertion in the default case below is compiled out */
  char filenamesuffix[4] = "";
  int had_err = 0;

  gt_error_check(err);

  /* filter before we do any further processing */
  if (gth_sa_filter_filter_sa(store_in_subset_file_data->sa_filter, sa)) {
    /* and free it afterwards */
    gth_sa_delete(sa);
    /* discard */
    return 0;
  }

  /* check whether we got a new output file to process */
  if (!store_in_subset_file_data->current_outputfilename) {
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }
  else if (strcmp(store_in_subset_file_data->current_outputfilename,
                  outputfilename)) {
    /* close current output files */
    close_output_files(store_in_subset_file_data);
    gt_free(store_in_subset_file_data->current_outputfilename);
    /* remember the new name; leaving the freed pointer in place would make
       the next call compare against (and possibly free again) released
       memory */
    store_in_subset_file_data->current_outputfilename =
      gt_cstr_dup(outputfilename);
  }

  /* determine in which file the current sa needs to be put */
  switch (store_in_subset_file_data->gthsplitinfo->splitmode) {
    case ALIGNMENTSCORE_SPLIT:
      split_determing_percentage = gth_sa_score(sa);
      strcpy(filenamesuffix, "scr");
      break;
    case COVERAGE_SPLIT:
      split_determing_percentage = gth_sa_coverage(sa);
      strcpy(filenamesuffix, "cov");
      break;
    default: gt_assert(0);
  }
  gt_assert(split_determing_percentage >= 0.0);
  /* XXX: change into an assertion when coverage problem is fixed */
  if (split_determing_percentage > 1.0)
    split_determing_percentage = 1.0;

  /* a value of exactly 1.0 goes into the last file; everything else goes
     into the file whose range contains the value (in percent) */
  if (split_determing_percentage == 1.0)
    filenum = store_in_subset_file_data->num_of_subset_files - 1;
  else {
    filenum =  floor(split_determing_percentage * 100.0 /
                           store_in_subset_file_data->gthsplitinfo->range);
  }
  gt_assert(filenum < store_in_subset_file_data->num_of_subset_files);

  /* make sure the file exists and is open */
  if (!store_in_subset_file_data->subset_files[filenum]) {
    gt_assert(store_in_subset_file_data->subset_filenames[filenum] == NULL);
    /* file name: <basename>.<suffix><from>-<to><file mode suffix> */
    store_in_subset_file_data->subset_filenames[filenum] = gt_str_new();
    gt_str_append_cstr_nt(store_in_subset_file_data->subset_filenames[filenum],
                          outputfilename,
                          gt_file_basename_length(outputfilename));
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '.');
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       filenamesuffix);
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                        filenum *
                        store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_char(store_in_subset_file_data->subset_filenames[filenum],
                       '-');
    gt_str_append_ulong(store_in_subset_file_data->subset_filenames[filenum],
                     (filenum + 1) *
                     store_in_subset_file_data->gthsplitinfo->range);
    gt_str_append_cstr(store_in_subset_file_data->subset_filenames[filenum],
                       gt_file_mode_suffix(store_in_subset_file_data
                                           ->gthsplitinfo->file_mode));

    /* if not disabled by -force, check if file already exists */
    if (!store_in_subset_file_data->gthsplitinfo->force) {
      store_in_subset_file_data->subset_files[filenum] =
        gt_file_open(store_in_subset_file_data->gthsplitinfo->file_mode,
                     gt_str_get(store_in_subset_file_data
                                ->subset_filenames[filenum]), "r", NULL);
      if (store_in_subset_file_data->subset_files[filenum]) {
        /* NOTE(review): the handle opened for reading stays in
           subset_files[filenum]; presumably it is released by the cleanup of
           the subset file data -- confirm */
        gt_error_set(err, "file \"%s\" exists already. use option -%s to "
                     "overwrite", gt_str_get(store_in_subset_file_data
                                             ->subset_filenames[filenum]),
                     GT_FORCE_OPT_CSTR);
        had_err = -1;
      }
    }
    if (!had_err) {
      /* open split file for writing */
      store_in_subset_file_data->subset_files[filenum] =
          gt_file_xopen_file_mode(store_in_subset_file_data->gthsplitinfo
                                  ->file_mode,
                                  gt_str_get(store_in_subset_file_data
                                             ->subset_filenames[filenum]), "w");
      /* store XML header in file */
      gth_xml_show_leader(true,
                          store_in_subset_file_data->subset_files[filenum]);
    }
  }

  /* put it there */
  if (!had_err) {
    gth_xml_inter_sa_visitor_set_outfp(store_in_subset_file_data->sa_visitor,
                                       store_in_subset_file_data
                                       ->subset_files[filenum]);
    gth_sa_visitor_visit_sa(store_in_subset_file_data->sa_visitor, sa);
  }

  /* adjust counter */
  if (!had_err)
    store_in_subset_file_data->subset_file_sa_counter[filenum]++;

  /* and free it afterwards */
  gth_sa_delete(sa);

  return had_err;
}
/* Computes a spliced alignment for every chain in <chain_collection> and
   tries to store it in <sa_collection>.

   For each chain the call number in <match_info> is incremented. If
   <call_info>->firstalshown is positive and the call number exceeds it, a
   notice is written to the output (depending on the output format),
   <match_info>->max_call_number_reached is set, and the loop is left.
   Otherwise the genomic and reference ranges are determined, a new alignment
   is allocated, the DP borders of the chain are extended, and the DNA or
   protein DP is called (depending on the type of the reference file
   <ref_file_num>).

   If the DP reports GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED, this is counted
   in <stat>, the alignment is deleted, and processing continues with the next
   chain. Any other non-zero DP result aborts the computation.

   Returns 0 on success and -1 if a DP call failed with another error. */
static int calc_spliced_alignments(GthSACollection *sa_collection,
                                   GthChainCollection *chain_collection,
                                   GthCallInfo *call_info,
                                   GthInput *input,
                                   GthStat *stat,
                                   GtUword gen_file_num,
                                   GtUword ref_file_num,
                                   bool directmatches,
                                   GthMatchInfo *match_info,
                                   GthDNACompletePathMatrixJT
                                   dna_complete_path_matrix_jt,
                                   GthProteinCompletePathMatrixJT
                                   protein_complete_path_matrix_jt)
{
  const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL,
                      *ref_seq_orig_rc = NULL;
  GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length,
                ref_total_length;
  GtFile *outfp = call_info->out->outfp;
  GtRange gen_seq_bounds, gen_seq_bounds_rc;
  bool refseqisdna;
  GthChain *chain;
  GtRange range;
  GthSA *saA;
  int rval;

  gt_assert(sa_collection && chain_collection);

  refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);

  for (chainctr = 0;
       chainctr < gth_chain_collection_size(chain_collection);
       chainctr++) {
    chain = gth_chain_collection_get(chain_collection, chainctr);
    /* the call number is incremented for every chain; the branches below
       which discard an alignment decrement it again */
    if (++match_info->call_number > call_info->firstalshown &&
        call_info->firstalshown > 0) {
      /* plain text output gets surrounding newlines, XML output gets the
         notice wrapped in a comment, GFF3 output gets nothing */
      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "<!--\n");

      if (!call_info->out->gff3out) {
        gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n",
                        refseqisdna ? "EST" : "protein",
                        call_info->firstalshown);
        gt_file_xprintf(outfp, "Only the first %u matches will be "
                           "displayed.\n", call_info->firstalshown);
      }

      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "-->\n");

      match_info->max_call_number_reached = true;
      break; /* break out of loop */
    }

    /* compute considered genomic regions if not set by -frompos */
    if (!gth_input_use_substring_spec(input)) {
      gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num,
                                                   chain->gen_seq_num);
      gen_total_length      = gt_range_length(&gen_seq_bounds);
      gen_offset            = gen_seq_bounds.start;
      gen_seq_bounds_rc     = gen_seq_bounds;
    }
    else {
      /* genomic multiseq contains exactly one sequence */
      gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1);
      gen_total_length = gth_input_genomic_file_total_length(input,
                                                             chain
                                                             ->gen_file_num);
      gen_seq_bounds.start    = gth_input_genomic_substring_from(input);
      gen_seq_bounds.end      = gth_input_genomic_substring_to(input);
      gen_offset              = 0;
      /* mirror the substring bounds onto the reverse complement */
      gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end;
      gen_seq_bounds_rc.end   = gen_total_length - 1 - gen_seq_bounds.start;
    }

    /* "retrieving" the reference sequence */
    range = gth_input_get_reference_range(input, chain->ref_file_num,
                                          chain->ref_seq_num);
    ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start;
    ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start;
    if (refseqisdna) {
      ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start;
      ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start;
    }
    ref_total_length = range.end - range.start + 1;

    /* check if protein sequences have a stop amino acid (the warning is
       shown only once, see stop_amino_acid_warning) */
    if (!refseqisdna && !match_info->stop_amino_acid_warning &&
       ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) {
      GtStr *ref_id = gt_str_new();
      gth_input_save_ref_id(input, ref_id, chain->ref_file_num,
                            chain->ref_seq_num);
      gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end "
                 "with a stop amino acid ('%c'). If it is not a protein "
                 "fragment you should add a stop amino acid to improve the "
                 "prediction. For example with `gt seqtransform "
                 "-addstopaminos` (see http://genometools.org for details).",
                 gt_str_get(ref_id), chain->ref_seq_num,
                 gth_input_get_reference_filename(input, chain->ref_file_num),
                 GT_STOP_AMINO);
      match_info->stop_amino_acid_warning = true;
      gt_str_delete(ref_id);
    }

    /* allocating space for alignment */
    saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num,
                             chain->gen_seq_num, chain->ref_file_num,
                             chain->ref_seq_num, match_info->call_number,
                             gen_total_length, gen_offset, ref_total_length);

    /* extend the DP borders to the left and to the right */
    gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc,
                             gen_total_length, gen_offset);

    /* From here on the dp positions always refer to the forward strand of the
       genomic DNA. */

    /* call the Dynamic Programming */
    if (refseqisdna) {
      rval = call_dna_DP(directmatches, call_info, input, stat,
                         sa_collection, saA, gen_file_num, ref_file_num,
                         gen_total_length, gen_offset, &gen_seq_bounds,
                         &gen_seq_bounds_rc, ref_total_length, range.start,
                         chainctr, gth_chain_collection_size(chain_collection),
                         match_info, ref_seq_tran, ref_seq_orig,
                         ref_seq_tran_rc, ref_seq_orig_rc, chain,
                         dna_complete_path_matrix_jt,
                         protein_complete_path_matrix_jt);
    }
    else {
      rval = call_protein_DP(directmatches, call_info, input,
                             stat, sa_collection, saA, gen_file_num,
                             ref_file_num, gen_total_length, gen_offset,
                             &gen_seq_bounds, &gen_seq_bounds_rc,
                             ref_total_length, range.start, chainctr,
                             gth_chain_collection_size(chain_collection),
                             match_info, ref_seq_tran, ref_seq_orig, chain,
                             dna_complete_path_matrix_jt,
                             protein_complete_path_matrix_jt);
    }
    /* check return value */
    if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) {
      /* statistics bookkeeping */
      gth_stat_increment_numoffailedDPparameterallocations(stat);
      gth_stat_increment_numofundeterminedSAs(stat);
      /* free space */
      gth_sa_delete(saA);
      match_info->call_number--;
      continue; /* continue with the next DP range */
    }
    else if (rval) {
      /* the DP functions neither store nor delete <saA> when they return a
         non-zero value, therefore it has to be freed here as well */
      gth_sa_delete(saA);
      return -1;
    }
  }

  if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches &&
      !match_info->significant_match_found &&
      match_info->call_number <= call_info->firstalshown) {
    show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp);
  }

  return 0;
}
/* Calculates the spliced alignment <saA> of <chain> against a protein
   reference sequence and tries to save it in <sa_collection>.

   If the alignment could not be determined
   (GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) or is unsuccessful, <saA> is deleted,
   <match_info>->call_number is decremented, and 0 is returned. Any other
   non-zero result of the alignment computation is returned unchanged (and
   <saA> is left untouched). Otherwise <saA> is handed to save_sa() and 0 is
   returned. */
static int call_protein_DP(bool directmatches,
                           GthCallInfo *call_info,
                           GthInput *input,
                           GthStat *stat,
                           GthSACollection *sa_collection,
                           GthSA *saA,
                           GtUword gen_file_num,
                           GtUword ref_file_num,
                           GtUword gen_total_length,
                           GtUword gen_offset,
                           const GtRange *gen_seq_bounds,
                           const GtRange *gen_seq_bounds_rc,
                           GtUword ref_total_length,
                           GtUword ref_offset,
                           GtUword chainctr,
                           GtUword num_of_chains,
                           GthMatchInfo *match_info,
                           const unsigned char *ref_seq_tran,
                           const unsigned char *ref_seq_orig,
                           GthChain *chain,
                           GthDNACompletePathMatrixJT
                           dna_complete_path_matrix_jt,
                           GthProteinCompletePathMatrixJT
                           protein_complete_path_matrix_jt)
{
  GtFile *out_stream = call_info->out->outfp;
  bool undetermined;
  int status;

#ifndef NDEBUG
  /* strand is in searchmode */
  if (directmatches)
    gt_assert(gth_input_forward(input));
  else
    gt_assert(gth_input_reverse(input));
#endif

  /* calculate alignment */
  status = callsahmt(false, saA, directmatches, gen_file_num, ref_file_num,
                     chain, gen_total_length, gen_offset, gen_seq_bounds,
                     gen_seq_bounds_rc, ref_seq_tran, ref_seq_orig,
                     ref_total_length, ref_offset, input,
                     &call_info->simfilterparam.introncutoutinfo, stat,
                     chainctr, num_of_chains, call_info->translationtable,
                     directmatches, call_info->proteinexonpenal,
                     call_info->splice_site_model, call_info->dp_options_core,
                     call_info->dp_options_est, call_info->dp_options_postpro,
                     dna_complete_path_matrix_jt,
                     protein_complete_path_matrix_jt, call_info->out);

  /* an undetermined alignment is not propagated as an error, but handled
     like an unsuccessful alignment below */
  undetermined = (status == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED);
  if (status != 0 && !undetermined)
    return status;

  if (!undetermined &&
      !isunsuccessfulalignment(saA, call_info->out->comments, out_stream)) {
    /* we can save the alignment now */
    save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
    return 0;
  }

  /* the spliced alignment was unsuccessful: take back its call number and
     delete it, the next hit is considered afterwards */
  match_info->call_number--;
  gth_sa_delete(saA);
  return 0;
}
/* Calculates the spliced alignment of <chain> against a DNA reference
   sequence and tries to save the result in <sa_collection>.

   Depending on the strands selected in <input>, up to two alignments are
   computed: one for the strand given by <directmatches> (stored in <saA>) and
   one for the opposite strand using the reverse complemented reference
   sequences <ref_seq_tran_rc> and <ref_seq_orig_rc>. If both strands are
   analyzed, the second alignment is only computed if the first one is poor
   (with respect to <call_info>->minaveragessp) and
   <call_info>->cdnaforwardonly is not set; in this case the better one of
   both alignments is saved and the other one is deleted.

   An alignment which could not be determined
   (GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) is treated like an unsuccessful one.
   Any other non-zero result of the alignment computation is returned
   unchanged; in this case <saA> has neither been saved nor deleted.
   Returns 0 otherwise. */
static int call_dna_DP(bool directmatches, GthCallInfo *call_info,
                       GthInput *input, GthStat *stat,
                       GthSACollection *sa_collection, GthSA *saA,
                       GtUword gen_file_num,
                       GtUword ref_file_num,
                       GtUword gen_total_length,
                       GtUword gen_offset,
                       const GtRange *gen_seq_bounds,
                       const GtRange *gen_seq_bounds_rc,
                       GtUword ref_total_length, GtUword ref_offset,
                       GtUword chainctr,
                       GtUword num_of_chains, GthMatchInfo *match_info,
                       const unsigned char *ref_seq_tran,
                       const unsigned char *ref_seq_orig,
                       const unsigned char *ref_seq_tran_rc,
                       const unsigned char *ref_seq_orig_rc,
                       GthChain *chain,
                       GthDNACompletePathMatrixJT dna_complete_path_matrix_jt,
                       GthProteinCompletePathMatrixJT
                       protein_complete_path_matrix_jt)
{
  int rval;
  bool firstdp = true,
       GT_UNUSED gs2outdirectmatches = directmatches;
  GthSA *saB = NULL;
  GtFile *outfp = call_info->out->outfp;

  /* first DP: the strand which corresponds to <directmatches> */
  if (directmatches ? gth_input_forward(input)
                    : gth_input_reverse(input)) {
    bool bothstrandsanalyzed;

    /* calculate alignment */
    rval = callsahmt(true, saA, directmatches, gen_file_num, ref_file_num,
                     chain, gen_total_length, gen_offset, gen_seq_bounds,
                     gen_seq_bounds_rc,
                     ref_seq_tran, ref_seq_orig, ref_total_length, ref_offset,
                     input, &call_info->simfilterparam.introncutoutinfo, stat,
                     chainctr, num_of_chains, call_info->translationtable,
                     directmatches, call_info->proteinexonpenal,
                     call_info->splice_site_model, call_info->dp_options_core,
                     call_info->dp_options_est, call_info->dp_options_postpro,
                     dna_complete_path_matrix_jt,
                     protein_complete_path_matrix_jt, call_info->out);
    if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) {
                     /* ^ this error is treated below */
      return rval;
    }

    firstdp = false;
    bothstrandsanalyzed = gth_input_both(input);

    if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
        isunsuccessfulalignment(saA, call_info->out->comments, outfp)) {
      match_info->call_number--;
      /* if the spliced alignment was unsuccessful, it is deleted and the
         next hit is considered. */
      gth_sa_delete(saA);
      return 0; /* continue */
    }

    /* if not both strands are analyzed, we can save this alignment now.
       Otherwise we have to calculate the alignment to the other strand
       first and then save the better one. */
    if (!bothstrandsanalyzed)
      save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
  }

  /* second DP: the opposite strand */
  if (directmatches ? gth_input_reverse(input)
                    : gth_input_forward(input)) {
    if ((firstdp || gth_sa_is_poor(saA, call_info->minaveragessp)) &&
        !call_info->cdnaforwardonly) {
      if (firstdp) {
        /* space for first alignment is already allocated, but we have to
           change the direction of the genomic and the reference strand */
        gth_sa_set_gen_strand(saA, !directmatches);
        gth_sa_set_ref_strand(saA, false);
      }
      else {
        /* allocating space for second alignment */
        saB = gth_sa_new_and_set(!directmatches, false, input,
                                 chain->gen_file_num, chain->gen_seq_num,
                                 chain->ref_file_num, chain->ref_seq_num,
                                 match_info->call_number, gen_total_length,
                                 gen_offset, ref_total_length);
      }

      /* setting gs2outdirectmatches (for compatibility) */
      gs2outdirectmatches = (bool) !directmatches;

      /* calculate alignment */
      rval = callsahmt(true, firstdp ? saA : saB, !directmatches,
                       gen_file_num, ref_file_num, chain, gen_total_length,
                       gen_offset, gen_seq_bounds, gen_seq_bounds_rc,
                       ref_seq_tran_rc, ref_seq_orig_rc, ref_total_length,
                       ref_offset, input,
                       &call_info->simfilterparam.introncutoutinfo, stat,
                       chainctr, num_of_chains, call_info->translationtable,
                       directmatches, call_info->proteinexonpenal,
                       call_info->splice_site_model, call_info->dp_options_core,
                       call_info->dp_options_est, call_info->dp_options_postpro,
                       dna_complete_path_matrix_jt,
                       protein_complete_path_matrix_jt, call_info->out);
      if (rval && rval != GTH_ERROR_SA_COULD_NOT_BE_DETERMINED) {
                       /* ^ this error is treated below */
        /* the second alignment is only known to this function, therefore it
           has to be freed here before the error is propagated */
        if (!firstdp)
          gth_sa_delete(saB);
        return rval;
      }

      if (firstdp) {
        if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
            isunsuccessfulalignment(saA, call_info->out->comments, outfp)) {
          /* for compatibility with GS2 */
          /* XXX: makes no sense. Possibly only if -gs2out is used. */
          match_info->significant_match_found= true;

          /* if the spliced alignment was unsuccessful, it is deleted and
             the next hit is considered. */
          gth_sa_delete(saA);
          return 0; /* continue */
        }

        save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
      }
      else /* !firstdp */
      {
        if (rval == GTH_ERROR_SA_COULD_NOT_BE_DETERMINED ||
            isunsuccessfulalignment(saB, call_info->out->comments, outfp) ||
            !gth_sa_B_is_better_than_A(saA, saB)) {
          /* insert first SA */
          save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
          /* discard second SA */
          gth_sa_delete(saB);
        }
        else {
          /* insert second SA */
          save_sa(sa_collection, saB, call_info->sa_filter, match_info, stat);
          /* free first SA */
          gth_sa_delete(saA);
        }
      }
    }
    else
      save_sa(sa_collection, saA, call_info->sa_filter, match_info, stat);
  }

  return 0;
}