Example #1
0
/*
  Writes the opening <reference ...> tag for the reference sequence of <sa> to
  <outfp>, indented by <indentlevel>.

  For DNA references the tag carries the attributes ref_file, ref_id and
  ref_strand; for protein references ref_strand is omitted. The attribute
  ref_description is opened here, its value is emitted by
  gth_input_echo_reference_description(), and the tag is closed with "\">".
*/
static void xml_showgthreferenceinformation(GthSA *sa,
                                            GthInput *input,
                                            unsigned int indentlevel,
                                            GtFile *outfp)
{
  /* the alignment must refer to a defined reference file */
  gt_assert(gth_sa_ref_file_num(sa) != GT_UNDEF_ULONG);

  gth_indent(outfp, indentlevel);

  /* emit everything up to (and including) the opening quote of the
     description attribute; the set of attributes depends on the alphabet */
  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA: {
      const char *dna_ref_file =
        gth_input_get_reference_filename(input, gth_sa_ref_file_num(sa));
      gt_file_xprintf(outfp, "<reference ref_file=\"%s\" ref_id=\"%s\" "
                                "ref_strand=\"%c\" ref_description=\"",
                         dna_ref_file,
                         gth_sa_ref_id(sa),
                         gth_sa_ref_strand_char(sa));
      break;
    }
    case PROTEIN_ALPHA: {
      const char *protein_ref_file =
        gth_input_get_reference_filename(input, gth_sa_ref_file_num(sa));
      gt_file_xprintf(outfp, "<reference ref_file=\"%s\" ref_id=\"%s\" "
                                "ref_description=\"",
                         protein_ref_file,
                         gth_sa_ref_id(sa));
      break;
    }
    default:
      /* no other alphabet type is expected here */
      gt_assert(0);
  }

  /* value of the ref_description attribute */
  gth_input_echo_reference_description(input, gth_sa_ref_file_num(sa),
                                       gth_sa_ref_seq_num(sa), outfp);

  /* close the attribute and the opening tag */
  gt_file_xprintf(outfp, "\">\n");
}
/*
  Prints a one-line, plain-text description of the reference sequence of <sa>
  to <outfp>, followed by an empty line.

  DNA references are reported as "EST Sequence" (including the strand),
  protein references as "Protein Sequence". If <showseqnums> is set, the
  reference sequence number is appended after the description.
*/
static void showgthreferenceinformation(GthSA *sa, GthInput *input,
                                        bool showseqnums,
                                        GtFile *outfp)
{
  /* the alignment must refer to a defined reference file */
  gt_assert(gth_sa_ref_file_num(sa) != GT_UNDEF_UWORD);

  /* line prefix, up to and including "description=" */
  switch (gth_sa_alphatype(sa)) {
    case DNA_ALPHA: {
      const char *est_file =
        gth_input_get_reference_filename(input, gth_sa_ref_file_num(sa));
      gt_file_xprintf(outfp,
                         "EST Sequence: file=%s, strand=%c, description=",
                         est_file,
                         gth_sa_ref_strand_char(sa));
      break;
    }
    case PROTEIN_ALPHA: {
      const char *protein_file =
        gth_input_get_reference_filename(input, gth_sa_ref_file_num(sa));
      gt_file_xprintf(outfp, "Protein Sequence: file=%s, description=",
                         protein_file);
      break;
    }
    default:
      /* no other alphabet type is expected here */
      gt_assert(0);
  }

  /* the description itself */
  gth_sa_echo_reference_description(sa, input, outfp);

  /* optional sequence number */
  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum=" GT_WU, gth_sa_ref_seq_num(sa));
  }

  /* terminate the line and add one blank line */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
/*
  Writes a <referencefile> element for the reference file of <sa> to <outfp>.
  The element itself is indented by <indentlevel>; its two children
  (<referencefilename> and <referencefilehash>) are indented one level deeper.
  The hash is always written as GTH_UNDEFINED_HASH.
*/
static void showreferencefilename(GthSA *sa,
                                  GthInput *input,
                                  unsigned int indentlevel, GtFile *outfp)
{
  const unsigned int child_indent = indentlevel + 1;
  const char *ref_filename;

  /* opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<referencefile>\n");

  /* child: file name */
  gth_indent(outfp, child_indent);
  ref_filename = gth_input_get_reference_filename(input,
                                                  gth_sa_ref_file_num(sa));
  gt_file_xprintf(outfp, "<referencefilename>%s</referencefilename>\n",
                     ref_filename);

  /* child: file hash */
  gth_indent(outfp, child_indent);
  gt_file_xprintf(outfp, "<referencefilehash>%s</referencefilehash>\n",
                     GTH_UNDEFINED_HASH);

  /* closing tag, back on the outer indentation level */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</referencefile>\n");
}
Example #4
0
/*
  Runs the matcher provided by <plugins> on genomic file <gen_file_num> and
  reference file <ref_file_num>, collects the resulting matches, hands them to
  calc_chains_from_matches() together with <chain_collection>, and finally
  sorts <chain_collection>.

  The order of the steps matters: the current input is released before the
  matcher runs and the matcher arguments are deleted before the genomic and
  reference files are loaded again (see the comments below).

  If the matcher does not yield any match, the function returns early. On that
  path the genomic and reference files are NOT loaded back into <input>.
*/
void gth_chaining(GthChainCollection *chain_collection,
                  GtUword gen_file_num,
                  GtUword ref_file_num,
                  GthCallInfo *call_info,
                  GthInput *input,
                  GthStat *stat,
                  bool directmatches,
                  const GthPlugins *plugins)
{
  GtUword seqidx, numofsequences = 0;
  GtArray *matches;
  GthChainingInfo chaining_info;
  void *matcher_arguments;
  GtFile *outfp = call_info->out->outfp;
  GthMatchProcessorInfo match_processor_info;
  bool refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);
  bool genomic_first;
  const char *first_seqfile, *second_seqfile;

  /* a complete matcher plugin is mandatory */
  gt_assert(plugins);
  gt_assert(plugins->matcher_arguments_new);
  gt_assert(plugins->matcher_arguments_delete);
  gt_assert(plugins->matcher_runner);

  /* set up match storage and chaining state */
  matches = gt_array_new(sizeof (GthMatch));

  chaining_info_init(&chaining_info, directmatches, refseqisdna, call_info,
                     input, stat, gen_file_num, ref_file_num);

  /* The two file names handed to the matcher change places depending on the
     inverse option and on the reference alphabet: with -inverse or a protein
     reference the genomic file comes first, otherwise the reference file. */
  genomic_first = call_info->simfilterparam.inverse || !refseqisdna;
  if (genomic_first) {
    first_seqfile = gth_input_get_genomic_filename(input, gen_file_num);
    second_seqfile = gth_input_get_reference_filename(input, ref_file_num);
  }
  else {
    first_seqfile = gth_input_get_reference_filename(input, ref_file_num);
    second_seqfile = gth_input_get_genomic_filename(input, gen_file_num);
  }

  matcher_arguments =
    plugins->matcher_arguments_new(true,
                          input,
                          first_seqfile,
                          second_seqfile,
                          directmatches,
                          refseqisdna,
                          call_info->progname,
                          gt_str_get(gth_input_proteinsmap(input)),
                          call_info->simfilterparam.exact,
                          call_info->simfilterparam.edist,
                          false,
                          0,
                          call_info->simfilterparam.minmatchlength,
                          call_info->simfilterparam.seedlength,
                          call_info->simfilterparam.exdrop,
                          call_info->simfilterparam.prminmatchlen,
                          call_info->simfilterparam.prseedlength,
                          call_info->simfilterparam.prhdist,
                          call_info->translationtable,
                          call_info->simfilterparam.online,
                          call_info->simfilterparam.noautoindex,
                          call_info->simfilterparam.maskpolyAtails,
                          false);

  match_processor_info_init(&match_processor_info, matches, chain_collection,
                            directmatches, refseqisdna,
                            call_info->simfilterparam.online,
                            call_info->simfilterparam.inverse, stat,
                            &chaining_info,
                            call_info->simfilterparam.maxnumofmatches,
                            call_info->simfilterparam.rare,
                            call_info->fragweightfactor,
                            plugins->jump_table_new,
                            plugins->jump_table_new_reverse,
                            plugins->jump_table_delete);

  /* A per-reference-sequence match counter is needed if the number of matches
     is limited or if the match number distribution is recorded. */
  if (call_info->simfilterparam.maxnumofmatches > 0 ||
      gth_stat_get_matchnumdistri(stat)) {
    numofsequences = gth_input_num_of_ref_seqs(input, ref_file_num);
    match_processor_info.matchnumcounter = gt_malloc(sizeof (GtUword) *
                                                     numofsequences);

    /* all counters start at 0 */
    memset(match_processor_info.matchnumcounter, 0,
           (size_t) numofsequences * sizeof (GtUword));
  }

  /* Release the input, which contains the virtual trees: vmatch loads the
     virtual trees into memory itself, so keeping them here would load them
     twice. */
  gth_input_delete_current(input);

  /* run the matcher */
  if (call_info->out->showverbose)
    call_info->out->showverbose("call vmatch to compute matches");

  plugins->matcher_runner(matcher_arguments, call_info->out->showverbose,
                          call_info->out->showverboseVM, &match_processor_info);

  /* Delete the matcher arguments right away; otherwise the reference file
     would be mapped twice once it is loaded again below. */
  plugins->matcher_arguments_delete(matcher_arguments);

  /* delete the sequence collections (if the matcher has filled them) */
  gth_seq_con_delete(match_processor_info.gen_seq_con);
  gth_seq_con_delete(match_processor_info.ref_seq_con);

  /* record the non-zero match counts in the match number distribution */
  if (gth_stat_get_matchnumdistri(stat)) {
    for (seqidx = 0; seqidx < numofsequences; seqidx++) {
      GtUword matchnum = match_processor_info.matchnumcounter[seqidx];
      if (matchnum > 0)
        gth_stat_add_to_matchnumdistri(stat, matchnum);
    }
  }

  /* the match counter is no longer needed */
  gt_free(match_processor_info.matchnumcounter);

  /* nothing to chain if the matcher did not deliver a single match */
  if (gt_array_size(matches) == 0) {
    if (call_info->out->comments)
      gt_file_xprintf(outfp, "%c no match has been found\n", COMMENTCHAR);
    gt_array_delete(matches);
    return;
  }

  /* bring the genomic and the reference file back into memory */
  gth_input_load_genomic_file(input, gen_file_num, true);
  gth_input_load_reference_file(input, ref_file_num, true);

  /* build chains from the collected matches */
  calc_chains_from_matches(chain_collection, matches, &chaining_info,
                           gth_input_current_gen_seq_con(input),
                           gth_input_current_ref_seq_con(input),
                           call_info->simfilterparam.rare,
                           call_info->fragweightfactor,
                           plugins->jump_table_new,
                           plugins->jump_table_new_reverse,
                           plugins->jump_table_delete);

  if (call_info->out->showverbose) {
    call_info->out->showverbose("sort global chains according to reference "
                                "sequence coverage");
  }

  /* sort the chains */
  gth_chain_collection_sort(chain_collection);

  /* clean up */
  gt_array_delete(matches);
}
Example #5
0
/*
  Writes the XML <header> element describing the current run to
  call_info->out->outfp, starting at indentation <indentlevel>.

  The header contains, in this order:
  - a <source> element with <gth_version>, the build date (GT_BUILT) and
    <timestring> as run date,
  - the names of all genomic files (<gDNA_template_files>),
  - the names and types of all reference files (<reference_files>),
  - the splice site model and species (<splice_site_parameters>),
  - a <parameters> element with the BSSM file, the score matrix file and the
    search mode, followed by the program arguments <args> inside an XML
    comment,
  - the overall reference type (via show_overall_reference_type()).

  The xmlns of the <header> element depends on call_info->intermediate.
  At least one genomic and one reference file must be defined in <input>.
*/
static void show_xml_run_header(GthCallInfo *call_info, GthInput *input,
                                const char *timestring, const char *gth_version,
                                unsigned int indentlevel, const char **args)
{
  GtFile *outfp = call_info->out->outfp;
  GtUword i;

  /* opening <header> tag; intermediate output uses its own namespace */
  gth_indent(outfp, indentlevel);
  if (call_info->intermediate) {
    gt_file_xprintf(outfp, "<header xmlns=\"http://www.GenomeThreader.org/"
                       "SplicedAlignment/header/\">\n");
  }
  else {
    gt_file_xprintf(outfp,
              "<header xmlns=\"http://www.genomethreader.org/GTH_output/"
              "header/\">\n");
  }

  /* at least one genomic file defined */
  gt_assert(gth_input_num_of_gen_files(input));
  /* at least one reference file defined */
  gt_assert(gth_input_num_of_ref_files(input));

  /* show a readable version of GthCallInfo. That is, it is shown with which
     parameters the program was called */

  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<source program=\"GenomeThreader\" version=\"%s\" "
                  "build_date=\"%s\" run_date=\"%s\"/>\n", gth_version,
                  GT_BUILT, timestring);

  /* show genomic file names */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_template_files>\n");
  indentlevel++;

  for (i = 0; i < gth_input_num_of_gen_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<temp_name>%s</temp_name>\n",
                    gth_input_get_genomic_filename(input, i));
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_template_files>\n");

  /* show reference file names */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<reference_files>\n");
  indentlevel++;

  for (i = 0; i < gth_input_num_of_ref_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<file ref_name=\"%s\" type=\"%s\"/>\n",
                    gth_input_get_reference_filename(input, i),
                    gth_input_get_alphatype(input, i) == DNA_ALPHA
                    ? "ESTcDNA" : "Protein");
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</reference_files>\n");

  /* speciesnum == NUMOFSPECIES selects the generic species name instead of
     an entry of speciestab */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<splice_site_parameters parameter_type=\"%s\" "
                  "species=\"%s\"/>\n", SPLICE_SITE_MODEL_NAME,
                  call_info->speciesnum ==  NUMOFSPECIES
                  ? GENERIC_SPECIES_NAME
                  : speciestab[call_info->speciesnum]);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameters>\n");
  indentlevel++;

  /* output name of BSSM file */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"bssmfile\" value=\"%s\"/>\n",
                  gth_input_bssmfilename(input));

  /* output name of scorematrix */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"scorematrixfile\" value=\"%s\"/>\n",
                  gt_str_get(call_info->scorematrixfile));

  /* output searchmode; the value used to end in a stray ')' without a
     matching '(', which has been removed */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"searchmode\" "
                  "value=\"forward=%s,reverse=%s\"/>\n",
                  GTH_SHOWBOOL(gth_input_forward(input)),
                  GTH_SHOWBOOL(gth_input_reverse(input)));

  /* output arguments as comment (deliberately not indented).
     NOTE(review): XML comments must not contain "--"; if an argument can
     contain that sequence the output is not well-formed -- confirm whether
     gt_cstr_array_show_genfile() output needs sanitizing here. */
  gt_file_xprintf(outfp, "<!--\n%c Arguments: ", COMMENTCHAR);
  gt_cstr_array_show_genfile(args, outfp);
  gt_file_xprintf(outfp, "-->\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</parameters>\n");

  show_overall_reference_type(gth_input_overall_alphatype(input),
                              indentlevel, outfp);

  /* closing </header> tag, back on the initial indentation level */
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</header>\n");
}
/*
  Runs the dynamic programming (DP) step for every chain in <chain_collection>.

  For each chain the considered genomic range and the reference sequence are
  determined, a new spliced alignment object is created, the DP borders of the
  chain are extended, and then call_dna_DP() or call_protein_DP() is invoked,
  depending on whether reference file <ref_file_num> is DNA. <sa_collection>
  is passed on to the DP call.

  match_info->call_number is incremented once per chain. If
  call_info->firstalshown is greater than 0 and the call number exceeds it,
  a notice is printed (except for GFF3 output),
  match_info->max_call_number_reached is set and processing stops.

  Returns 0 on success and -1 if a DP call returned a non-zero value other
  than GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED. In the latter case the chain
  is skipped and the statistics in <stat> are updated instead.
*/
static int calc_spliced_alignments(GthSACollection *sa_collection,
                                   GthChainCollection *chain_collection,
                                   GthCallInfo *call_info,
                                   GthInput *input,
                                   GthStat *stat,
                                   GtUword gen_file_num,
                                   GtUword ref_file_num,
                                   bool directmatches,
                                   GthMatchInfo *match_info,
                                   GthDNACompletePathMatrixJT
                                   dna_complete_path_matrix_jt,
                                   GthProteinCompletePathMatrixJT
                                   protein_complete_path_matrix_jt)
{
  /* the reverse complement pointers are only set for DNA references */
  const unsigned char *ref_seq_tran, *ref_seq_orig, *ref_seq_tran_rc = NULL,
                      *ref_seq_orig_rc = NULL;
  GtUword chainctr, gen_offset = GT_UNDEF_UWORD, gen_total_length,
                ref_total_length;
  GtFile *outfp = call_info->out->outfp;
  GtRange gen_seq_bounds, gen_seq_bounds_rc;
  bool refseqisdna;
  GthChain *chain;
  GtRange range;
  GthSA *saA;
  int rval;

  gt_assert(sa_collection && chain_collection);

  refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);

  for (chainctr = 0;
       chainctr < gth_chain_collection_size(chain_collection);
       chainctr++) {
       chain = gth_chain_collection_get(chain_collection, chainctr);
    /* the call number is incremented for every chain; the limit is only
       enforced if firstalshown is greater than 0 */
    if (++match_info->call_number > call_info->firstalshown &&
        call_info->firstalshown > 0) {
      /* plain-text output: surround the notice with empty lines;
         XML output: wrap the notice into a comment;
         GFF3 output: print nothing at all */
      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "<!--\n");

      if (!call_info->out->gff3out) {
        gt_file_xprintf(outfp, "Maximal matching %s count (%u) reached.\n",
                        refseqisdna ? "EST" : "protein",
                        call_info->firstalshown);
        gt_file_xprintf(outfp, "Only the first %u matches will be "
                           "displayed.\n", call_info->firstalshown);
      }

      if (!(call_info->out->xmlout || call_info->out->gff3out))
        gt_file_xfputc('\n', outfp);
      else if (call_info->out->xmlout)
        gt_file_xprintf(outfp, "-->\n");

      match_info->max_call_number_reached = true;
      break; /* break out of loop */
    }

    /* compute considered genomic regions if not set by -frompos */
    if (!gth_input_use_substring_spec(input)) {
      /* whole genomic sequence: forward and reverse complement bounds are
         identical and the offset is the start of the sequence range */
      gen_seq_bounds = gth_input_get_genomic_range(input, chain->gen_file_num,
                                                   chain->gen_seq_num);
      gen_total_length      = gt_range_length(&gen_seq_bounds);
      gen_offset            = gen_seq_bounds.start;
      gen_seq_bounds_rc     = gen_seq_bounds;
    }
    else {
      /* genomic multiseq contains exactly one sequence */
      gt_assert(gth_input_num_of_gen_seqs(input, chain->gen_file_num) == 1);
      gen_total_length = gth_input_genomic_file_total_length(input,
                                                             chain
                                                             ->gen_file_num);
      gen_seq_bounds.start    = gth_input_genomic_substring_from(input);
      gen_seq_bounds.end      = gth_input_genomic_substring_to(input);
      gen_offset              = 0;
      /* mirror the substring bounds at the end of the sequence to get the
         bounds on the reverse complement */
      gen_seq_bounds_rc.start = gen_total_length - 1 - gen_seq_bounds.end;
      gen_seq_bounds_rc.end   = gen_total_length - 1 - gen_seq_bounds.start;
    }

    /* "retrieving" the reference sequence */
    range = gth_input_get_reference_range(input, chain->ref_file_num,
                                          chain->ref_seq_num);
    /* the sequence pointers are positioned at the start of the reference
       sequence of this chain */
    ref_seq_tran = gth_input_current_ref_seq_tran(input) + range.start;
    ref_seq_orig = gth_input_current_ref_seq_orig(input) + range.start;
    if (refseqisdna) {
      ref_seq_tran_rc = gth_input_current_ref_seq_tran_rc(input) + range.start;
      ref_seq_orig_rc = gth_input_current_ref_seq_orig_rc(input) + range.start;
    }
    ref_total_length = range.end - range.start + 1;

    /* check if protein sequences have a stop amino acid; the warning is
       issued at most once (guarded by stop_amino_acid_warning) */
    if (!refseqisdna && !match_info->stop_amino_acid_warning &&
       ref_seq_orig[ref_total_length - 1] != GT_STOP_AMINO) {
      GtStr *ref_id = gt_str_new();
      gth_input_save_ref_id(input, ref_id, chain->ref_file_num,
                            chain->ref_seq_num);
      gt_warning("protein sequence '%s' (#" GT_WU " in file %s) does not end "
                 "with a stop amino acid ('%c'). If it is not a protein "
                 "fragment you should add a stop amino acid to improve the "
                 "prediction. For example with `gt seqtransform "
                 "-addstopaminos` (see http://genometools.org for details).",
                 gt_str_get(ref_id), chain->ref_seq_num,
                 gth_input_get_reference_filename(input, chain->ref_file_num),
                 GT_STOP_AMINO);
      match_info->stop_amino_acid_warning = true;
      gt_str_delete(ref_id);
    }

    /* allocating space for alignment */
    saA = gth_sa_new_and_set(directmatches, true, input, chain->gen_file_num,
                             chain->gen_seq_num, chain->ref_file_num,
                             chain->ref_seq_num, match_info->call_number,
                             gen_total_length, gen_offset, ref_total_length);

    /* extend the DP borders to the left and to the right */
    gth_chain_extend_borders(chain, &gen_seq_bounds, &gen_seq_bounds_rc,
                             gen_total_length, gen_offset);

    /* From here on the dp positions always refer to the forward strand of the
       genomic DNA. */

    /* call the Dynamic Programming */
    if (refseqisdna) {
      rval = call_dna_DP(directmatches, call_info, input, stat,
                         sa_collection, saA, gen_file_num, ref_file_num,
                         gen_total_length, gen_offset, &gen_seq_bounds,
                         &gen_seq_bounds_rc, ref_total_length, range.start,
                         chainctr, gth_chain_collection_size(chain_collection),
                         match_info, ref_seq_tran, ref_seq_orig,
                         ref_seq_tran_rc, ref_seq_orig_rc, chain,
                         dna_complete_path_matrix_jt,
                         protein_complete_path_matrix_jt);
    }
    else {
      rval = call_protein_DP(directmatches, call_info, input,
                             stat, sa_collection, saA, gen_file_num,
                             ref_file_num, gen_total_length, gen_offset,
                             &gen_seq_bounds, &gen_seq_bounds_rc,
                             ref_total_length, range.start, chainctr,
                             gth_chain_collection_size(chain_collection),
                             match_info, ref_seq_tran, ref_seq_orig, chain,
                             dna_complete_path_matrix_jt,
                             protein_complete_path_matrix_jt);
    }
    /* check return value */
    if (rval == GTH_ERROR_DP_PARAMETER_ALLOCATION_FAILED) {
      /* statistics bookkeeping */
      gth_stat_increment_numoffailedDPparameterallocations(stat);
      gth_stat_increment_numofundeterminedSAs(stat);
      /* free space */
      gth_sa_delete(saA);
      /* undo the increment from the top of the loop, so that the skipped
         chain does not count towards the call number */
      match_info->call_number--;
      continue; /* continue with the next DP range */
    }
    else if (rval)
      /* any other error aborts the computation.
         NOTE(review): saA is not deleted on this path -- presumably the DP
         call has taken care of it; confirm against call_dna_DP() and
         call_protein_DP(). */
      return -1;
  }

  /* the "no match" line is shown only for plain-text output (neither XML nor
     GFF3), only if <directmatches> is false, only if no significant match
     has been found, and only if the call number does not exceed
     firstalshown */
  if (!call_info->out->xmlout && !call_info->out->gff3out && !directmatches &&
      !match_info->significant_match_found &&
      match_info->call_number <= call_info->firstalshown) {
    show_no_match_line(gth_input_get_alphatype(input, ref_file_num), outfp);
  }

  return 0;
}