Beispiel #1
0
/*
 * Writes the <gDNA_segment> element describing the genomic template of <sa>
 * to <outfp>. The element itself is indented by <indentlevel>; the nested
 * <template> and <position> elements are indented one and two levels deeper.
 *
 * The opening <template> tag is produced by three consecutive writes: the
 * attributes up to the opening quote of temp_description, the description
 * echoed by gth_input_echo_genomic_description(), and the closing quote
 * together with the end of the tag.
 *
 * NOTE(review): this function checks against GT_UNDEF_ULONG and prints the
 * positions with "%lu", whereas showgthgenomicinformation() checks the same
 * getter against GT_UNDEF_UWORD and prints the same positions with GT_WU.
 * Presumably one of the two is out of date -- confirm the return types of
 * the getters and align the format specifiers accordingly.
 */
static void xml_showgthgenomicinformation(GthSA *sa,
                                          GthInput *input,
                                          unsigned int indentlevel,
                                          GtFile *outfp)
{
  /* nesting depths of the two inner elements */
  const unsigned int template_level = indentlevel + 1,
                     position_level = indentlevel + 2;

  gt_assert(gth_sa_gen_file_num(sa) != GT_UNDEF_ULONG);

  /* <gDNA_segment> */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_segment>\n");

  /* <template ...>, left open inside the temp_description attribute value */
  gth_indent(outfp, template_level);
  gt_file_xprintf(outfp, "<template temp_file=\"%s\" temp_id=\"%s\" "
                            "temp_strand=\"%c\" temp_description=\"",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)),
                  gth_sa_gen_id(sa),
                  gth_sa_gen_strand_char(sa));

  /* attribute value of temp_description */
  gth_input_echo_genomic_description(input, gth_sa_gen_file_num(sa),
                                     gth_sa_gen_seq_num(sa), outfp);

  /* close the attribute value and the opening tag */
  gt_file_xprintf(outfp, "\">\n");

  /* <position .../> */
  gth_indent(outfp, position_level);
  gt_file_xprintf(outfp, "<position start=\"%lu\" stop=\"%lu\"/>\n",
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));

  /* </template> */
  gth_indent(outfp, template_level);
  gt_file_xprintf(outfp, "</template>\n");

  /* </gDNA_segment> */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_segment>\n");
}
/*
 * Writes a <genomicfile> element to <outfp>, indented by <indentlevel>.
 * It contains two children, indented one level deeper:
 *   <genomicfilename>  the name of the genomic file referenced by <sa>
 *   <genomicfilehash>  always the GTH_UNDEFINED_HASH placeholder
 */
static void showgenomicfilename(GthSA *sa, GthInput *input,
                                unsigned int indentlevel, GtFile *outfp)
{
  /* nesting depth of the two child elements */
  const unsigned int child_level = indentlevel + 1;

  /* opening tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<genomicfile>\n");

  /* file name */
  gth_indent(outfp, child_level);
  gt_file_xprintf(outfp, "<genomicfilename>%s</genomicfilename>\n",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)));

  /* file hash (placeholder value only) */
  gth_indent(outfp, child_level);
  gt_file_xprintf(outfp, "<genomicfilehash>%s</genomicfilehash>\n",
                  GTH_UNDEFINED_HASH);

  /* closing tag */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</genomicfile>\n");
}
/*
 * Writes the plain-text "Genomic Template:" line for <sa> to <outfp>:
 * file name, strand character, start and end position, followed by the
 * sequence description. If <showseqnums> is set, the sequence number is
 * appended as ", seqnum=...". The line is followed by one empty line.
 */
static void showgthgenomicinformation(GthSA *sa, GthInput *input,
                                      bool showseqnums, GtFile *outfp)
{
  gt_assert(gth_sa_gen_file_num(sa) != GT_UNDEF_UWORD);

  /* fixed part of the line, up to and including "description=" */
  gt_file_xprintf(outfp,
                  "Genomic Template: file=%s, strand=%c, from="GT_WU", "
                  "to="GT_WU", description=",
                  gth_input_get_genomic_filename(input,
                                                 gth_sa_gen_file_num(sa)),
                  gth_sa_gen_strand_char(sa),
                  gth_sa_gen_dp_start_show(sa),
                  gth_sa_gen_dp_end_show(sa));

  /* the description text itself */
  gth_sa_echo_genomic_description(sa, input, outfp);

  /* optional sequence number */
  if (showseqnums) {
    gt_file_xprintf(outfp, ", seqnum="GT_WU, gth_sa_gen_seq_num(sa));
  }

  /* finish the line, then emit one empty line */
  gt_file_xfputc('\n', outfp);
  gt_file_xfputc('\n', outfp);
}
Beispiel #4
0
/*
 * Registers the genomic sequence (<filenum>, <seqnum>) of <input> with the
 * region factory <srf> and makes sure <sequence_regions> holds a region node
 * covering it.
 *
 * The range of the sequence is taken from the substring specification of
 * <input> if one is used, otherwise from its relative genomic range. If
 * <srf> uses description ranges and a range can be parsed from the sequence
 * description, the range is shifted by the start of that parsed range;
 * otherwise it is shifted by 1 to make it 1-based.
 *
 * Three cases are distinguished:
 *  1. <sequenceid> is empty, or it is already in use and no offset was
 *     parsed: an id of the form "<file basename>|<seqnum + 1>" is made up
 *     and a new region node is stored under it.
 *  2. <sequenceid> is not in use yet: a new region node is stored under it.
 *  3. <sequenceid> is in use and an offset was parsed: the range of the
 *     existing region node is joined with the new range.
 * In every case the (filenum, seqnum) pair is added to the seqid store of
 * <srf>.
 */
static void make_sequence_region(GtHashmap *sequence_regions,
                                 GtStr *sequenceid,
                                 GthRegionFactory *srf,
                                 GthInput *input,
                                 GtUword filenum,
                                 GtUword seqnum)
{
    /* pure truth value, hence bool rather than an integer word type */
    bool offset_is_defined = false;
    /* descrange is only read when offset_is_defined is true */
    GtRange range, descrange;
    GtGenomeNode *sr = NULL;
    gt_assert(sequence_regions && sequenceid && srf && input);

    /* determine the (0-based) range of the sequence */
    if (gth_input_use_substring_spec(input)) {
        range.start = gth_input_genomic_substring_from(input);
        range.end   = gth_input_genomic_substring_to(input);
    }
    else {
        range = gth_input_get_relative_genomic_range(input, filenum, seqnum);
    }

    /* try to parse an offset from the sequence description, if requested;
       a zero return value of the parser is treated as success */
    if (srf->use_desc_ranges) {
        GtStr *description = gt_str_new();
        gth_input_get_genomic_description(input, description, filenum, seqnum);
        if (!gt_parse_description_range(gt_str_get(description), &descrange))
            offset_is_defined = true;
        gt_str_delete(description);
    }

    /* shift the range by the parsed offset or make it 1-based */
    if (offset_is_defined)
        range = gt_range_offset(&range, descrange.start);
    else
        range = gt_range_offset(&range, 1); /* 1-based */

    if (!gt_str_length(sequenceid) ||
            (gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid)) &&
             !offset_is_defined)) {
        /* sequenceid is empty or exists already (and no offset has been parsed)
           -> make one up */
        GtStr *seqid;
        char *base;
        base = gt_basename(gth_input_get_genomic_filename(input, filenum));
        seqid = gt_str_new_cstr(base);
        gt_free(base);
        gt_str_append_char(seqid, '|');
        gt_str_append_uword(seqid, seqnum + 1); /* 1-based */
        seqid_store_add(srf->seqid_store, filenum, seqnum, seqid, GT_UNDEF_UWORD);
        gt_assert(!gt_cstr_table_get(srf->used_seqids, gt_str_get(seqid)));
        gt_cstr_table_add(srf->used_seqids, gt_str_get(seqid));
        sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum, seqnum),
                                range.start, range.end);
        /* the key is the string kept by the used_seqids table, not the
           local seqid which is deleted below */
        gt_hashmap_add(sequence_regions,
                       (void*) gt_cstr_table_get(srf->used_seqids,
                               gt_str_get(seqid)),
                       sr);
        gt_str_delete(seqid);
    }
    else {
        /* sequenceid does not exist already (or an offset has been parsed)
           -> use this one */
        if (!gt_cstr_table_get(srf->used_seqids, gt_str_get(sequenceid))) {
            /* no sequence region with this id exists -> create one */
            gt_cstr_table_add(srf->used_seqids, gt_str_get(sequenceid));
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            offset_is_defined ? descrange.start : GT_UNDEF_UWORD);
            sr = gt_region_node_new(seqid_store_get(srf->seqid_store, filenum,
                                                    seqnum), range.start, range.end);
            gt_hashmap_add(sequence_regions,
                           (void*) gt_cstr_table_get(srf->used_seqids,
                                   gt_str_get(sequenceid)),
                           sr);
        }
        else {
            GtRange prev_range, new_range;
            /* sequence region with this id exists already -> modify range */
            sr = gt_hashmap_get(sequence_regions, gt_str_get(sequenceid));
            gt_assert(sr);
            prev_range = gt_genome_node_get_range(sr);
            new_range = gt_range_join(&prev_range, &range);
            gt_genome_node_set_range(sr, &new_range);
            seqid_store_add(srf->seqid_store, filenum, seqnum, sequenceid,
                            offset_is_defined ? descrange.start : GT_UNDEF_UWORD);
        }
    }
    /* every branch above must have produced or looked up a region node */
    gt_assert(sr);
}
Beispiel #5
0
/*
 * Computes matches between genomic file <gen_file_num> and reference file
 * <ref_file_num> of <input> with the matcher supplied by <plugins>, and
 * derives chains from these matches which are stored in <chain_collection>
 * and sorted afterwards.
 *
 * <plugins> must provide matcher_arguments_new, matcher_arguments_delete and
 * matcher_runner. <directmatches> is handed through to the chaining info, the
 * matcher arguments and the match processor info. If no match is found, a
 * comment line is written to the output file of <call_info> (when comments
 * are enabled) and <chain_collection> is not touched further.
 *
 * The currently loaded input is deleted before the matcher runs; the genomic
 * and reference files are loaded again only if at least one match was found.
 */
void gth_chaining(GthChainCollection *chain_collection,
                  GtUword gen_file_num,
                  GtUword ref_file_num,
                  GthCallInfo *call_info,
                  GthInput *input,
                  GthStat *stat,
                  bool directmatches,
                  const GthPlugins *plugins)
{
  GtUword seqidx, numofsequences = 0;
  GtArray *matches;
  GthChainingInfo chaining_info;
  void *matcher_arguments;
  GtFile *outfp = call_info->out->outfp;
  GthMatchProcessorInfo match_processor_info;
  bool refseqisdna = gth_input_ref_file_is_dna(input, ref_file_num);
  /* decides in which order the two file names are handed to the matcher:
     if set, the genomic file name comes first, otherwise the reference file
     name does */
  bool genomic_first = call_info->simfilterparam.inverse || !refseqisdna;

  /* make sure matcher is defined */
  gt_assert(plugins);
  gt_assert(plugins->matcher_arguments_new);
  gt_assert(plugins->matcher_arguments_delete);
  gt_assert(plugins->matcher_runner);

  /* set up the match array and the chaining info */
  matches = gt_array_new(sizeof (GthMatch));

  chaining_info_init(&chaining_info, directmatches, refseqisdna, call_info,
                     input, stat, gen_file_num, ref_file_num);

  /* assemble the arguments for the matcher */
  matcher_arguments =
    plugins->matcher_arguments_new(true,
                          input,
                          genomic_first
                          ? gth_input_get_genomic_filename(input, gen_file_num)
                          : gth_input_get_reference_filename(input,
                                                             ref_file_num),
                          genomic_first
                          ? gth_input_get_reference_filename(input,
                                                             ref_file_num)
                          : gth_input_get_genomic_filename(input, gen_file_num),
                          directmatches,
                          refseqisdna,
                          call_info->progname,
                          gt_str_get(gth_input_proteinsmap(input)),
                          call_info->simfilterparam.exact,
                          call_info->simfilterparam.edist,
                          false,
                          0,
                          call_info->simfilterparam.minmatchlength,
                          call_info->simfilterparam.seedlength,
                          call_info->simfilterparam.exdrop,
                          call_info->simfilterparam.prminmatchlen,
                          call_info->simfilterparam.prseedlength,
                          call_info->simfilterparam.prhdist,
                          call_info->translationtable,
                          call_info->simfilterparam.online,
                          call_info->simfilterparam.noautoindex,
                          call_info->simfilterparam.maskpolyAtails,
                          false);

  match_processor_info_init(&match_processor_info, matches, chain_collection,
                            directmatches, refseqisdna,
                            call_info->simfilterparam.online,
                            call_info->simfilterparam.inverse, stat,
                            &chaining_info,
                            call_info->simfilterparam.maxnumofmatches,
                            call_info->simfilterparam.rare,
                            call_info->fragweightfactor,
                            plugins->jump_table_new,
                            plugins->jump_table_new_reverse,
                            plugins->jump_table_delete);

  /* a per-sequence match counter is needed if the number of matches is
     limited or if the match number distribution is recorded */
  if (call_info->simfilterparam.maxnumofmatches > 0 ||
      gth_stat_get_matchnumdistri(stat)) {
    size_t counter_bytes;

    numofsequences = gth_input_num_of_ref_seqs(input, ref_file_num);
    counter_bytes = sizeof (GtUword) * numofsequences;

    /* allocate the counter and start all counts at 0 */
    match_processor_info.matchnumcounter = gt_malloc(counter_bytes);
    memset(match_processor_info.matchnumcounter, 0, counter_bytes);
  }

  /* free input, which contains the virtual trees.
     vmatch loads the virtual trees into memory as well, so this prevents
     them from being loaded twice. */
  gth_input_delete_current(input);

  /* call matcher */
  if (call_info->out->showverbose)
    call_info->out->showverbose("call vmatch to compute matches");

  plugins->matcher_runner(matcher_arguments, call_info->out->showverbose,
                          call_info->out->showverboseVM, &match_processor_info);

  /* free matcher stuff here, because otherwise the reference file is mapped
     twice below */
  plugins->matcher_arguments_delete(matcher_arguments);

  /* free sequence collections (if they have been filled by the matcher) */
  gth_seq_con_delete(match_processor_info.gen_seq_con);
  gth_seq_con_delete(match_processor_info.ref_seq_con);

  /* save match numbers of match number distribution, if necessary */
  if (gth_stat_get_matchnumdistri(stat)) {
    for (seqidx = 0; seqidx < numofsequences; seqidx++) {
      GtUword count = match_processor_info.matchnumcounter[seqidx];
      if (count > 0)
        gth_stat_add_to_matchnumdistri(stat, count);
    }
  }

  /* free match number counter */
  gt_free(match_processor_info.matchnumcounter);

  if (gt_array_size(matches)) {
    /* load genomic file back into memory */
    gth_input_load_genomic_file(input, gen_file_num, true);

    /* load reference file back into memory */
    gth_input_load_reference_file(input, ref_file_num, true);

    /* compute chains from matches */
    calc_chains_from_matches(chain_collection, matches, &chaining_info,
                             gth_input_current_gen_seq_con(input),
                             gth_input_current_ref_seq_con(input),
                             call_info->simfilterparam.rare,
                             call_info->fragweightfactor,
                             plugins->jump_table_new,
                             plugins->jump_table_new_reverse,
                             plugins->jump_table_delete);

    if (call_info->out->showverbose) {
      call_info->out->showverbose("sort global chains according to reference "
                                  "sequence coverage");
    }

    /* sort chains */
    gth_chain_collection_sort(chain_collection);
  }
  else if (call_info->out->comments) {
    /* nothing to chain: only report that no match has been found */
    gt_file_xprintf(outfp, "%c no match has been found\n", COMMENTCHAR);
  }

  /* the match array is released on both paths */
  gt_array_delete(matches);
}
/*
 * Writes the XML <header> element of a run to the output file of
 * <call_info>, starting at nesting depth <indentlevel>.
 *
 * The namespace of the element depends on call_info->intermediate. The
 * header contains, in this order: the <source> element (<gth_version>, build
 * date, <timestring>), the genomic file names, the reference file names with
 * their type, the splice site parameters, a <parameters> element (BSSM file,
 * score matrix file, search mode, and the program arguments <args> as an XML
 * comment) and the overall reference type.
 *
 * <input> must contain at least one genomic and one reference file.
 */
static void show_xml_run_header(GthCallInfo *call_info, GthInput *input,
                                const char *timestring, const char *gth_version,
                                unsigned int indentlevel, const char **args)
{
  GtFile *outfp = call_info->out->outfp;
  GtUword i;

  /* opening tag; intermediate output uses a different namespace */
  gth_indent(outfp, indentlevel);
  if (call_info->intermediate) {
    gt_file_xprintf(outfp, "<header xmlns=\"http://www.GenomeThreader.org/"
                       "SplicedAlignment/header/\">\n");
  }
  else {
    gt_file_xprintf(outfp,
              "<header xmlns=\"http://www.genomethreader.org/GTH_output/"
              "header/\">\n");
  }

  /* at least one genomic file defined */
  gt_assert(gth_input_num_of_gen_files(input));
  /* at least one reference file defined */
  gt_assert(gth_input_num_of_ref_files(input));

  /* show a readable version of GthCallInfo. That is, it is shown with which
     parameters the program was called */

  indentlevel++;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<source program=\"GenomeThreader\" version=\"%s\" "
                  "build_date=\"%s\" run_date=\"%s\"/>\n", gth_version,
                  GT_BUILT, timestring);

  /* show genomic file names */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<gDNA_template_files>\n");
  indentlevel++;

  for (i = 0; i < gth_input_num_of_gen_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<temp_name>%s</temp_name>\n",
                    gth_input_get_genomic_filename(input, i));
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</gDNA_template_files>\n");

  /* show reference file names; the type attribute is derived from the
     alphabet type of the respective file */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<reference_files>\n");
  indentlevel++;

  for (i = 0; i < gth_input_num_of_ref_files(input); i++) {
    gth_indent(outfp, indentlevel);
    gt_file_xprintf(outfp, "<file ref_name=\"%s\" type=\"%s\"/>\n",
                    gth_input_get_reference_filename(input, i),
                    gth_input_get_alphatype(input, i) == DNA_ALPHA
                    ? "ESTcDNA" : "Protein");
  }

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</reference_files>\n");

  /* a species number equal to NUMOFSPECIES selects the generic species name
     instead of an entry of speciestab */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<splice_site_parameters parameter_type=\"%s\" "
                  "species=\"%s\"/>\n", SPLICE_SITE_MODEL_NAME,
                  call_info->speciesnum ==  NUMOFSPECIES
                  ? GENERIC_SPECIES_NAME
                  : speciestab[call_info->speciesnum]);

  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameters>\n");
  indentlevel++;

  /* output name of BSSM file */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"bssmfile\" value=\"%s\"/>\n",
                  gth_input_bssmfilename(input));

  /* output name of scorematrix */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"scorematrixfile\" value=\"%s\"/>\n",
                  gt_str_get(call_info->scorematrixfile));

  /* output searchmode; the value previously ended in an unmatched ')' which
     has been removed */
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "<parameter name=\"searchmode\" "
                  "value=\"forward=%s,reverse=%s\"/>\n",
                  GTH_SHOWBOOL(gth_input_forward(input)),
                  GTH_SHOWBOOL(gth_input_reverse(input)));

  /* output arguments as comment (written without indentation) */
  gt_file_xprintf(outfp, "<!--\n%c Arguments: ", COMMENTCHAR);
  gt_cstr_array_show_genfile(args, outfp);
  gt_file_xprintf(outfp, "-->\n");

  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</parameters>\n");

  show_overall_reference_type(gth_input_overall_alphatype(input),
                              indentlevel, outfp);

  /* closing tag, back at the nesting depth of the opening tag */
  indentlevel--;
  gth_indent(outfp, indentlevel);
  gt_file_xprintf(outfp, "</header>\n");
}