/*
  Return the scaled score for pairing the character <amino> with the
  reference character <origreferencechar>.

  The result is one of:
  1.) SCALEFACTOR * INDEL_PENALTY if either character is DASH,
  2.) SCALEFACTOR * -2 * INDEL_PENALTY if both characters are GT_STOP_AMINO,
      SCALEFACTOR * 2 * INDEL_PENALTY if exactly one of them is,
  3.) SCALEFACTOR times the entry of <score_matrix> for the two encoded
      characters (a code equal to WILDCARD is mapped to the last index of
      <score_matrix_alphabet>),
  4.) 0.0 if <amino> is WILDCARD or one of the characters is not a valid
      input of <score_matrix_alphabet>.
*/
static GthFlt get_score(GtScoreMatrix *score_matrix,
                        GtAlphabet *score_matrix_alphabet,
                        unsigned char amino,
                        unsigned char origreferencechar)
{
  GthFlt score = 0.0;
  const GthFlt scale = SCALEFACTOR,
               penalty = INDEL_PENALTY;
  GtUchar amino_code, reference_code;
  int wildcard_idx;

  /* 1.) gap on either side: scaled indel penalty, irrespective of size */
  if (amino == DASH || origreferencechar == DASH) {
    score = scale * penalty;
    return score;
  }

  /* 4.) anything the score matrix alphabet cannot handle stays neutral */
  if (amino == WILDCARD ||
      amino > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, amino) ||
      origreferencechar > CHAR_MAX ||
      !gt_alphabet_valid_input(score_matrix_alphabet, origreferencechar)) {
    return score;
  }

  /* 2.) at least one stop codon is involved */
  if (amino == GT_STOP_AMINO || origreferencechar == GT_STOP_AMINO) {
    if (amino == GT_STOP_AMINO && origreferencechar == GT_STOP_AMINO)
      score = scale * -2 * penalty;
    else
      score = scale * 2 * penalty;
    return score;
  }

  /* 3.) ordinary substitution: look the pair up in the score matrix */
  amino_code = gt_alphabet_encode(score_matrix_alphabet, amino);
  reference_code = gt_alphabet_encode(score_matrix_alphabet,
                                      origreferencechar);
  wildcard_idx = gt_alphabet_size(score_matrix_alphabet) - 1;
  score = scale *
          gt_score_matrix_get_score(score_matrix,
                                    amino_code == WILDCARD ? wildcard_idx
                                                           : amino_code,
                                    reference_code == WILDCARD
                                      ? wildcard_idx
                                      : reference_code);
  return score;
}
/*
  Add <len> base/quality pairs taken from <seq> and <qual> to the counts in
  <bqd>.

  Each quality value is raised to bqd->qrange_start and lowered to
  bqd->qrange_end when the respective bound is not GT_UNDEF_UINT. Symbols
  whose code equals WILDCARD are counted in column bqd->wildcard_indx.
  bqd->min_qual and bqd->max_qual are widened to cover every (clamped)
  quality value seen. Always returns 0.
*/
static int hcr_base_qual_distr_add(GtBaseQualDistr *bqd,
                                   const GtUchar *qual,
                                   const GtUchar *seq,
                                   GtUword len)
{
  GtUword pos;

  for (pos = 0; pos < len; pos++) {
    unsigned symbol_code, quality;

    /* translate the stored symbol to its printable form and re-encode it
       with the distribution's alphabet */
    symbol_code = (unsigned)
                  gt_alphabet_encode(bqd->alpha,
                                     (char) gt_alphabet_pretty_symbol(
                                                        bqd->alpha,
                                                        (unsigned) seq[pos]));
    quality = (unsigned) qual[pos];

    /* clamp the quality value into the configured range, if any */
    if (bqd->qrange_start != GT_UNDEF_UINT &&
        quality <= bqd->qrange_start) {
      quality = bqd->qrange_start;
    }
    if (bqd->qrange_end != GT_UNDEF_UINT &&
        quality >= bqd->qrange_end) {
      quality = bqd->qrange_end;
    }

    /* count the pair; wildcards share one dedicated column */
    if (symbol_code == WILDCARD) {
      bqd->distr[quality][bqd->wildcard_indx]++;
    }
    else {
      bqd->distr[quality][symbol_code]++;
    }

    /* keep track of the observed quality range */
    if (quality > bqd->max_qual) {
      bqd->max_qual = quality;
    }
    if (quality < bqd->min_qual) {
      bqd->min_qual = quality;
    }
  }
  return 0;
}
/*
  Convert the BAM base code <base> to the corresponding code of <alphabet>.
  BAMBASEA, BAMBASEC, BAMBASEG and BAMBASET map to the encodings of 'A',
  'C', 'G' and 'T'; every other value maps to the encoding of the
  alphabet's wildcard character.
*/
static GtUchar bambase2gtbase(uint8_t base, GtAlphabet *alphabet)
{
  char symbol;

  switch (base) {
    case BAMBASEA:
      symbol = 'A';
      break;
    case BAMBASEC:
      symbol = 'C';
      break;
    case BAMBASEG:
      symbol = 'G';
      break;
    case BAMBASET:
      symbol = 'T';
      break;
    default:
      /* anything else is represented by the wildcard symbol */
      symbol = (char) gt_alphabet_wildcard_show(alphabet);
      break;
  }
  return gt_alphabet_encode(alphabet, symbol);
}
/*
  Print the GC-, AT- and N-content of the sequence <seq> of length <len>
  to <outfp> as percentages of <len>.

  <alphabet> must be a DNA alphabet. Every character of <seq> is encoded
  with <alphabet> and must encode to the code of G/C, A/T or N; any other
  code triggers an assertion.

  For an empty sequence (<len> == 0) all three percentages are reported as
  0.00% instead of dividing by zero (which would print NaN values).
*/
void gt_gc_content_show(const char *seq, unsigned long len,
                        GtAlphabet *alphabet, GtFile *outfp)
{
  unsigned long i,
                gc = 0, /* number of G/C bases */
                at = 0, /* number of A/T bases */
                n  = 0; /* number of N   bases */
  unsigned int a_code, c_code, g_code, t_code, n_code, cc;
  double gc_percent = 0.0,
         at_percent = 0.0,
         n_percent = 0.0;

  gt_assert(seq && alphabet);
  gt_assert(gt_alphabet_is_dna(alphabet));

  /* encode the reference symbols once, compare codes in the loop */
  a_code = gt_alphabet_encode(alphabet, 'A');
  c_code = gt_alphabet_encode(alphabet, 'C');
  g_code = gt_alphabet_encode(alphabet, 'G');
  t_code = gt_alphabet_encode(alphabet, 'T');
  n_code = gt_alphabet_encode(alphabet, 'N');

  for (i = 0; i < len; i++) {
    cc = gt_alphabet_encode(alphabet, seq[i]);
    if (cc == g_code || cc == c_code)
      gc++;
    else if (cc == a_code || cc == t_code)
      at++;
    else if (cc == n_code)
      n++;
    else {
      gt_assert(0);
    }
  }

  /* only divide when there is something to divide by */
  if (len > 0) {
    gc_percent = ((double) gc / len) * 100.0;
    at_percent = ((double) at / len) * 100.0;
    n_percent = ((double) n / len) * 100.0;
  }

  gt_file_xprintf(outfp, "GC-content: %.2f%% (AT-content: %.2f%%, "
                         "N-content: %.2f%%)\n",
                  gc_percent, at_percent, n_percent);
}
/*
  Allocate a new GtBaseQualDistr and fill it from <fp>.

  The stream is read in this order: one GtUword holding the number of
  entries, followed per entry by a character, a quality value (unsigned
  char) and a frequency (GtUint64). Each character is encoded with <alpha>;
  a code equal to WILDCARD is stored in the last column (alphabet size - 1).
  The smallest and largest quality value read are recorded in the result,
  which is finally passed to hcr_base_qual_distr_trim().
*/
static GtBaseQualDistr* hcr_base_qual_distr_new_from_file(FILE *fp,
                                                          GtAlphabet *alpha)
{
  GtBaseQualDistr *result;
  char stored_symbol;
  GtUchar symbol_code;
  unsigned char quality;
  unsigned num_symbols,
           lowest_seen = HCR_HIGHESTQUALVALUE,
           highest_seen = HCR_LOWESTQUALVALUE;
  GtUword num_entries, entry;
  GtUint64 frequency;
  GT_UNUSED size_t items_read, expected = (size_t) 1;

  num_symbols = gt_alphabet_size(alpha);

  /* one row per possible quality value, one column per alphabet symbol */
  result = gt_malloc(sizeof (GtBaseQualDistr));
  /* no trailing semicolon, as in the original call -- presumably the macro
     expands to a complete statement (TODO confirm) */
  gt_array2dim_calloc(result->distr, HCR_HIGHESTQUALVALUE + 1UL, num_symbols)
  result->ncols = num_symbols;
  result->nrows = HCR_HIGHESTQUALVALUE + 1U;
  result->qual_offset = HCR_LOWESTQUALVALUE;
  result->wildcard_indx = num_symbols - 1;

  items_read = gt_xfread_one(&num_entries, fp);
  gt_assert(items_read == expected);

  for (entry = 0; entry < num_entries; entry++) {
    /* each entry is a (character, quality, frequency) triple */
    items_read = gt_xfread_one(&stored_symbol, fp);
    gt_assert(items_read == expected);
    items_read = gt_xfread_one(&quality, fp);
    gt_assert(items_read == expected);
    items_read = gt_xfread_one(&frequency, fp);
    gt_assert(items_read == expected);

    symbol_code = gt_alphabet_encode(alpha, stored_symbol);
    if (symbol_code == (GtUchar) WILDCARD) {
      gt_safe_assign(symbol_code, result->wildcard_indx);
    }
    result->distr[quality][symbol_code] = frequency;

    if ((unsigned) quality > highest_seen) {
      highest_seen = quality;
    }
    if ((unsigned) quality < lowest_seen) {
      lowest_seen = quality;
    }
  }

  result->min_qual = lowest_seen;
  result->max_qual = highest_seen;
  hcr_base_qual_distr_trim(result);
  return result;
}
double gt_pck_getGCcontent(const FMindex *bwtSubject, const GtAlphabet *alphabet) { GtUword c, length; double gc; const MRAEnc *FM_alphabet; GtUchar c_sym; FM_alphabet = BWTSeqGetAlphabet((const BWTSeq *) bwtSubject); c_sym = MRAEncMapSymbol(FM_alphabet, gt_alphabet_encode(alphabet, 'c')); length = ((const BWTSeq *) bwtSubject)->seqIdx->seqLen; c = ((const BWTSeq *) bwtSubject)->count[c_sym+1] - ((const BWTSeq *) bwtSubject)->count[c_sym]; gc = c * 2 / (double) (length - 2); return gc; }
/*
  Runner of the seed_extend tool.

  <tool_arguments> points to the GtSeedExtendArguments to use; <argc>,
  <argv> and <parsed_args> are not used. The function
  - loads the encoded sequence named by dbs_indexname (A) and, if
    dbs_queryname is not empty, the one named by dbs_queryname (B);
    otherwise B is a further reference to A,
  - optionally derives se_perc_match_hist from the base composition of A
    (option bias_parameters, DNA alphabets only),
  - prepares the greedy/xdrop extension and output options that were
    requested, and
  - runs gt_diagbandseed_run() on A and B.

  Returns 0 on success and a non-zero value on failure. On the failure
  paths handled here the encoded sequences loaded so far are released
  before returning.
*/
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *encseq_loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *grextinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *querymatchoutopt = NULL;
  GtTimer *seedextendtimer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage = 0UL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* Calculate error percentage from minidentity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* Measure whole running time */
  if (arguments->benchmark || arguments->verbose) {
    gt_showtime_enable();
  }
  /* the timer only exists while showtime is enabled; it is reported and
     deleted at the end of this function under the same condition */
  if (gt_showtime_enabled()) {
    seedextendtimer = gt_timer_new();
    gt_timer_start(seedextendtimer);
  }

  /* Load encseq A */
  encseq_loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(encseq_loader);
  aencseq = gt_encseq_loader_load(encseq_loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL)
    had_err = -1;

  /* If there is a 2nd read set: Load encseq B */
  if (!had_err) {
    if (strcmp(gt_str_get(arguments->dbs_queryname), "") != 0) {
      bencseq = gt_encseq_loader_load(encseq_loader,
                                      gt_str_get(arguments->dbs_queryname),
                                      err);
    } else {
      /* no separate query set: B is another reference to A */
      bencseq = gt_encseq_ref(aencseq);
    }
    if (bencseq == NULL) {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(encseq_loader);

  /* set character access method */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0)) {
    cam = gt_greedy_extend_char_access(gt_str_get
                                       (arguments->se_char_access_mode),
                                       err);
    /* a value of -1 signals that the access mode could not be determined */
    if ((int) cam == -1) {
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters) {
    const GtAlphabet *alpha = gt_encseq_alphabet(aencseq);
    /* factor applied to the error percentage, indexed by bias_index */
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};

    if (gt_alphabet_is_dna(alpha)) {
      GtUword at, cg;
      /* count a+t and c+g characters of sequence set A */
      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'g'));
      if (at + cg > 0) {
        /* share of the rarer base pair class among all a/c/g/t characters */
        const double ratio = (double)MIN(at, cg) / (at + cg);
        int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);
        gt_assert(bias_index < 10);
        arguments->se_maxalilendiff = 30;
        arguments->se_perc_match_hist
          = (GtUword)(100.0 - errorpercentage * bias_factor[bias_index]);
        if (arguments->verbose) {
          printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
                 ratio, arguments->se_perc_match_hist);
        }
      } else {
        /* no a/c/g/t characters at all */
        had_err = -1;
      }
    } else {
      had_err = -1;
    }
    /* both failure cases above (non-DNA alphabet, no a/c/g/t characters)
       are reported with the same message */
    if (had_err) {
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                        "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Prepare options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy)) {
    grextinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                               arguments->se_maxalilendiff,
                                               arguments->se_historysize,
                                               arguments->se_perc_match_hist,
                                               arguments->se_alignlength,
                                               cam,
                                               arguments->se_extendgreedy);
    if (arguments->benchmark) {
      gt_greedy_extend_matchinfo_silent_set(grextinfo);
    }
  }

  /* Prepare options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop)) {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark) {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* Prepare output options */
  if (!had_err && (arguments->se_alignmentwidth >
                   0 || gt_option_is_set(arguments->se_option_xdrop))) {
    querymatchoutopt
      = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);

    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy)) {
      /* se_extendgreedy if greedy extension was requested, otherwise 100 */
      const GtUword sensitivity
        = gt_option_is_set(arguments->se_option_greedy)
            ? arguments->se_extendgreedy : 100;

      gt_querymatchoutoptions_extend(querymatchoutopt,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* Start algorithm */
  if (!had_err) {
    GtDiagbandseed dbsarguments;
    /* collect all settings for gt_diagbandseed_run() in one structure */
    dbsarguments.errorpercentage = errorpercentage;
    dbsarguments.userdefinedleastlength = arguments->se_alignlength;
    dbsarguments.seedlength = arguments->dbs_seedlength;
    dbsarguments.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbsarguments.mincoverage = arguments->dbs_mincoverage;
    dbsarguments.maxfreq = arguments->dbs_maxfreq;
    dbsarguments.memlimit = arguments->dbs_memlimit;
    dbsarguments.mirror = arguments->mirror;
    dbsarguments.overlappingseeds = arguments->overlappingseeds;
    dbsarguments.verify = arguments->dbs_verify;
    dbsarguments.verbose = arguments->verbose;
    dbsarguments.debug_kmer = arguments->dbs_debug_kmer;
    dbsarguments.debug_seedpair = arguments->dbs_debug_seedpair;
    dbsarguments.seed_display = arguments->seed_display;
    dbsarguments.extendgreedyinfo = grextinfo;
    dbsarguments.extendxdropinfo = xdropinfo;
    dbsarguments.querymatchoutopt = querymatchoutopt;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbsarguments, err);

    /* clean up */
    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    /* each helper object is deleted under the condition it was created
       with */
    if (gt_option_is_set(arguments->se_option_greedy)) {
      gt_greedy_extend_matchinfo_delete(grextinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop)) {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop)) {
      gt_querymatchoutoptions_delete(querymatchoutopt);
    }
  }

  /* report the overall running time (successful runs only) and release
     the timer */
  if (gt_showtime_enabled()) {
    if (!had_err) {
      char *keystring =
        gt_seed_extend_params_keystring(gt_option_is_set(arguments->
                                                         se_option_greedy),
                                        gt_option_is_set(arguments->
                                                         se_option_xdrop),
                                        arguments->dbs_seedlength,
                                        arguments->se_alignlength,
                                        arguments->se_minidentity,
                                        arguments->se_maxalilendiff,
                                        arguments->se_perc_match_hist,
                                        arguments->se_extendgreedy,
                                        arguments->se_extendxdrop,
                                        arguments->se_xdropbelowscore);
      printf("# TIME seedextend-%s", keystring);
      gt_free(keystring);
      gt_timer_show_formatted(seedextendtimer,
                              " overall " GT_WD ".%06ld\n",
                              stdout);
    }
    gt_timer_delete(seedextendtimer);
  }
  return had_err;
}
/*
  Parameterize the model of <bssm_param> selected by <termtype> from the
  training data files below <path>.

  For each of the NUMOFFILES entries of <filenames> the file
  <path>/GT_donor/<name>, <path>/GC_donor/<name> or
  <path>/AG_acceptor/<name> (depending on <termtype>, with ".gz" appended
  if <gzip> is true) is opened. Every sequence in it must have length
  STRINGSIZE and must carry the bases matching <termtype> (GT, GC or AG) at
  the 0-based positions 50 and 51; the verified file is then passed to
  build_bssm().

  Returns 0 on success; on failure -1 is returned and <err> is set.

  The two bases are only read after the length check has passed, so that a
  sequence of the wrong length is never accessed at positions 50 and 51.
*/
int gth_bssm_param_parameterize(GthBSSMParam *bssm_param, const char *path,
                                Termtype termtype, bool gzip, GtError *err)
{
  GtAlphabet *alphabet = NULL;
  GtBioseq *bioseq;
  GtStr *file2proc;
  GtUword i, j;
  int had_err = 0;

  gt_error_check(err);

  file2proc = gt_str_new();

  /* set version number */
  bssm_param->version_num = (unsigned char) MYVERSION;

  /* set model to true and set window sizes */
  switch (termtype) {
    case GT_DONOR_TYPE:
      bssm_param->gt_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gt_donor_model);
      break;
    case GC_DONOR_TYPE:
      bssm_param->gc_donor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->gc_donor_model);
      break;
    case AG_ACCEPTOR_TYPE:
      bssm_param->ag_acceptor_model_set = true;
      set_window_sizes_in_Bssmmodel(&bssm_param->ag_acceptor_model);
      break;
    default: gt_assert(0);
  }

  for (i = 0; !had_err && i < NUMOFFILES; i++) {
    /* process datafile */
    gt_str_append_cstr(file2proc, path);
    switch (termtype) {
      case GT_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GT_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case GC_DONOR_TYPE:
        gt_str_append_cstr(file2proc, "/GC_donor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      case AG_ACCEPTOR_TYPE:
        gt_str_append_cstr(file2proc, "/AG_acceptor/");
        gt_str_append_cstr(file2proc, filenames[i]);
        break;
      default: gt_assert(0);
    }

    if (gzip)
      gt_str_append_cstr(file2proc, ".gz");

    if (!(bioseq = gt_bioseq_new(gt_str_get(file2proc), err)))
      had_err = -1;

    if (!had_err)
      alphabet = gt_bioseq_get_alphabet(bioseq);

    /* check here if all sequences have the length 102 and correct bases at
       positions 51 and 52 (i.e., GT, GC, or AG) */
    for (j = 0; !had_err && j < gt_bioseq_number_of_sequences(bioseq); j++) {
      GtUchar encoded_seq[2];

      /* check length */
      if (gt_bioseq_get_sequence_length(bioseq, j) != STRINGSIZE) {
        gt_error_set(err,
                     "sequence "GT_WU" in file \"%s\" does not have length %u",
                     j, gt_str_get(file2proc), STRINGSIZE);
        had_err = -1;
      }

      if (!had_err) {
        /* the length has been verified, the two bases can be read safely */
        encoded_seq[0] = gt_bioseq_get_encoded_char(bioseq, j, 50);
        encoded_seq[1] = gt_bioseq_get_encoded_char(bioseq, j, 51);

        /* check base correctness */
        switch (termtype) {
          case GT_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'T')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GT "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case GC_DONOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'G') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'C')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a GC "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          case AG_ACCEPTOR_TYPE:
            if (encoded_seq[0] != gt_alphabet_encode(alphabet, 'A') ||
                encoded_seq[1] != gt_alphabet_encode(alphabet, 'G')) {
              gt_error_set(err, "sequence "GT_WU" in file \"%s\" is not a AG "
                                "sequence", j, gt_str_get(file2proc));
              had_err = -1;
            }
            break;
          default: gt_assert(0);
        }
      }
    }

    /* all sequences of this file are fine: add them to the model */
    if (!had_err) {
      switch (termtype) {
        case GT_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gt_donor_model, i);
          break;
        case GC_DONOR_TYPE:
          build_bssm(bioseq, &bssm_param->gc_donor_model, i);
          break;
        case AG_ACCEPTOR_TYPE:
          build_bssm(bioseq, &bssm_param->ag_acceptor_model, i);
          break;
        default: gt_assert(0);
      }
    }

    /* reset */
    gt_str_reset(file2proc);

    /* free space */
    gt_bioseq_delete(bioseq);
  }
  gt_str_delete(file2proc);

  return had_err;
}
/*
  Determine the poly-A tail position of the spliced alignment <sa> and
  store it in sa->polyAtailpos (both components are 0 if nothing is found).

  <ref_seq_tran> is the encoded reference sequence, <ref_alphabet> the
  alphabet used to encode the characters 'A' and 'T' it is compared with.

  First the reference is scanned forward for a run of 'A's, starting at
  the position after the last exon's right reference border or at
  CALCPOLYATAILWINDOW + 1 positions before the reference end, as selected
  below. If that yields fewer than MINIMUMPOLYATAILLENGTH hits, the
  reference is scanned backward for a run of 'T's, starting before the
  first exon's left reference border or at CALCPOLYATAILWINDOW - 1. Each
  scan tolerates one non-matching character once at least one hit has been
  counted.
*/
void gth_sa_calc_polyAtailpos(GthSA *sa, const unsigned char *ref_seq_tran,
                              GtAlphabet *ref_alphabet)
{
  GtUword hits, misses, last_exon_right, ref_length;
  GtWord pos, first_exon_left;

  sa->polyAtailpos.start = 0;
  sa->polyAtailpos.end = 0;
  hits = misses = 0;

  last_exon_right = ((Exoninfo*) gt_array_get_last(sa->exons))
                    ->rightreferenceexonborder;
  first_exon_left = ((Exoninfo*) gt_array_get_first(sa->exons))
                    ->leftreferenceexonborder;
  ref_length = gth_sa_ref_total_length(sa);

  /* choose the start position of the forward scan */
  if ((last_exon_right + 1) >=
      (ref_length - 1 - CALCPOLYATAILWINDOW)) {
    pos = gt_safe_cast2long(last_exon_right + 1);
  }
  else if (ref_length < 1 + CALCPOLYATAILWINDOW) {
    pos = 0;
  }
  else {
    pos = ref_length - 1 - CALCPOLYATAILWINDOW;
  }

  /* forward scan for a run of 'A's */
  for (/* pos already set */; pos < gt_safe_cast2long(ref_length); pos++) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'A')) {
      hits++;
    }
    else if (hits > 0 && misses < 1) {
      /* tolerate a single interruption of a started run */
      misses++;
    }
    else if (hits >= MINIMUMPOLYATAILLENGTH) {
      /* the run is long enough, stop right behind it */
      break;
    }
    else {
      /* run too short, start over */
      hits = misses = 0;
    }
  }

  if (hits >= MINIMUMPOLYATAILLENGTH) {
    sa->polyAtailpos.start = gt_safe_cast2ulong(pos - hits - misses);
    sa->polyAtailpos.end = pos - 1;
    return;
  }

  /* nothing found at the end of the reference: try the beginning */
  hits = misses = 0;

  /* choose the start position of the backward scan */
  if ((first_exon_left - 1) <= CALCPOLYATAILWINDOW) {
    pos = first_exon_left - 1;
  }
  else {
    pos = CALCPOLYATAILWINDOW - 1;
  }

  /* backward scan for a run of 'T's */
  for (/* pos already set */; pos >= 0; pos--) {
    if (ref_seq_tran[pos] == gt_alphabet_encode(ref_alphabet, 'T')) {
      hits++;
    }
    else if (hits > 0 && misses < 1) {
      misses++;
    }
    else if (hits >= MINIMUMPOLYATAILLENGTH) {
      break;
    }
    else {
      hits = misses = 0;
    }
  }

  if (hits >= MINIMUMPOLYATAILLENGTH) {
    /* for the backward scan start is the larger of the two positions */
    sa->polyAtailpos.start = gt_safe_cast2ulong(pos + hits + misses);
    sa->polyAtailpos.end = pos + 1;
  }
}
/*
  Parse one score line of a score matrix file from the tokenizer <tz> and
  store the scores in <sm>.

  The first token of the line must be a single character (the row
  character); it is marked in <parsed_characters> and reported as an error
  if it was marked before. Each following token up to the start of the next
  line is parsed as an integer score; the i-th score is stored for the pair
  (row character, i-th character of <index_to_alpha_char_mapping>).
  Characters encoding to WILDCARD are stored at index
  gt_alphabet_num_of_chars(sm->alphabet).

  Returns 0 on success; on failure a non-zero value is returned and <err>
  is set. The token being processed is released on every path, including
  the one where a score cannot be parsed.
*/
static int parse_score_line(GtScoreMatrix *sm, GtTokenizer *tz,
                            GtArray *index_to_alpha_char_mapping,
                            char *parsed_characters, GtError *err)
{
  unsigned int num_of_chars, i = 0;
  char amino_acid;
  int score, had_err = 0;
  GtStr *token;

  gt_assert(sm && tz && index_to_alpha_char_mapping);
  gt_error_check(err);

  token = gt_tokenizer_get_token(tz);
  gt_assert(token);
  if (gt_str_length(token) != 1) {
    gt_error_set(err, "illegal character token '%s' on line %lu in file '%s'",
                 gt_str_get(token), gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  amino_acid = gt_str_get(token)[0];

  /* check for character duplications */
  /* NOTE(review): where char is signed, a character with the high bit set
     gives a negative index here -- confirm the size and indexing of the
     caller's table before changing this */
  if (parsed_characters[(int) amino_acid]) {
    gt_error_set(err, "multiple character '%c' entry on line %lu in file '%s'",
                 amino_acid, gt_tokenizer_get_line_number(tz),
                 gt_tokenizer_get_filename(tz));
    had_err = -1;
  }
  parsed_characters[(int) amino_acid] = GT_UNDEF_CHAR;
  gt_str_delete(token);

  if (!had_err) {
    num_of_chars = gt_alphabet_num_of_chars(sm->alphabet);
    gt_tokenizer_next_token(tz);
    while ((token = gt_tokenizer_get_token(tz))) {
      unsigned int idx1, idx2;

      /* the tokenizer can return tokens which are empty except for a newline
         -> skip these */
      if (!strcmp(gt_str_get(token), "\n")) {
        gt_str_delete(token);
        gt_tokenizer_next_token(tz);
        if (gt_tokenizer_line_start(tz))
          break;
        continue;
      }

      /* token is not empty -> parse score */
      had_err = gt_parse_int_line(&score, gt_str_get(token),
                                  gt_tokenizer_get_line_number(tz),
                                  gt_tokenizer_get_filename(tz), err);
      if (had_err) {
        /* release the current token before leaving the loop, it would be
           leaked otherwise */
        gt_str_delete(token);
        break;
      }

      idx1 = gt_alphabet_encode(sm->alphabet, amino_acid);
      idx2 = gt_alphabet_encode(sm->alphabet,
                                *(char*)
                                gt_array_get(index_to_alpha_char_mapping, i));
      /* wildcards are stored behind the regular characters */
      gt_score_matrix_set_score(sm,
                                idx1 == WILDCARD ? num_of_chars : idx1,
                                idx2 == WILDCARD ? num_of_chars : idx2,
                                score);
      i++;
      gt_str_delete(token);
      gt_tokenizer_next_token(tz);
      if (gt_tokenizer_line_start(tz))
        break;
    }
  }
  return had_err;
}
/*
  Compute, for every query sequence read from arguments->queryname, the
  sum or average of the shustring lengths against the packed index named
  by arguments->indexname.

  If arguments->shulen_only is set, the sum of the shustring lengths of
  each query is printed. Otherwise the average shustring length and the
  GC-content of the query are used to compute a divergence with
  gt_divergence(), and the Kr value derived from it with gt_calculateKr()
  is printed.

  The alphabet of the index must be a DNA alphabet, as the GC-content of
  the queries is needed.

  Returns 0 on success and a non-zero value on failure. Failures detected
  here (index or query file cannot be opened, non-DNA alphabet, error while
  reading a query) are reported via <err>; the errors of the called
  constructors and of the sequence iterator are presumably set by those
  functions, which receive <err> -- TODO confirm.
*/
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0, g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         gc_query;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(
                                           arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL) {
    had_err = 1;
  }
  else {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err) {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(arguments->queryname, err);
    if (queries == NULL) {
      /* the query file could not be opened: fail instead of continuing
         with a NULL iterator */
      had_err = 1;
    }
    else {
      alphabet = gt_encseq_alphabet(encseq);
      /* makes assumption that alphabet is dna, it has to calculate the gc! */
      if (!gt_alphabet_is_dna(alphabet)) {
        /* report through err, so that the caller gets the reason for the
           non-zero return value */
        gt_error_set(err, "Sequences need to be dna");
        had_err = 1;
      }
      else {
        symbolmap = gt_alphabet_symbolmap(alphabet);
        gt_seqiterator_set_symbolmap(queries, symbolmap);
        c_sym = gt_alphabet_encode(alphabet, 'c');
        g_sym = gt_alphabet_encode(alphabet, 'g');
      }
    }
  }

  for (queryNo = 0; !had_err; queryNo++) {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if (retval != 1) {
      if (retval < 0) {
        /* reading the query failed: clean up and propagate the failure
           instead of returning success */
        gt_free(description);
        had_err = 1;
      }
      break;
    }
    gt_logger_log(logger, "found query of length: %lu", queryLength);

    avgShuLength = 0.0;
    gc_query = 0.0;

    /* sum up the shustring lengths of all suffixes of the query and count
       its c/g characters on the way */
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++) {
      currentShuLength = (double) gt_pck_getShuStringLength(
                                              subjectindex,
                                              &currentQuery[currentSuffix],
                                              queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym) {
        gc_query++;
      }
    }

    if (arguments->shulen_only) {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else {
      /* turn the sums into averages */
      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                    (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

      /* XXX add error checks */
      if (!had_err) {
        double div, kr;

        gt_logger_log(logger, "shulen:\n%f", avgShuLength);
        gt_log_log("shu: %f, gc: %f, len: %lu",
                   avgShuLength, gc_query, subjectLength);
        div = gt_divergence(arguments->divergence_rel_err,
                            arguments->divergence_abs_err,
                            arguments->divergence_m,
                            arguments->divergence_threshold,
                            avgShuLength,
                            subjectLength,
                            gc_query,
                            ln_n_fac,
                            arguments->max_ln_n_fac);
        gt_logger_log(logger, "divergence:\n%f", div);

        kr = gt_calculateKr(div);
        printf("# Kr:\n%f\n", kr);
      }
    }
  }

  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
static int gt_encseq_info_runner(GT_UNUSED int argc, const char **argv, int parsed_args, void *tool_arguments, GtError *err) { GtEncseqInfoArguments *arguments = tool_arguments; int had_err = 0; GtAlphabet *alpha; const GtUchar *chars; gt_error_check(err); gt_assert(arguments); if (arguments->nomap) { GtEncseqMetadata *emd = gt_encseq_metadata_new(argv[parsed_args], err); if (!emd) had_err = -1; if (!had_err) { if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_version(emd)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_metadata_is64bit(emd) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_total_length(emd)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_sequences(emd)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_metadata_num_of_files(emd)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_metadata_min_seq_length(emd), gt_encseq_metadata_max_seq_length(emd)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_metadata_accesstype(emd))); alpha = gt_encseq_metadata_alphabet(emd); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), 
(char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } } gt_encseq_metadata_delete(emd); } else { GtEncseqLoader *encseq_loader; GtEncseq *encseq; encseq_loader = gt_encseq_loader_new(); if (arguments->mirror) gt_encseq_loader_mirror(encseq_loader); if (!(encseq = gt_encseq_loader_load(encseq_loader, argv[parsed_args], err))) had_err = -1; if (!had_err) { const GtStrArray *filenames; GtUword i; if (!arguments->noindexname) { gt_file_xprintf(arguments->outfp, "index name: "); gt_file_xprintf(arguments->outfp, "%s\n", argv[parsed_args]); } gt_file_xprintf(arguments->outfp, "file format version: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_version(encseq)); gt_file_xprintf(arguments->outfp, "64-bit file: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_is_64_bit(encseq) ? 
"yes" : "no"); gt_file_xprintf(arguments->outfp, "total length: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "compressed size: "); gt_file_xprintf(arguments->outfp, ""GT_WU" bytes\n", gt_encseq_sizeofrep(encseq)); gt_file_xprintf(arguments->outfp, "number of sequences: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_sequences(encseq)); gt_file_xprintf(arguments->outfp, "number of files: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_num_of_files(encseq)); gt_file_xprintf(arguments->outfp, "length of shortest/longest " "sequence: "); gt_file_xprintf(arguments->outfp, ""GT_WU"/"GT_WU"\n", gt_encseq_min_seq_length(encseq), gt_encseq_max_seq_length(encseq)); filenames = gt_encseq_filenames(encseq); gt_file_xprintf(arguments->outfp, "original filenames:\n"); for (i = 0; i < gt_str_array_size(filenames); i++) { gt_file_xprintf(arguments->outfp, "\t%s ("GT_WU" characters)\n", gt_str_array_get(filenames, i), (GtUword) gt_encseq_effective_filelength(encseq, i)); } alpha = gt_encseq_alphabet(encseq); chars = gt_alphabet_characters(alpha); gt_file_xprintf(arguments->outfp, "alphabet size: "); gt_file_xprintf(arguments->outfp, "%u\n", gt_alphabet_num_of_chars(alpha)); gt_file_xprintf(arguments->outfp, "alphabet characters: "); gt_file_xprintf(arguments->outfp, "%.*s", gt_alphabet_num_of_chars(alpha), (char*) chars); if (gt_alphabet_is_dna(alpha)) gt_file_xprintf(arguments->outfp, " (DNA)"); if (gt_alphabet_is_protein(alpha)) gt_file_xprintf(arguments->outfp, " (Protein)"); gt_file_xprintf(arguments->outfp, "\n"); if (arguments->show_alphabet) { GtStr *out = gt_str_new(); gt_alphabet_to_str(alpha, out); gt_file_xprintf(arguments->outfp, "alphabet definition:\n"); gt_file_xprintf(arguments->outfp, "%s\n", gt_str_get(out)); gt_str_delete(out); } gt_file_xprintf(arguments->outfp, "character distribution:\n"); for (i = 0; i < gt_alphabet_num_of_chars(alpha); i++) { GtUword cc; 
cc = gt_encseq_charcount(encseq, gt_alphabet_encode(alpha, chars[i])); gt_file_xprintf(arguments->outfp, "\t%c: "GT_WU" (%.2f%%)\n", (char) chars[i], cc, (cc /(double) (gt_encseq_total_length(encseq) - gt_encseq_num_of_sequences(encseq)+1))*100); } gt_file_xprintf(arguments->outfp, "number of wildcards: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_wildcards(encseq), gt_encseq_realwildcardranges(encseq)); gt_file_xprintf(arguments->outfp, "number of special characters: "); gt_file_xprintf(arguments->outfp, ""GT_WU" ("GT_WU" range(s))\n", gt_encseq_specialcharacters(encseq), gt_encseq_realspecialranges(encseq)); gt_file_xprintf(arguments->outfp, "length of longest non-special " "character stretch: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_lengthoflongestnonspecial(encseq)); gt_file_xprintf(arguments->outfp, "accesstype: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_access_type_str(gt_encseq_accesstype_get(encseq))); gt_file_xprintf(arguments->outfp, "bits used per character: "); gt_file_xprintf(arguments->outfp, "%f\n", (double) ((uint64_t) CHAR_BIT * (uint64_t) gt_encseq_sizeofrep(encseq)) / (double) gt_encseq_total_length(encseq)); gt_file_xprintf(arguments->outfp, "has special ranges: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_specialranges(encseq) ? "yes" : "no"); gt_file_xprintf(arguments->outfp, "has description support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_description_support(encseq) ? "yes" : "no"); if (gt_encseq_has_description_support(encseq)) { gt_file_xprintf(arguments->outfp, "length of longest description: "); gt_file_xprintf(arguments->outfp, ""GT_WU"\n", gt_encseq_max_desc_length(encseq)); } gt_file_xprintf(arguments->outfp, "has multiple sequence support: "); gt_file_xprintf(arguments->outfp, "%s\n", gt_encseq_has_multiseq_support(encseq) ? "yes" : "no"); } gt_encseq_delete(encseq); gt_encseq_loader_delete(encseq_loader); } return had_err; }