/* Build a sorted index of match start positions.
   For every match in <matches>, two GtMatchReference entries are written: one
   carrying the start of the match range on sequence 1 and one carrying the
   start of the range on sequence 2; both store the index of the match inside
   <matches>. The resulting array holds 2 * gt_array_size(matches) entries and
   is ordered with cmpmatchreferences. The buffer is allocated here with
   gt_calloc() and handed back to the caller (presumably the caller has to
   release it -- confirm at the call sites). */
static GtMatchReference* gt_mirror_and_sort_matches(GtArray *matches)
{
  const unsigned long num_matches = gt_array_size(matches);
  GtMatchReference *refs, *slot;
  unsigned long idx;

  refs = gt_calloc((size_t) (num_matches * 2), sizeof (GtMatchReference));

  /* <slot> walks over the output array, two entries per match */
  slot = refs;
  for (idx = 0; idx < num_matches; idx++) {
    GtMatch *current = *(GtMatch**) gt_array_get(matches, idx);
    GtRange first_range, second_range;

    gt_match_get_range_seq1(current, &first_range);
    gt_match_get_range_seq2(current, &second_range);

    slot->startpos = first_range.start;
    slot->matchnum = idx;
    slot++;

    slot->startpos = second_range.start;
    slot->matchnum = idx;
    slot++;
  }

  qsort(refs, (size_t) (2 * num_matches), sizeof (GtMatchReference),
        cmpmatchreferences);
  return refs;
}
/* Tool runner for the condenser search.
   If neither <blastn> nor <blastp> is set in the arguments, nothing is
   searched and 0 is returned. Otherwise the search runs in two stages:
     1. coarse: a BLAST database is built from "<dbpath>.fas" and the queries
        are searched against it; the numeric id parsed from every hit's
        database header is collected,
     2. fine: the collected unique sequences are extracted from the compressed
        database into "coarse_<db basename>.fas", a BLAST database is built
        from that file, the queries are searched again and every hit is
        written as one tab separated line to <arguments->outfp>.
   <tool_arguments> has to point to a GtCondenserSearchArguments object.
   Returns 0 on success and a non-zero value on failure, in which case <err>
   is expected to be set by the failing step. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath;
  GtStr *coarse_fname;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* only touch <arguments> after it has been asserted */
  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    /* number of hit slots allocated initially and added on every growth */
    const int hits_increment = 100;
    GtMatch *match;
    GtMatchIterator *mp = NULL;
    GtNREncseq *nrencseq = NULL;
    GtStr *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition *hits;
    double eval,
           raw_eval = 0.0;
    GtUword coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int curr_hits = 0,
        max_hits = hits_increment;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);
    gt_str_append_cstr(fastaname, ".fas");

    /* every slot owns its range, all slots are released during cleanup */
    for (i = 0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /* extract sequences from compressed database */
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }

    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S' */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          /* n * 2^-S'; multiplied with the subject length m later on */
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /* create BLAST database from compressed database fasta file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                                 gt_str_get(fastaname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                                 gt_str_get(fastaname), err);
    }

    /* perform coarse BLAST search and collect the ids of all hit uniques */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      /* NOTE(review): <ceval> is passed on unchanged even if it is
         GT_UNDEF_DOUBLE; <raw_eval> is only applied to the fine search below.
         Confirm that this is intended. */
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END) {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);

          /* header is expected to be up to 6 non-space chars followed by the
             numeric id of the unique */
          if (sscanf(dbseqid, "%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            curr_hits++;
            if (curr_hits == max_hits) {
              /* all slots used, add another chunk of slots */
              max_hits += hits_increment;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i = max_hits - hits_increment; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          }
          else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
          /* release the match on both paths; <dbseqid> may point into the
             match, so this must not happen before the error message is set */
          gt_match_delete(match);
        }
        else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }

    /* extract sequences */
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;

      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);

      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname), "w", err);
      if (coarse_hits == NULL) {
        /* output file could not be opened, do not hand NULL to the
           extraction */
        had_err = -1;
      }
      else {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length */
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded */
        gt_assert(had_err || coarse_db_len != 0);
        gt_file_delete(coarse_hits);
      }
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(
                                              gt_str_get(coarse_fname), err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(
                                              gt_str_get(coarse_fname), err);
    }

    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        /* scale the per-query value with the size of the extracted db */
        eval = raw_eval * coarse_db_len;
      }
      else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;

            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            /* one tab separated line per hit */
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          }
          else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);
    }

    if (!had_err && timer != NULL)
      gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /* cleanup */
    for (i = 0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }

  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
static int cluster_sequences(GtArray *matches, GtClusteredSet *cs, GtHashmap *seqdesc2seqnum, unsigned int psmall, unsigned int plarge, GtEncseq *encseq, GtError *err) { GtMatch *match; GtMatchEdgeTable matchedgetab; GtMatchEdge matchedge; GtRange rng_seq1, rng_seq2; int had_err = 0; unsigned long i, lsmall, llarge, matchlen1, matchlen2, num_of_seq, seqnum1 = 0, seqnum2 = 0; const char *seqid; num_of_seq = gt_encseq_num_of_sequences(encseq); gt_assert(matches && cs && seqdesc2seqnum && encseq); if (gt_clustered_set_num_of_elements(cs, err) != num_of_seq) { had_err = -1; gt_error_set(err, "number of sequences (%lu) unequals number of elements in" " clustered set (%lu)", num_of_seq, gt_clustered_set_num_of_elements(cs, err)); } if (!had_err) { matchedgetab.edges = gt_array_new(sizeof (GtMatchEdge)); matchedgetab.num_of_edges = 0; for (i = 0; i < gt_array_size(matches); i++) { match = *(GtMatch**) gt_array_get(matches, i); gt_match_get_range_seq1(match, &rng_seq1); gt_match_get_range_seq2(match, &rng_seq2); matchlen1 = gt_range_length(&rng_seq1); matchlen2 = gt_range_length(&rng_seq2); seqid = gt_match_get_seqid1(match); if (gt_hashmap_get(seqdesc2seqnum, (void*) seqid) != NULL) seqnum1 = ((unsigned long) gt_hashmap_get(seqdesc2seqnum, seqid)) - 1; else { had_err = -1; gt_error_set(err, "key %s not found", seqid); } seqid = gt_match_get_seqid2(match); if (!had_err && gt_hashmap_get(seqdesc2seqnum, (void*) seqid) != NULL) seqnum2 = ((unsigned long) gt_hashmap_get(seqdesc2seqnum, seqid)) - 1; else { had_err = -1; gt_error_set(err, "key %s not found", seqid); } if (!had_err) { if (gt_encseq_seqlength(encseq, seqnum1) > gt_encseq_seqlength(encseq, seqnum2)) { llarge = gt_encseq_seqlength(encseq, seqnum1); lsmall = gt_encseq_seqlength(encseq, seqnum2); } else { lsmall = gt_encseq_seqlength(encseq, seqnum1); llarge = gt_encseq_seqlength(encseq, seqnum2); } if (((llarge * plarge)/100 <= matchlen1) && ((lsmall * psmall)/100 <= matchlen1) && ((llarge * plarge)/100 <= 
matchlen2) && ((lsmall * psmall)/100 <= matchlen2)) { if (seqnum1 != seqnum2) { matchedge.matchnum0 = seqnum1; matchedge.matchnum1 = seqnum2; gt_array_add(matchedgetab.edges, matchedge); matchedgetab.num_of_edges++; } } } } } if (!had_err) if (gt_cluster_matches(cs, &matchedgetab, err) != 0) had_err = -1; if (!had_err) gt_array_delete(matchedgetab.edges); return had_err; }