/*
  Runs hmmsearch on the unique-sequence FASTA file obtained from <ces>.

  The child process is started through gt_safe_popen() with the argument
  vector
    <hmmsearch_path> --noali --notextw --domtblout <table_filename>
    <hmm_filename> <fasta file of ces>
  and an empty environment.

  Everything the child writes to the pipe is consumed; the lines are forwarded
  to gt_log_log() only when logging is enabled.

  Returns 0 on success and -1 if gt_safe_popen() did not return a pipe (<err>
  is handed to gt_safe_popen() for the error description).
*/
static int hmmsearch_call_coarse_search(GtCondenseq* ces,
                                        char *hmmsearch_path,
                                        char *table_filename,
                                        char *hmm_filename,
                                        GtLogger *logger,
                                        GtError *err) {
  int had_err = 0;
  char **hmmargs = NULL,
       *hmmenv[] = { NULL };
  GtStr *coarse_fas = gt_condenseq_unique_fasta_file(ces);
  GtSafePipe *pipe = NULL;
  gt_assert(coarse_fas != NULL);

  /* Array has to end with NULL; gt_calloc() zeroes the unused slots */
  hmmargs = gt_calloc((size_t) 8, sizeof (*hmmargs));
  hmmargs[0] = hmmsearch_path;
  hmmargs[1] = gt_cstr_dup("--noali");
  hmmargs[2] = gt_cstr_dup("--notextw");
  hmmargs[3] = gt_cstr_dup("--domtblout");
  hmmargs[4] = table_filename;
  hmmargs[5] = hmm_filename;
  hmmargs[6] = gt_str_get(coarse_fas);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  pipe = gt_safe_popen(hmmsearch_path, hmmargs, hmmenv, err);
  if (pipe == NULL)
    had_err = -1;

  /* only the duplicated option strings are owned by this function */
  gt_free(hmmargs[1]);
  gt_free(hmmargs[2]);
  gt_free(hmmargs[3]);
  gt_free(hmmargs);
  gt_str_delete(coarse_fas);

  /* pipe test for splint */
  if (!had_err && pipe != NULL) {
    GtStr *line = gt_str_new();
    /* Always read the pipe until gt_str_read_next_line() stops returning 0,
       not only when logging is on.
       NOTE(review): closing the pipe with unread output presumably either
       blocks the child on a full pipe or terminates it early -- confirm
       against gt_safe_pclose(). */
    while (gt_str_read_next_line(line, pipe->read_fd) == 0) {
      if (gt_log_enabled())
        gt_log_log("%s", gt_str_get(line));
      gt_str_reset(line);
    }
    gt_str_delete(line);
    (void) gt_safe_pclose(pipe);
  }
  return had_err;
}
/*
  Runs hmmsearch with <hmm_filename> on <fine_fasta_filename> and prints every
  line read from the child's pipe to stdout.

  If <table_filename> is not NULL, "--tblout <table_filename>" is inserted
  directly after the program name in the argument vector.

  Returns 0 on success and -1 if gt_safe_popen() returned no pipe.
*/
static int hmmsearch_call_fine_search(GtStr *table_filename,
                                      char *fine_fasta_filename,
                                      char *hmmsearch_path,
                                      char *hmm_filename,
                                      GtLogger *logger,
                                      GtError *err) {
  GtSafePipe *child = NULL;
  GtStr *outline;
  char **args = NULL,
       *env[] = { NULL };
  /* program, hmm file, fasta file and the terminating NULL; two more slots
     for "--tblout <file>" when a table is requested */
  const size_t slots = (table_filename != NULL) ? (size_t) 6 : (size_t) 4;
  unsigned int next = 0;

  args = gt_calloc(slots, sizeof (*args));
  args[next++] = hmmsearch_path;
  if (table_filename != NULL) {
    args[next++] = gt_cstr_dup("--tblout");
    args[next++] = gt_str_get(table_filename);
  }
  args[next++] = hmm_filename;
  args[next++] = fine_fasta_filename;
  /* the zero-initialised last slot terminates the vector */
  gt_assert(args[next] == NULL);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  child = gt_safe_popen(hmmsearch_path, args, env, err);

  /* args[1] is the only duplicated string, and only in the table case */
  if (table_filename != NULL)
    gt_free(args[1]);
  gt_free(args);

  if (child == NULL)
    return -1;

  outline = gt_str_new();
  while (gt_str_read_next_line(outline, child->read_fd) == 0) {
    printf("%s\n", gt_str_get(outline));
    gt_str_reset(outline);
  }
  gt_str_delete(outline);
  (void) gt_safe_pclose(child);
  return 0;
}
/*
  Tool runner for genomediff.

  The remaining command line arguments (argv[parsed_args..argc-1]) are added
  to arguments->filenames. With more than one file name the files are encoded
  as DNA into an encoded sequence named arguments->indexname; otherwise the
  single name is appended to arguments->indexname and, if an ESA or packed
  index is requested, the project file is scanned for a "mirrored=1" line.

  Afterwards the encoded sequence is loaded, the unit information is set up
  (optionally read from arguments->unitfile), the shulen sums are computed
  and handed to gt_genomediff_kr_calc().

  Returns 0 on success, a non-zero value otherwise.
*/
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
  bool mirrored = false;
  int had_err = 0,
      i;
  GtEncseq *encseq = NULL;
  GtGenomediffArguments *arguments = tool_arguments;
  GtLogger *logger;
  GtShuUnitFileInfo *unit_info = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    /* check the timer before it is used for the first time */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "start shu search", stdout);

  if (gt_str_array_size(arguments->filenames) > 1UL) {
    GtEncseqEncoder *ee = gt_encseq_encoder_new();
    gt_encseq_encoder_set_timer(ee, timer);
    gt_encseq_encoder_set_logger(ee, logger);
    /* kr only makes sense for dna, so we can check this already with ee */
    gt_encseq_encoder_set_input_dna(ee);
    had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                       gt_str_get(arguments->indexname), err);
    gt_encseq_encoder_delete(ee);
  }
  else {
    gt_str_append_str(arguments->indexname,
                      gt_str_array_get_str(arguments->filenames, 0));
    if (arguments->with_esa || arguments->with_pck) {
      GtStr *current_line = gt_str_new();
      FILE *prj_fp;
      const char *buffer;
      char **elements = NULL;

      prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                       GT_PROJECTFILESUFFIX,"rb",err);
      if (prj_fp == NULL)
        had_err = -1;
      while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
        buffer = gt_str_get(current_line);
        /* release the key/value pair of the previous line */
        if (elements != NULL) {
          gt_free(elements[0]);
          gt_free(elements[1]);
        }
        gt_free(elements);
        elements = gt_cstr_split(buffer, '=');
        /* A line without '=' (or an empty line) may leave key or value
           unset; never hand NULL to strcmp().
           NOTE(review): assumes gt_cstr_split() stores NULL in unused
           slots -- confirm against its implementation. */
        if (elements[0] != NULL) {
          gt_log_log("%s", elements[0]);
          if (strcmp("mirrored", elements[0]) == 0 && elements[1] != NULL) {
            gt_log_log("%s", elements[1]);
            if (strcmp("1", elements[1]) == 0) {
              mirrored = true;
              gt_log_log("sequences are treated as mirrored");
            }
          }
        }
        gt_str_reset(current_line);
      }
      gt_str_delete(current_line);
      if (elements != NULL) {
        gt_free(elements[0]);
        gt_free(elements[1]);
      }
      gt_free(elements);
      gt_fa_xfclose(prj_fp);
    }
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    if (mirrored)
      gt_encseq_loader_mirror(el);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (!had_err) {
    uint64_t **shusums = NULL;
    if (arguments->with_esa || arguments->with_pck) {
      shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                         logger, timer, err);
      if (shusums == NULL)
        had_err = -1;
    }
    else {
      const bool doesa = true;
      GenomediffInfo gd_info;
      Suffixeratoroptions sopts;
      sopts.beverbose = arguments->verbose;
      sopts.indexname = arguments->indexname;
      sopts.db = NULL;
      sopts.encopts = NULL;
      sopts.genomediff = true;
      sopts.inputindex = arguments->indexname;
      sopts.loadopts = arguments->loadopts;
      sopts.showprogress = false;
      sopts.idxopts = arguments->idxopts;

      gt_assert(unit_info != NULL);
      gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                          unit_info->num_of_genomes);
      gd_info.shulensums = shusums;
      gd_info.unit_info = unit_info;
      had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
    }
    if (!had_err && shusums != NULL) {
      had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                      arguments->with_pck, logger, timer, err);
    }
    /* release the table on every path, including a failed
       gt_runsuffixerator() run, which previously leaked it */
    if (shusums != NULL) {
      gt_array2dim_delete(shusums);
    }
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);

  return had_err;
}
/*
  Processes the hit table <table_filename> written by the coarse search.

  Lines starting with '#' are skipped. All other lines are split at blanks;
  column 0 is parsed as the target's numeric id and column 3 is taken as the
  query name. For every hit gt_condenseq_each_redundant_seq() is called with
  hmmsearch_process_seq and the <sequences> tree.

  Each time the query name changes a counter is incremented; when it reaches
  arguments->max_queries, hmmsearch_create_fine_fas() and
  hmmsearch_call_fine_search() are run on what has been collected so far, the
  tree is cleared and the counter restarts. After the last line one final
  fine search is run. If arguments->outtable_filename is non-empty, a running
  number is appended to it for each fine search.

  Returns 0 on success and -1 on error, e.g. a line with too few columns, a
  target id that cannot be parsed, or a failing fine search.
*/
static int hmmsearch_process_coarse_hits(
                                       char *table_filename,
                                       GtCondenseq *ces,
                                       GtCondenseqHmmsearchArguments *arguments,
                                       GtLogger *logger,
                                       GtError *err) {
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length = gt_str_length(arguments->outtable_filename);

  table = gt_xfopen(table_filename, "r");

  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid;
    const GtUword target_column = 0,
          query_column = (GtUword) 3;

    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');
      /* The table is external input: make sure the columns read below exist
         even when assertions are compiled out. */
      if (gt_splitter_size(splitter) <= query_column) {
        gt_error_set(err, "line in hmmsearch table %s has too few columns",
                     table_filename);
        had_err = -1;
      }
      else {
        gt_assert(gt_splitter_size(splitter) == (GtUword) 23);
        if (sscanf(gt_splitter_get_token(splitter, target_column),
                   GT_WU, &uid) != 1) {
          gt_error_set(err, "couldn't parse target number: %s",
                       gt_splitter_get_token(splitter, target_column));
          had_err = -1;
        }
        if (gt_str_length(query) == 0 ||
            strcmp(gt_str_get(query),
                   gt_splitter_get_token(splitter, query_column)) != 0) {
          gt_str_set(query, gt_splitter_get_token(splitter, query_column));
          gt_logger_log(logger, "new query: %s", gt_str_get(query));
          querycount++;
        }
      }
      if (!had_err && querycount == arguments->max_queries) {
        hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
        if (table_name_length != 0)
          gt_str_append_uword(arguments->outtable_filename, filecount++);
        had_err =
          hmmsearch_call_fine_search(table_name_length != 0 ?
                                       arguments->outtable_filename :
                                       NULL,
                                     gt_str_get(fine_fasta_filename),
                                     gt_str_get(arguments->hmmsearch_path),
                                     gt_str_get(arguments->hmm),
                                     logger, err);
        gt_rbtree_clear(sequences);
        /* cut both names back to their base form for the next batch */
        gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
        if (table_name_length != 0)
          gt_str_set_length(arguments->outtable_filename, table_name_length);
        querycount = 0;
      }
      if (!had_err) {
        /* a return value of 0 is treated as failure */
        if (gt_condenseq_each_redundant_seq(ces, uid,
                                            hmmsearch_process_seq,
                                            sequences, err) == 0) {
          had_err = -1;
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  if (!had_err) {
    /* final fine search on whatever is left in the tree */
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                   arguments->outtable_filename :
                                   NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
/*
  Reads the key/value lines of a project file from <fpin> and stores the
  values needed for <suffixarray>.

  Lines beginning with DBFILEKEY are skipped; every other line is passed to
  gt_scannedprjkey_analyze(). After all registered keys have been checked by
  gt_scannedprjkey_allkeysdefined(), the integer size, the byte order, the
  read mode and the mirroring flag are validated.

  <indexname> is only used for messages. Returns 0 on success and -1 on
  error, in which case <err> is set.
*/
static int scanprjfileuintkeysviafileptr(Suffixarray *suffixarray,
                                         const char *indexname,
                                         GtLogger *logger,
                                         FILE *fpin,
                                         GtError *err)
{
  uint32_t integersize, littleendian, readmodeint, mirrored;
  unsigned int linenum = 0;
  const size_t dbfilelen = strlen(DBFILEKEY);
  bool haserr = false;
  GtScannedprjkeytable *scannedprjkeytable;
  GtStr *linebuf;
  /* the following five variables are local as the parsed values are
     not required: they are determined by reading the encseq */
  GtSpecialcharinfo specialcharinfo;
  GtUword totallength,
          numofsequences,
          numofdbsequences,
          numofquerysequences;

  gt_error_check(err);
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("totallength",&totallength,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo.specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",
                       &specialcharinfo.specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",
                       &specialcharinfo.realspecialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo.lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo.lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",
                       &specialcharinfo.wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",
                       &specialcharinfo.wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo.realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo.lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo.lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("numofsequences",&numofsequences,NULL);
  GT_SCANNEDPRJKEY_ADD("numofdbsequences",&numofdbsequences,NULL);
  gt_scannedprjkey_add(scannedprjkeytable,"numofquerysequences",
                       &numofquerysequences,0,false,NULL);
  GT_SCANNEDPRJKEY_ADD("numberofallsortedsuffixes",
                       &suffixarray->numberofallsortedsuffixes,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&suffixarray->longest.valueunsignedlong,
                       &suffixarray->longest.defined);
  GT_SCANNEDPRJKEY_ADD("prefixlength",&suffixarray->prefixlength,NULL);
  GT_SCANNEDPRJKEY_ADD("largelcpvalues",
                       &suffixarray->numoflargelcpvalues.valueunsignedlong,
                       &suffixarray->numoflargelcpvalues.defined);
  gt_scannedprjkey_add(scannedprjkeytable,"averagelcp",
                       &suffixarray->averagelcp.valuedouble,
                       sizeof (suffixarray->averagelcp.valuedouble),
                       true,
                       &suffixarray->averagelcp.defined);
  GT_SCANNEDPRJKEY_ADD("maxbranchdepth",
                       &suffixarray->maxbranchdepth.valueunsignedlong,
                       &suffixarray->maxbranchdepth.defined);
  GT_SCANNEDPRJKEY_ADD("integersize",&integersize,NULL);
  GT_SCANNEDPRJKEY_ADD("littleendian",&littleendian,NULL);
  GT_SCANNEDPRJKEY_ADD("readmode",&readmodeint,NULL);
  GT_SCANNEDPRJKEY_ADD("mirrored",&mirrored,NULL);

  linebuf = gt_str_new();
  while (gt_str_read_next_line(linebuf, fpin) != EOF)
  {
    const GtUword linelen = gt_str_length(linebuf);
    /* lines naming the database files are not key/value lines */
    const bool isdbfileline
      = dbfilelen <= (size_t) linelen &&
        memcmp(DBFILEKEY,gt_str_get(linebuf),dbfilelen) == 0;

    if (!isdbfileline &&
        gt_scannedprjkey_analyze(indexname,
                                 GT_PROJECTFILESUFFIX,
                                 linenum,
                                 gt_str_get(linebuf),
                                 linelen,
                                 scannedprjkeytable,
                                 err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(linebuf);
    linenum++;
  }
  gt_str_delete(linebuf);

  if (!haserr && gt_scannedprjkey_allkeysdefined(indexname,
                                                 GT_PROJECTFILESUFFIX,
                                                 scannedprjkeytable,
                                                 logger,err) != 0)
  {
    haserr = true;
  }
  if (!haserr)
  {
    if (integersize != (uint32_t) 32 && integersize != (uint32_t) 64)
    {
      gt_error_set(err,"%s%s contains illegal line defining the integer size",
                   indexname,GT_PROJECTFILESUFFIX);
      haserr = true;
    } else if (integersize != (uint32_t) (sizeof (GtUword) * CHAR_BIT))
    {
      gt_error_set(err,"index was generated for %u-bit integers while "
                       "this program uses %u-bit integers",
                       (unsigned int) integersize,
                       (unsigned int) (sizeof (GtUword) * CHAR_BIT));
      haserr = true;
    }
  }
  if (!haserr)
  {
    const bool indexislittle = (littleendian == (uint32_t) 1);

    if (gt_is_little_endian())
    {
      if (!indexislittle)
      {
        gt_error_set(err,"computer has little endian byte order, while index "
                         "was built on computer with big endian byte order");
        haserr = true;
      }
    } else if (indexislittle)
    {
      gt_error_set(err,"computer has big endian byte order, while index "
                       "was built on computer with little endian byte "
                       "order");
      haserr = true;
    }
  }
  if (!haserr)
  {
    /* the value is stored even if it turns out to be out of range */
    suffixarray->readmode = (GtReadmode) readmodeint;
    if (readmodeint > (uint32_t) 3)
    {
      gt_error_set(err,"illegal readmode %u",(unsigned int) readmodeint);
      haserr = true;
    }
  }
  if (!haserr)
  {
    suffixarray->mirroredencseq = (mirrored == (uint32_t) 1);
    if (mirrored > (uint32_t) 1)
    {
      gt_error_set(err,"illegal mirroring flag: only 0(=no mirroring) and "
                       "1 (=mirroring) is supported, but read %u",
                       (unsigned int) mirrored);
      haserr = true;
    }
  }
  gt_scannedprjkeytable_delete(scannedprjkeytable);
  return haserr ? -1 : 0;
}
/*
  Reads the key/value lines of an FM-index description from <fpin>.

  The parsed values are written to <fmindex> and <specialcharinfo>; the
  "storeindexpos" value, which must be 0 or 1, is delivered as a bool in
  <storeindexpos>. <indexname> is only used for messages.

  Returns 0 on success and -1 on error, in which case <err> is set.
*/
static int scanfmafileviafileptr(Fmindex *fmindex,
                                 GtSpecialcharinfo *specialcharinfo,
                                 bool *storeindexpos,
                                 const char *indexname,
                                 FILE *fpin,
                                 GtLogger *logger,
                                 GtError *err)
{
  bool haserr = false;
  GtScannedprjkeytable *scannedprjkeytable;
  unsigned int intstoreindexpos;
  unsigned int linenum = 0;
  GtStr *linebuf;

  gt_error_check(err);
  scannedprjkeytable = gt_scannedprjkeytable_new();
  GT_SCANNEDPRJKEY_ADD("bwtlength",&fmindex->bwtlength,NULL);
  GT_SCANNEDPRJKEY_ADD("longest",&fmindex->longestsuffixpos,NULL);
  GT_SCANNEDPRJKEY_ADD("storeindexpos",&intstoreindexpos,NULL);
  GT_SCANNEDPRJKEY_ADD("log2blocksize",&fmindex->log2bsize,NULL);
  GT_SCANNEDPRJKEY_ADD("log2markdist",&fmindex->log2markdist,NULL);
  GT_SCANNEDPRJKEY_ADD("specialcharacters",
                       &specialcharinfo->specialcharacters,NULL);
  GT_SCANNEDPRJKEY_ADD("specialranges",&specialcharinfo->specialranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realspecialranges",&specialcharinfo->realspecialranges,
                       NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialprefix",
                       &specialcharinfo->lengthofspecialprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofspecialsuffix",
                       &specialcharinfo->lengthofspecialsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcards",&specialcharinfo->wildcards,NULL);
  GT_SCANNEDPRJKEY_ADD("wildcardranges",&specialcharinfo->wildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("realwildcardranges",
                       &specialcharinfo->realwildcardranges,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardprefix",
                       &specialcharinfo->lengthofwildcardprefix,NULL);
  GT_SCANNEDPRJKEY_ADD("lengthofwildcardsuffix",
                       &specialcharinfo->lengthofwildcardsuffix,NULL);
  GT_SCANNEDPRJKEY_ADD("suffixlength",&fmindex->suffixlength,NULL);

  linebuf = gt_str_new();
  while (gt_str_read_next_line(linebuf, fpin) != EOF)
  {
    if (gt_scannedprjkey_analyze(indexname,
                                 FMASCIIFILESUFFIX,
                                 linenum,
                                 gt_str_get(linebuf),
                                 gt_str_length(linebuf),
                                 scannedprjkeytable,
                                 err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(linebuf);
    linenum++;
  }
  gt_str_delete(linebuf);

  if (!haserr && gt_scannedprjkey_allkeysdefined(indexname,
                                                 FMASCIIFILESUFFIX,
                                                 scannedprjkeytable,
                                                 logger,err) != 0)
  {
    haserr = true;
  }
  if (!haserr)
  {
    /* translate the parsed integer flag into the bool result */
    switch (intstoreindexpos)
    {
      case 1U:
        *storeindexpos = true;
        break;
      case 0:
        *storeindexpos = false;
        break;
      default:
        gt_error_set(err,"illegal value in line matching \"storeindexpos=\"");
        haserr = true;
        break;
    }
  }
  gt_scannedprjkeytable_delete(scannedprjkeytable);
  return haserr ? -1 : 0;
}
/*
  Reads the key/value lines of a project file from <fpin> and stores the
  values needed for <suffixarray>.

  Lines beginning with DBFILEKEY are skipped; every other line is passed to
  analyzeuintline(). After allkeysdefined() has checked the registered keys,
  the integer size, the byte order and the read mode are validated.

  <indexname> is only used for messages. Returns 0 on success and -1 on
  error, in which case <err> is set.
*/
static int scanprjfileuintkeysviafileptr(Suffixarray *suffixarray,
                                         const GtStr *indexname,
                                         Verboseinfo *verboseinfo,
                                         FILE *fpin,
                                         GtError *err)
{
  uint32_t integersize, littleendian, readmodeint;
  unsigned int linenum = 0;
  DefinedSeqpos maxbranchdepth;
  const size_t dbfilelen = strlen(DBFILEKEY);
  bool haserr = false;
  GtArray *riktab;
  GtStr *linebuf;
  /* the following five variables are local as the parsed values are
     not required: they are determined by reading the encodedsequence */
  Seqpos totallength;
  Specialcharinfo specialcharinfo;
  unsigned long numofsequences,
                numofdbsequences,
                numofquerysequences;

  gt_error_check(err);
  riktab = gt_array_new(sizeofReadintkeys());
  SETREADINTKEYS("totallength",&totallength,NULL);
  SETREADINTKEYS("specialcharacters",
                 &specialcharinfo.specialcharacters,NULL);
  SETREADINTKEYS("specialranges",
                 &specialcharinfo.specialranges,NULL);
  SETREADINTKEYS("realspecialranges",
                 &specialcharinfo.realspecialranges,NULL);
  SETREADINTKEYS("lengthofspecialprefix",
                 &specialcharinfo.lengthofspecialprefix,NULL);
  SETREADINTKEYS("lengthofspecialsuffix",
                 &specialcharinfo.lengthofspecialsuffix,NULL);
  SETREADINTKEYS("numofsequences",&numofsequences,NULL);
  SETREADINTKEYS("numofdbsequences",&numofdbsequences,NULL);
  setreadintkeys(riktab,"numofquerysequences",&numofquerysequences,0,NULL);
  SETREADINTKEYS("longest",&suffixarray->longest.valueseqpos,
                           &suffixarray->longest.defined);
  SETREADINTKEYS("prefixlength",&suffixarray->prefixlength,NULL);
  SETREADINTKEYS("largelcpvalues",
                 &suffixarray->numoflargelcpvalues.valueseqpos,
                 &suffixarray->numoflargelcpvalues.defined);
  SETREADINTKEYS("maxbranchdepth",&maxbranchdepth.valueseqpos,
                 &maxbranchdepth.defined);
  SETREADINTKEYS("integersize",&integersize,NULL);
  SETREADINTKEYS("littleendian",&littleendian,NULL);
  SETREADINTKEYS("readmode",&readmodeint,NULL);

  linebuf = gt_str_new();
  while (gt_str_read_next_line(linebuf, fpin) != EOF)
  {
    const unsigned long linelen = gt_str_length(linebuf);
    /* lines naming the database files are not key/value lines */
    const bool isdbfileline
      = dbfilelen <= (size_t) linelen &&
        memcmp(DBFILEKEY,gt_str_get(linebuf),dbfilelen) == 0;

    if (!isdbfileline &&
        analyzeuintline(indexname,
                        PROJECTFILESUFFIX,
                        linenum,
                        gt_str_get(linebuf),
                        linelen,
                        riktab,
                        err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(linebuf);
    linenum++;
  }
  gt_str_delete(linebuf);

  if (!haserr && allkeysdefined(indexname,PROJECTFILESUFFIX,riktab,
                                verboseinfo,err) != 0)
  {
    haserr = true;
  }
  if (!haserr)
  {
    if (integersize != (uint32_t) 32 && integersize != (uint32_t) 64)
    {
      gt_error_set(err,"%s%s contains illegal line defining the integer size",
                   gt_str_get(indexname),PROJECTFILESUFFIX);
      haserr = true;
    } else if (integersize != (uint32_t) (sizeof (Seqpos) * CHAR_BIT))
    {
      gt_error_set(err,"index was generated for %u-bit integers while "
                       "this program uses %u-bit integers",
                       (unsigned int) integersize,
                       (unsigned int) (sizeof (Seqpos) * CHAR_BIT));
      haserr = true;
    }
  }
  if (!haserr)
  {
    const bool indexislittle = (littleendian == (uint32_t) 1);

    if (gt_is_little_endian())
    {
      if (!indexislittle)
      {
        gt_error_set(err,"computer has little endian byte order, while index "
                         "was build on computer with big endian byte order");
        haserr = true;
      }
    } else if (indexislittle)
    {
      gt_error_set(err,"computer has big endian byte order, while index "
                       "was build on computer with little endian byte "
                       "order");
      haserr = true;
    }
  }
  if (!haserr)
  {
    /* the value is stored even if it turns out to be out of range */
    suffixarray->readmode = (Readmode) readmodeint;
    if (readmodeint > (uint32_t) 3)
    {
      gt_error_set(err,"illegal readmode %u",(unsigned int) readmodeint);
      haserr = true;
    }
  }
  gt_array_delete(riktab);
  return haserr ? -1 : 0;
}
/*
  Looks up every key listed in <fileofkeystoextract> (one query per line) in
  <keytab> and writes each sequence that is found to stdout via
  giextract_encodedseq2fasta(), using <linewidth> for formatting.

  A lookup result smaller than <numofkeys> counts as a hit; all other
  queries are counted and, if the run succeeds, reported in a final comment
  line. <linewidth> must not be 0.

  Returns 0 on success and -1 on error.
*/
static int itersearchoverallkeys(const GtEncseq *encseq,
                                 const char *keytab,
                                 unsigned long numofkeys,
                                 unsigned long keysize,
                                 const GtStr *fileofkeystoextract,
                                 unsigned long linewidth,
                                 GtError *err)
{
  FILE *keyfile;
  GtStr *linebuf;
  uint64_t lineno = 0;
  unsigned long notfound = 0;
  bool failed = false;
  Fastakeyquery query;

  if (linewidth == 0)
  {
    gt_error_set(err,"use option width to specify line width for formatting");
    return -1;
  }
  keyfile = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (keyfile == NULL)
  {
    return -1;
  }
  linebuf = gt_str_new();
  /* one key buffer is reused for all queries */
  query.fastakey = gt_malloc(sizeof (char) * (keysize+1));
  while (gt_str_read_next_line(linebuf, keyfile) != EOF)
  {
    unsigned long hit;

    if (extractkeyfromcurrentline(&query,
                                  keysize,
                                  linebuf,
                                  lineno,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      failed = true;
      break;
    }
    hit = searchfastaqueryindes(query.fastakey,keytab,numofkeys,keysize);
    if (hit >= numofkeys)
    {
      notfound++;
    } else if (giextract_encodedseq2fasta(stdout,
                                          encseq,
                                          hit,
                                          &query,
                                          linewidth,
                                          err) != 0)
    {
      failed = true;
      break;
    }
    gt_str_reset(linebuf);
    lineno++;
  }
  if (!failed && notfound > 0)
  {
    printf("# number of unsatified fastakey-queries: %lu\n",notfound);
  }
  gt_str_delete(linebuf);
  gt_fa_fclose(keyfile);
  gt_free(query.fastakey);
  return failed ? -1 : 0;
}
/*
  Extracts the key from every line of the description table of <indexname>
  (file suffix GT_DESTABFILESUFFIX).

  All keys must be non-empty and of the same length, which must not exceed
  CHAR_MAX.

  If <sortkeys> is false, the key length followed by all keys (each
  terminated by '\0') is written to the file with suffix
  GT_KEYSTABFILESUFFIX; the keys must then be in strictly increasing
  lexicographic order.

  If <sortkeys> is true, the keys (at most MAXFIXEDKEYSIZE characters) are
  collected together with their line number, sorted with compareFixedkeys,
  and the corresponding sequences of the encoded sequence are written to
  stdout in that order. The number of description lines has to match the
  number of sequences of the encoded sequence.

  Returns 0 on success and -1 on error, in which case <err> is set.
*/
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0;
       !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key fixes the length all later keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                         (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;
        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                           (int) keylen,(int) keylen,keyptr,keylen,
                           MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                         (int) keylen,(int) keylen,keyptr,keylen,
                         constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                         previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* keytab holds exactly numofentries slots: refuse to write past its
         end if the description table has more lines than the encoded
         sequence has sequences */
      if (keytabptr >= keytab + numofentries)
      {
        gt_error_set(err,"%s%s contains more keys than the index has "
                         "sequences (%lu)",
                         indexname,GT_DESTABFILESUFFIX,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* strncpy() does not terminate here, hence the explicit '\0' */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys && keytab != NULL &&
      keytabptr != keytab + numofentries)
  {
    /* sorting a partially filled table would read uninitialised entries */
    gt_error_set(err,"%s%s contains fewer keys (%lu) than the index has "
                     "sequences (%lu)",
                     indexname,GT_DESTABFILESUFFIX,
                     (unsigned long) (keytabptr - keytab),numofentries);
    haserr = true;
  }
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr != NULL);
    gt_assert(numofentries > 0);
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    gt_assert(keytabptr != NULL);
    for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
         keytabptr++)
    {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
/*
  Reads all queries from <fileofkeystoextract> (one per line), sorts them
  with comparefastakeys and removes duplicates.

  On success the table of queries is returned and <*numofqueries> holds the
  number of entries left after duplicate removal. On error (empty file, file
  cannot be opened, a line cannot be parsed) NULL is returned; <err> is set
  for the empty-file case and is otherwise handed to the failing helper.

  If <verbose> is true, progress lines are printed to stdout.
*/
static Fastakeyquery *readfileofkeystoextract(bool verbose,
                                              unsigned long *numofqueries,
                                              const GtStr *fileofkeystoextract,
                                              GtError *err)
{
  FILE *fp;
  GtStr *currentline;
  bool haserr = false;
  uint64_t linenum;
  Fastakeyquery *fastakeyqueries;
#undef SKDEBUG
#ifdef SKDEBUG
  unsigned long i;
#endif

  gt_error_check(err);
  *numofqueries = gt_file_number_of_lines(gt_str_get(fileofkeystoextract));
  if (*numofqueries == 0)
  {
    gt_error_set(err,"empty file \"%s\" not allowed",
                 gt_str_get(fileofkeystoextract));
    return NULL;
  }
  fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (fp == NULL)
  {
    return NULL;
  }
  if (verbose)
  {
    printf("# opened keyfile \"%s\"\n",gt_str_get(fileofkeystoextract));
  }
  /* Zero-initialise the table: if parsing stops early, the cleanup below is
     called with the full count and must not see indeterminate entries. */
  fastakeyqueries = gt_calloc((size_t) *numofqueries,
                              sizeof (*fastakeyqueries));
  currentline = gt_str_new();
  for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++)
  {
    if (extractkeyfromcurrentline(fastakeyqueries + linenum,
                                  0,
                                  currentline,
                                  linenum,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(currentline);
  }
  gt_str_delete(currentline);
  gt_fa_fclose(fp);
  if (haserr)
  {
    fastakeyqueries_delete(fastakeyqueries,*numofqueries);
    return NULL;
  }
  qsort(fastakeyqueries,(size_t) *numofqueries,sizeof (*fastakeyqueries),
        comparefastakeys);
  if (verbose)
  {
    printf("# %lu fastakey-queries successfully parsed and sorted\n",
            *numofqueries);
  }
  *numofqueries = remdupsfastakeyqueries(fastakeyqueries,*numofqueries,verbose);
#ifdef SKDEBUG
  for (i=0; i<*numofqueries; i++)
  {
    printf("%lu %s\n",i,fastakeyqueries[i].fastakey);
  }
#endif
  return fastakeyqueries;
}