/*
 * Tool entry point: maps every file named after the parsed options into
 * memory and reads it byte by byte while a progress bar is shown.
 *
 * Files of size zero and files that are not regular files are reported on
 * stdout and skipped. All bytes read are OR-ed together; if the result is
 * zero a corresponding note is printed at the end.
 *
 * Returns -1 if the option parser reports an error, 0 otherwise (this
 * includes the case that the option parser requests an exit).
 */
int gt_mmapandread(int argc, const char **argv, GtError *err)
{
  int i, fd, parsed_args;
  void *map;
  struct stat sb;
  unsigned long long j;
  unsigned int byte = 0;
  gt_error_check(err);

  /* option parsing */
  switch (parse_options(&parsed_args, argc, argv, err)) {
    case GT_OPTION_PARSER_OK: break;
    case GT_OPTION_PARSER_ERROR: return -1;
    case GT_OPTION_PARSER_REQUESTS_EXIT: return 0;
  }

  /* iterate over all files */
  for (i = parsed_args; i < argc; i++) {
    /* open file */
    fd = gt_xopen(argv[i], O_RDONLY, 0);

    /* get file statistics */
    gt_xfstat(fd, &sb);

    if (sb.st_size == 0)
      printf("file \"%s\" is empty\n", argv[i]);
    /* S_ISREG() compares the complete file type field; a plain bit test with
       S_IFREG also matches other types that share this bit */
    else if (!S_ISREG(sb.st_mode))
      printf("\"%s\" is not a regular file\n", argv[i]);
    else {
      /* map file */
      map = gt_xmmap(0, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);

      /* read file; j doubles as the counter observed by the progress bar */
      printf("reading file \"%s\"\n", argv[i]);
      j = 0;
      gt_progressbar_start(&j, (unsigned long long) sb.st_size);
      for (; j < (unsigned long long) sb.st_size; j++)
        byte |= (unsigned int) ((char*) map)[j];
      gt_progressbar_stop();

      /* unmap file */
      gt_xmunmap(map, sb.st_size);
    }

    /* close file */
    gt_xclose(fd);
  }

  if (!byte)
    printf("all read files contained only null characters\n");

  return 0;
}
/*
 * Sets up a ContfindBUstate for the encoded sequence behind <ssar> and runs a
 * bottom-up traversal over it.
 *
 * If <read_length> is 0 the state is prepared by
 * prepare_sspbittab_and_shortest() and gt_esa_bottomup_rdjcv() is used;
 * otherwise <read_length> is taken as the shortest length, read_length + 1 as
 * the spacing, and gt_esa_bottomup_rdjce() is used.
 *
 * <contained> is stored in the state for use by the traversal. If
 * <show_progressbar> is set a progress bar over the total length is shown.
 *
 * Returns the value of the state's counter after the traversal.
 */
unsigned long gt_contfind_bottomup(Sequentialsuffixarrayreader *ssar,
                                   bool show_progressbar,
                                   GtBitsequence *contained,
                                   unsigned long firstrevcompl,
                                   unsigned long read_length /* 0 = variable */)
{
  ContfindBUstate bu;
  unsigned long seqlen_total;
  GT_UNUSED int rc;

  gt_assert(ssar != NULL);
  gt_assert(contained != NULL);

  bu.contained = contained;
  bu.encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  seqlen_total = gt_encseq_total_length(bu.encseq);
  bu.nofsequences = gt_encseq_num_of_sequences(bu.encseq);

  if (read_length != 0) {
    /* fixed read length */
    bu.shortest = read_length;
    bu.spacing = read_length + 1;
  }
  else {
    /* variable read length */
    prepare_sspbittab_and_shortest(seqlen_total, &bu);
  }

  bu.show_progressbar = show_progressbar;
  bu.csize = 0;
  bu.cmin = 0;
  bu.firstrevcompl = firstrevcompl;
  bu.counter = 0;

  if (show_progressbar) {
    bu.progress = 0;
    gt_progressbar_start(&(bu.progress), (unsigned long long)seqlen_total);
  }

  if (read_length == 0)
    rc = gt_esa_bottomup_rdjcv(ssar, &bu, NULL);
  else
    rc = gt_esa_bottomup_rdjce(ssar, &bu, NULL);
  gt_assert(rc == 0);

  if (show_progressbar)
    gt_progressbar_stop();

  /* the bit table is only released in the variable-length case */
  if (read_length == 0)
    gt_free(bu.sspbittab);

  return bu.counter;
}
static inline void rdj_pairwise_generic(bool use_dp, GtOvlfindMode m, GtEncseq *encseq, bool revcompl, bool show_progressbar, bool use_kmp, double max_error, GtUword min_length, bool find_nonmaximal, GtSpmproc proc, GtSpmprocA proc_a, void* procdata, bool cntfilter, GtBitsequence *cntreads_in, GtBitsequence **cntreads_out, GtUword *nofreads) { GtContfind containment_status; GtBitsequence *cntreads = NULL; GtUint64 progress = 0; GtUword i, j, startpos, v_seqnum, nofsequences, n; struct Read u, v; struct Data d; gt_kmp_t** kmp_values = NULL; GT_RDJ_PAIRWISE_INIT_STRUCT_DATA(d, proc, proc_a, procdata, &u, &v, 0); gt_assert(encseq != NULL); d.mode = m; if ((m == GT_OVLFIND_ALL) && cntfilter) d.mode = GT_OVLFIND_PROPER_SPM; n = gt_encseq_num_of_sequences(encseq); if (use_kmp) kmp_values = prepare_kmp_values(encseq, n); nofsequences = n; if (revcompl) n = n >> 1; if (cntreads_in != NULL) cntreads = cntreads_in; else if (m != GT_OVLFIND_SPM) GT_INITBITTAB(cntreads, n); if (show_progressbar) gt_progressbar_start(&progress, (GtUint64)n * ((GtUint64)n - 1ULL) / 2ULL); for (i = 0; i < n; i++) { u.seqnum = i; u.direct = true; u.len = gt_encseq_seqlength(encseq, i); u.seq = gt_malloc(sizeof (char) * (u.len + 1)); startpos = gt_encseq_seqstartpos(encseq, i); gt_encseq_extract_decoded(encseq, u.seq, startpos, startpos + u.len - 1); u.seq[u.len] = '\0'; if (use_kmp) { gt_assert(kmp_values != NULL); u.pi = kmp_values[i]; } for (j = i; j < n; j++) { if (cntfilter) { gt_assert(cntreads != NULL); if ((bool)GT_ISIBITSET(cntreads, i)) break; if ((bool)GT_ISIBITSET(cntreads, j)) continue; } v.seqnum = j; /* find overlaps using direct v */ v.direct = true; v.len = gt_encseq_seqlength(encseq, j); v.seq = gt_malloc(sizeof (char) * (v.len + 1)); startpos = gt_encseq_seqstartpos(encseq, j); gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1); v.seq[v.len] = '\0'; if (use_kmp) { gt_assert(kmp_values != NULL); v.pi = kmp_values[j]; } containment_status = use_dp ? 
find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal); if (m != GT_OVLFIND_SPM) mark_contained(containment_status, u.seqnum, v.seqnum, cntreads); /* find overlaps using reverse complement of v */ if (revcompl) { v_seqnum = nofsequences - j - 1; v.direct = false; gt_assert(gt_encseq_seqlength(encseq, j) == gt_encseq_seqlength(encseq, v_seqnum)); startpos = gt_encseq_seqstartpos(encseq, v_seqnum); gt_encseq_extract_decoded(encseq, v.seq, startpos, startpos + v.len - 1); if (use_kmp) { gt_assert(kmp_values != NULL); v.pi = kmp_values[v_seqnum]; } containment_status = use_dp ? find_approx_overlaps(&d, max_error, min_length, find_nonmaximal) : find_exact_overlaps(&d, use_kmp, min_length, find_nonmaximal); if (m != GT_OVLFIND_SPM) mark_contained(containment_status, u.seqnum, v.seqnum, cntreads); } gt_free(v.seq); progress++; } gt_free(u.seq); } if (cntreads_out != NULL) *cntreads_out = cntreads; else if (cntreads_in == NULL) gt_free(cntreads); if (nofreads != NULL) *nofreads = n; if (use_kmp) free_kmp_values(kmp_values, revcompl ? n << 1 : n); if (show_progressbar) gt_progressbar_stop(); }
/*
 * Consistency check over all positions of <encseq> in <readmode>.
 *
 * For each position the character delivered by random access
 * (getencodedchar) is compared with the one delivered by sequential access
 * (sequentialgetencodedchar). If <filenametab> is not NULL and <readmode> is
 * Forwardmode, the characters are additionally compared with those read from
 * the original files through a sequence buffer, and the scan ends when that
 * buffer is exhausted; otherwise it ends after the total length.
 *
 * Returns 0 if all characters agree and the number of scanned positions
 * equals the total length, -1 otherwise (with <err> set).
 */
static int testfullscan(const GtStrArray *filenametab,
                        const Encodedsequence *encseq,
                        Readmode readmode,
                        GtError *err)
{
  Seqpos pos = 0, totallength;
  GtUchar ccscan = 0, ccra, ccsr;
  GtSequenceBuffer *fb = NULL;
  int retval;
  bool haserr = false;
  /* initialised so that the cleanup below never sees an indeterminate
     pointer when the scan state is not created */
  Encodedsequencescanstate *esr = NULL;
  unsigned long long fullscanpbar = 0;

  gt_error_check(err);
  totallength = getencseqtotallength(encseq);
  gt_progressbar_start(&fullscanpbar,(unsigned long long) totallength);
  if (filenametab != NULL)
  {
    fb = gt_sequence_buffer_new_guess_type((GtStrArray*) filenametab, err);
    if (!fb)
      haserr = true;
    if (!haserr)
      gt_sequence_buffer_set_symbolmap(fb, getencseqAlphabetsymbolmap(encseq));
  }
  if (!haserr)
  {
    esr = newEncodedsequencescanstate();
    initEncodedsequencescanstate(esr,encseq,readmode,0);
    for (pos=0; /* Nothing */; pos++)
    {
      if (filenametab != NULL && readmode == Forwardmode)
      {
        /* the file content determines the end of the scan */
        retval = gt_sequence_buffer_next(fb,&ccscan,err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
      } else
      {
        if (pos >= totallength)
        {
          break;
        }
      }
      ccra = getencodedchar(encseq,pos,readmode); /* Random access */
      if (filenametab != NULL && readmode == Forwardmode)
      {
        if (ccscan != ccra)
        {
          gt_error_set(err,"access=%s, position=" FormatSeqpos
                            ": scan (readnextchar) = %u != "
                            "%u = random access",
                            encseqaccessname(encseq),
                            pos,
                            (unsigned int) ccscan,
                            (unsigned int) ccra);
          haserr = true;
          break;
        }
      }
      ccsr = sequentialgetencodedchar(encseq,esr,pos,readmode);
      if (ccra != ccsr)
      {
        gt_error_set(err,"access=%s, mode=%s: position=" FormatSeqpos
                          ": random access = %u != %u = sequential read",
                          encseqaccessname(encseq),
                          showreadmode(readmode),
                          pos,
                          (unsigned int) ccra,
                          (unsigned int) ccsr);
        haserr = true;
        break;
      }
      fullscanpbar++;
    }
  }
  /* the progress bar was started unconditionally above, so it is stopped on
     every path, including the one where the sequence buffer is missing */
  gt_progressbar_stop();
  if (!haserr)
  {
    if (pos != totallength)
    {
      gt_error_set(err,"sequence length must be " FormatSeqpos " but is "
                        FormatSeqpos,totallength,pos);
      haserr = true;
    }
  }
  /* the scan state only exists if the scan was actually entered */
  if (esr != NULL)
  {
    freeEncodedsequencescanstate(&esr);
  }
  gt_sequence_buffer_delete(fb);
  return haserr ? -1 : 0;
}
/*
 * Runner of the sequniq tool: writes every sequence of the input files that
 * the MD5 set has not seen before as a FASTA entry to arguments->outfp and
 * counts the others as duplicates.
 *
 * Depending on arguments->seqit the input is read either file by file via
 * GtBioseq objects or through a sequence iterator over all files (with an
 * optional progress bar if arguments->verbose is set).
 *
 * On success a statistics line is printed to stderr.
 * Returns 0 on success and -1 on error.
 */
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        int next_status = gt_seq_iterator_next(seqit, &sequence, &len, &desc,
                                               err);
        if (next_status < 0) {
          /* a negative status is assumed to signal an iterator error that is
             recorded in err (confirm against the iterator's contract); it
             must not be reported as success */
          had_err = -1;
          break;
        }
        if (next_status != 1)
          break;
        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    /* without any input sequence the ratio is undefined; report 0% instead
       of dividing by zero */
    double percent_removed = 0.0;
    if (num_of_sequences > 0) {
      percent_removed = ((double) duplicates / (double) num_of_sequences)
                        * 100.0;
    }
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences, percent_removed);
  }

  gt_md5set_delete(md5set);
  return had_err;
}
/*
 * Reads the keys listed in <fileofkeystoextract> and scans the sequences of
 * <referencefiletab> for them. The key of each sequence is derived from its
 * description by desc2key(). For each matching query either the complete
 * sequence or the substring given by the query's frompos/topos (1-based,
 * inclusive) is written as FASTA entry with line width <width> to <outfp>.
 * The scan stops as soon as all queries have been marked as hit.
 *
 * With <verbose> the estimated input size and a progress bar are shown, and
 * the queries that were not hit are reported at the end.
 *
 * Returns -1 on error. Otherwise the status of the last call to
 * gt_seqiterator_next() is returned.
 * NOTE(review): this means 1 is returned if the scan stopped early because
 * all keys were found, and 0 if the input was exhausted; kept as is since
 * callers are not visible here -- confirm what they expect.
 */
int gt_extractkeysfromfastafile(bool verbose,
                                GtFile *outfp,
                                unsigned long width,
                                const GtStr *fileofkeystoextract,
                                GtStrArray *referencefiletab,
                                GtError *err)
{
  GtSeqIterator *seqit;
  const GtUchar *sequence;
  char *desc, *headerbufferspace = NULL, *keyspace = NULL;
  const char *keyptr;
  unsigned long allockeyspace = 0, len, keylen, numofqueries, keyposition,
                countmarkhit = 0;
  int had_err = 0;
  bool progressbar_started = false;
  off_t totalsize;
  Fastakeyquery *fastakeyqueries;
  size_t headerbuffersize = 0, headerlength;

  gt_error_check(err);
  fastakeyqueries = readfileofkeystoextract(verbose,&numofqueries,
                                            fileofkeystoextract,err);
  if (fastakeyqueries == NULL)
  {
    return -1;
  }
  totalsize = gt_files_estimate_total_size(referencefiletab);
  if (verbose)
  {
    printf("# estimated total size is " Formatuint64_t "\n",
            PRINTuint64_tcast(totalsize));
  }
  seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
  if (!seqit)
  {
    had_err = -1;
  }
  if (!had_err && verbose)
  {
    gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
                                                          (unsigned long long)
                                                          totalsize),
                         (unsigned long long) totalsize);
    progressbar_started = true;
  }
  while (had_err != -1 && countmarkhit < numofqueries)
  {
    had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
    if (had_err != 1)
    {
      break;
    }
    keyptr = desc2key(&keylen,desc,err);
    if (keyptr == NULL)
    {
      had_err = -1;
    } else
    {
      /* also allocate on first use, so that an empty first key does not
         leave the buffer unallocated */
      if (keyspace == NULL || allockeyspace < keylen)
      {
        keyspace = gt_realloc(keyspace,sizeof (*keyspace) * (keylen+1));
        allockeyspace = keylen;
      }
      gt_assert(keyspace != NULL);
      strncpy(keyspace,keyptr,(size_t) keylen);
      keyspace[keylen] = '\0';
      keyposition = searchdesinfastakeyqueries(keyspace,fastakeyqueries,
                                               numofqueries);
      if (keyposition < numofqueries)
      {
        /* serve all consecutive queries that carry the same key */
        while (keyposition < numofqueries &&
               strcmp(fastakeyqueries[keyposition].fastakey,keyspace) == 0)
        {
#ifndef NDEBUG
          if (fastakeyqueries[keyposition].markhit)
          {
            fprintf(stderr,"key %s was already found before\n",
                     fastakeyqueries[keyposition].fastakey);
            exit(GT_EXIT_PROGRAMMING_ERROR);
          }
#endif
          headerlength = strlen(desc);
          if (headerbuffersize < headerlength + EXTRABUF + 1)
          {
            headerbuffersize = headerlength + EXTRABUF + 1;
            headerbufferspace = gt_realloc(headerbufferspace,
                                           sizeof (*headerbufferspace)
                                           * headerbuffersize);
          }
          if (COMPLETE(fastakeyqueries + keyposition))
          {
            gt_fasta_show_entry(desc, (const char *) sequence, len, width,
                                outfp);
          } else
          {
            /* NOTE(review): frompos/topos are not checked against len here;
               unless readfileofkeystoextract() or the key file format
               guarantee valid ranges, this reads outside the sequence */
            (void) snprintf(headerbufferspace,headerbuffersize,
                            "%*.*s %lu %lu %s",
                            (int) keylen,(int) keylen,keyspace,
                            fastakeyqueries[keyposition].frompos,
                            fastakeyqueries[keyposition].topos,
                            desc);
            gt_fasta_show_entry(headerbufferspace,
                                (const char *)
                                (sequence+fastakeyqueries[keyposition].
                                                          frompos - 1),
                                fastakeyqueries[keyposition].topos -
                                fastakeyqueries[keyposition].frompos+1,
                                width, outfp);
          }
          fastakeyqueries[keyposition].markhit = true;
          countmarkhit++;
          keyposition++;
        }
      }
#ifdef SKDEBUG
      printf("%s 1 %lu\n",keyspace, len);
#endif
    }
  }
  gt_free(headerbufferspace);
  gt_free(keyspace);
  /* only stop a progress bar that was actually started; it is not started
     when the sequence iterator could not be created */
  if (progressbar_started)
  {
    gt_progressbar_stop();
  }
  if (verbose)
  {
    outputnonmarked(fastakeyqueries,numofqueries);
  }
  fastakeyqueries_delete(fastakeyqueries,numofqueries);
  gt_seqiterator_delete(seqit);
  return had_err;
}
/*
 * Runner of the convertseq tool: reads all sequences of the input files with
 * a sequence iterator and writes them in FASTA format to arguments->outfp.
 *
 * - arguments->revcomp: the reverse complement of each sequence is processed.
 * - arguments->reduce_wc_dna / reduce_wc_prot: each run of consecutive
 *   wildcard characters is reduced to a single 'N' or 'n'.
 * - arguments->fastawidth: line width of the sequence output, 0 means that
 *   the sequence is written on a single line.
 * - arguments->showflv: the per-file length values are printed to stderr.
 * - arguments->verbose: a progress bar is shown.
 *
 * Sequences are written only if arguments->showseq is false.
 * NOTE(review): the field name suggests the opposite; presumably it is bound
 * to a "do not show sequences" option -- confirm at the option definition.
 *
 * Returns 0 on success and a non-zero error code otherwise.
 */
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                                void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j;
  off_t totalsize;

  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++)
  {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose)
    {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64) totalsize),
                           (GtUint64) totalsize);
    }
    while (true)
    {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL; /* number of characters on the current output line */
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err) {
          /* the copy is not reachable after the loop, release it here */
          gt_free(newseq);
          break;
        }
        seq = newseq;
      } else seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        GtUword pos; /* same type as len, no signed/unsigned comparison */

        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (pos = 0; pos < len; pos++) {
          /* false if the character is swallowed as part of a wildcard run */
          bool emitted = true;

          if (arguments->reduce_wc_dna) {
            switch (seq[pos]) {
              case 'a': case 'A':
              case 'c': case 'C':
              case 'g': case 'G':
              case 't': case 'u':
              case 'T': case 'U':
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  if (isupper((int) seq[pos]))
                    gt_file_xfputc((int) 'N', arguments->outfp);
                  else
                    gt_file_xfputc((int) 'n', arguments->outfp);
                } else
                  emitted = false;
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[pos]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'N', arguments->outfp);
                } else
                  emitted = false;
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'n', arguments->outfp);
                } else
                  emitted = false;
                break;
              default:
                in_wildcard = false;
                gt_file_xfputc((int) seq[pos], arguments->outfp);
            }
          }
          else {
            gt_file_xfputc((int) seq[pos], arguments->outfp);
          }

          /* break the line only after a character was really written;
             otherwise a swallowed wildcard directly after a line break would
             produce an empty line */
          if (emitted) {
            j++;
            if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
              j = 0;
              gt_file_xprintf(arguments->outfp, "\n");
            }
          }
        }
        /* terminate an unfinished line; decided by the characters actually
           written, since wildcard reduction can emit fewer than len */
        if (arguments->fastawidth == 0 || j != 0)
          gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j=0;j<gt_str_array_size(files);j++)
      {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
               j,
               gt_str_array_get(files, j),
               (GtUword) flv[j].length,
               (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose)
    {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}
/*
 * Delivers the next genome node of the plain GFF3 input stream in *<gn>.
 *
 * Nodes are served from is->genome_node_buffer. As long as the buffer holds
 * more than one node, the next one is returned directly. Otherwise the
 * current input (the next file of is->files, or is->fpin == NULL if no file
 * was given) is opened if necessary and the parser is called up to two times
 * to refill the buffer. When the parser reports EOF the current input is
 * closed, the parser is reset, and the next file (if any) is processed.
 *
 * If is->ensure_sorting is set and the buffer holds more than one node, the
 * buffer is checked with buffer_is_sorted.
 *
 * Returns 0 on success, where *<gn> is NULL once all input has been
 * processed, and a non-zero error code otherwise.
 */
static int gff3_in_stream_plain_next(GtNodeStream *ns, GtGenomeNode **gn,
                                     GtError *err)
{
  GtGFF3InStreamPlain *is = gff3_in_stream_plain_cast(ns);
  GtStr *filenamestr;
  int had_err = 0, status_code;

  gt_error_check(err);

  if (gt_queue_size(is->genome_node_buffer) > 1) {
    /* we still have at least two nodes in the buffer -> serve from there */
    *gn = gt_queue_get(is->genome_node_buffer);
    return 0;
  }

  /* the buffer is empty or has one element */
  gt_assert(gt_queue_size(is->genome_node_buffer) <= 1);

  for (;;) {
    /* open file if necessary */
    if (!is->file_is_open) {
      if (gt_str_array_size(is->files) &&
          is->next_file == gt_str_array_size(is->files)) {
        /* all given files have been processed */
        break;
      }
      if (gt_str_array_size(is->files)) {
        if (strcmp(gt_str_array_get(is->files, is->next_file), "-") == 0) {
          if (is->stdin_argument) {
            gt_error_set(err, "multiple specification of argument file \"-\"");
            had_err = -1;
            break;
          }
          is->fpin = gt_file_xopen(NULL, "r");
          is->file_is_open = true;
          is->stdin_argument = true;
        }
        else {
          is->fpin = gt_file_xopen(gt_str_array_get(is->files,
                                                    is->next_file), "r");
          is->file_is_open = true;
        }
        is->next_file++;
      }
      else {
        /* no file given at all */
        if (is->stdin_processed)
          break;
        is->fpin = NULL;
        is->file_is_open = true;
      }
      is->line_number = 0;

      if (!had_err && is->progress_bar) {
        printf("processing file \"%s\"\n", gt_str_array_size(is->files)
               ? gt_str_array_get(is->files, is->next_file-1) : "stdin");
      }
      /* the progress bar is only started if there is a file object */
      if (!had_err && is->fpin && is->progress_bar) {
        gt_progressbar_start(&is->line_number,
                             gt_file_number_of_lines(gt_str_array_get(is->files,
                                                             is->next_file-1)));
      }
    }

    gt_assert(is->file_is_open);

    filenamestr = gt_str_array_size(is->files)
                  ? gt_str_array_get_str(is->files, is->next_file-1)
                  : is->stdinstr;
    /* read two nodes */
    had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser, &status_code,
                                                is->genome_node_buffer,
                                                is->used_types, filenamestr,
                                                &is->line_number, is->fpin,
                                                err);
    if (had_err)
      break;
    if (status_code != EOF) {
      had_err = gt_gff3_parser_parse_genome_nodes(is->gff3_parser, &status_code,
                                                  is->genome_node_buffer,
                                                  is->used_types, filenamestr,
                                                  &is->line_number, is->fpin,
                                                  err);
      if (had_err)
        break;
    }

    if (status_code == EOF) {
      /* end of current file */
      /* same condition as for starting the progress bar above, so that it is
         never stopped without having been started */
      if (is->fpin && is->progress_bar)
        gt_progressbar_stop();
      gt_file_delete(is->fpin);
      is->fpin = NULL;
      is->file_is_open = false;
      gt_gff3_parser_reset(is->gff3_parser);
      if (!gt_str_array_size(is->files)) {
        is->stdin_processed = true;
        break;
      }
      continue;
    }

    gt_assert(gt_queue_size(is->genome_node_buffer));

    /* make sure the parsed nodes are sorted */
    if (is->ensure_sorting && gt_queue_size(is->genome_node_buffer) > 1) {
      GtGenomeNode *last_node = NULL;
      /* a sorted stream can have at most one input file */
      gt_assert(gt_str_array_size(is->files) == 0 ||
                gt_str_array_size(is->files) == 1);
      had_err = gt_queue_iterate(is->genome_node_buffer, buffer_is_sorted,
                                 &last_node, err);
    }
    if (!had_err) {
      *gn = gt_queue_get(is->genome_node_buffer);
    }
    return had_err;
  }
  gt_assert(!gt_queue_size(is->genome_node_buffer));
  *gn = NULL;
  return had_err;
}