/* Growable array of BAM record pointers: n entries in use, max allocated. */
typedef struct {
	int n, max;
	bam1_t **a;
} tmp_stack_t;

/*
 * Append b to the stack.
 *
 * When the array is full its capacity is doubled (or set to 0x10000 on the
 * first insertion) before the pointer is stored.
 * NOTE(review): the realloc() result is not checked before it is used.
 */
static inline void stack_insert(tmp_stack_t *stack, bam1_t *b)
{
	if (stack->n == stack->max) {
		stack->max = stack->max? stack->max<<1 : 0x10000;
		stack->a = (bam1_t**)realloc(stack->a, sizeof(bam1_t*) * stack->max);
	}
	stack->a[stack->n++] = b;
}

/*
 * Write every record held in stack to out, destroy each record, and reset
 * the stack to empty. best_hash is cleared only once it holds more than
 * BUFFER_SIZE entries.
 */
static inline void dump_best(tmp_stack_t *stack, khash_t(pos) *best_hash, bamFile out)
{
	int i;
	for (i = 0; i != stack->n; ++i) {
		bam_write1(out, stack->a[i]);
		bam_destroy1(stack->a[i]);
	}
	stack->n = 0;
	if (kh_size(best_hash) > BUFFER_SIZE) kh_clear(pos, best_hash);
}

/* Visits every occupied bucket of del_set. */
static void clear_del_set(khash_t(name) *del_set) { khint_t k; for (k = kh_begin(del_set); k < kh_end(del_set); ++k) if (kh_exist(del_set, k))
/* Word tokens are appended in normalized form; any other token becomes a single space. */
if (is_word_token(token.type)) { add_normalized_token(array, str, token, LANGUAGE_CLASSIFIER_NORMALIZE_TOKEN_OPTIONS); } else { char_array_add(array, " "); } } }

/*
 * Append "<prefix><NAMESPACE_SEPARATOR_CHAR>" to array.
 * Does nothing when prefix is NULL.
 */
static inline void append_prefix(char_array *array, char *prefix) {
    if (prefix != NULL) {
        char_array_append(array, prefix);
        char_array_append(array, NAMESPACE_SEPARATOR_CHAR);
    }
}

/*
 * Add one feature for the full text of token to features with a count of 1.0.
 *
 * The feature string is built in feature_array (which is cleared first) as
 * the optional namespaced prefix followed by the token.len bytes of str that
 * start at token.offset.
 *
 * Returns without adding anything when features or feature_array is NULL, or
 * when feature_array->n <= 1 after the token has been appended.
 * NOTE(review): the n <= 1 test presumably means "empty string plus
 * terminator" -- confirm against the char_array implementation.
 */
static inline void add_full_token_feature(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token) {
    if (features == NULL || feature_array == NULL) return;

    char_array_clear(feature_array);
    append_prefix(feature_array, prefix);

    char_array_add_len(feature_array, str + token.offset, token.len);

    if (feature_array->n <= 1) return;
    char *feature = char_array_get_string(feature_array);
    log_debug("full token feature=%s\n", feature);
    feature_counts_add(features, feature, 1.0);
}

static void add_ngram_features(khash_t(str_double) *features, char *prefix, char_array *feature_array, char *str, token_t token, size_t n) {
/**
 * Run the "effect" tool: read a VCF file in batches, filter the records,
 * query the effect / SNP-phenotype / mutation-phenotype web services for the
 * records that pass, and write the results into the output directory.
 *
 * Three OpenMP sections run concurrently:
 *   1. a reader that parses the VCF file into batches,
 *   2. a processor that filters each batch and invokes the web services
 *      (with up to 3 retries, 4 seconds apart),
 *   3. a writer that drains output_list into the per-type output files.
 *
 * @param urls                 urls[0] = effect WS, urls[1] = SNP phenotype WS,
 *                             urls[2] = mutation phenotype WS
 * @param shared_options_data  options shared by all tools (file names, batch
 *                             sizes, thread count, filter chain, ...);
 *                             entries_per_thread is written by this function
 * @param options_data         options specific to this tool
 * @return 0 on success, otherwise the first non-zero code returned by the
 *         set-up helpers or by vcf_read()
 *
 * NOTE(review): the code keeps using vcf_file / ped_file / job_status after
 * LOG_FATAL, so LOG_FATAL is presumably expected not to return -- confirm.
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;

    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }

    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);

    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }

    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }

    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }

    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);

    // Create job.status file.
    // sizeof("/job.status") counts the suffix AND the terminating NUL (12 bytes);
    // the previous "+ 10" was two bytes short and overflowed the buffer.
    char job_status_filename[output_directory_len + sizeof("/job.status")];
    snprintf(job_status_filename, sizeof(job_status_filename), "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());

            start = omp_get_wtime();

            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }

#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);

            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);

            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;

            // Filename structure outdir/vcfname.errors
            // (+1 so that a name as long as the whole path still has room for its NUL)
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char));
            if (!prefix_filename) {
                LOG_FATAL("Can't allocate memory for the errors file name\n");
            }
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            // 9 = strlen("/") + strlen(".errors") + terminating NUL
            size_t non_processed_filename_len = strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9;
            char *non_processed_filename = malloc(non_processed_filename_len * sizeof(char));
            if (!non_processed_filename) {
                LOG_FATAL("Can't allocate memory for the errors file name\n");
            }
            snprintf(non_processed_filename, non_processed_filename_len, "%s/%s.errors",
                     shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);   // only needed to build the name above

            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY,
                        ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);

            int i = 0;
            vcf_batch_t *batch = NULL;
            // NOTE(review): these status codes are shared by every thread of the
            // nested "parallel for" below, so the value seen after the loop is
            // that of whichever chunk finished last -- confirm this is intended.
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;

            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }

                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }

                    LOG_DEBUG("VCF header written\n");

                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }

                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                           i, omp_get_thread_num(),
                           batch->records->size, batch->records->capacity);

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);

                    do {
                        // OpenMP: Launch a thread for each range
#pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);

                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            // On a retry, only re-query the services that failed
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }

                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }

                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }

                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);

                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }

                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));

                    // Released on every exit from the retry loop; previously these
                    // leaked whenever the retries were exhausted.
                    free(chunk_starts);
                    free(chunk_sizes);
                }

                // If the maximum number of reconnections was reached still with errors,
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                    if (non_processed_file != NULL) {   // fopen() above may have failed
#pragma omp critical
                        {
                            write_vcf_batch(batch, non_processed_file);
                        }
                    }
                }

                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);

                // Free batch and its contents
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }

            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);

            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

#pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;

            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");

            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;

                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }

                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }

                free(line);
                list_item_free(item);
            }
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);

    update_job_status_file(100, job_status);
    close_job_status_file(job_status);

    return ret_code;
}
u_int32_t cacheClassificationLookup(dbClassificationObj *iLookup,cacheClassificationObj *iHead);
u_int32_t dbClassificationLookup(dbClassificationObj *iLookup,cacheClassificationObj *iHead);
/* LOOKUP FUNCTIONS (closing marker; the section opens above this point) */

/* CLASSIFICATION FUNCTIONS (start of section) */
u_int32_t ClassificationPullDataStore(DatabaseData *data, dbClassificationObj **iArrayPtr,u_int32_t *array_length);
u_int32_t ClassificationCacheUpdateDBid(dbClassificationObj *iDBList,u_int32_t array_length,cacheClassificationObj **cacheHead);
u_int32_t ClassificationPopulateDatabase(DatabaseData *data,cacheClassificationObj *cacheHead);
u_int32_t ClassificationCacheSynchronize(DatabaseData *data,cacheClassificationObj **cacheHead);
/* CLASSIFICATION FUNCTIONS (end of section) */

/* SIGNATURE FUNCTIONS (start of section; file-local helpers) */
static u_int32_t SignatureLookupCache(dbSignatureObj * lookup);
static u_int32_t dbSignatureObjEquals(dbSignatureObj const * const sig1,dbSignatureObj const * const sig2);
static u_int32_t SignatureCacheLazyInit(MasterCache * mc, khash_t(dbSigCacheNode) ** cache, sig_gid_t gid);
/* SIGNATURE FUNCTIONS (end of section) */

/* SIGNATURE REFERENCE FUNCTIONS (start of section; file-local helpers) */
static u_int32_t SignatureInsertReferences(DatabaseData * data, dbSignatureObj * sig);
static u_int32_t SignatureInsertReference(DatabaseData * data, u_int32_t db_sig_id, int seq, ReferenceNode * ref);
static u_int32_t ReferenceSystemLookupDbCache(MasterCache *mc, dbSystemObj * lookup);
static u_int32_t ReferenceSystemCacheInsertObj(dbSystemObj * sys, MasterCache * mc );
static u_int32_t DbReferenceSystemLookup(DatabaseData * data, dbSystemObj * lookup);
static u_int32_t ReferenceSystemLookupDatabase(DatabaseData * data, dbSystemObj * lookup);
static u_int32_t ReferenceSystemPopulateDatabase(DatabaseData * data, dbSystemObj * sys);
static u_int32_t ReferenceLookup(DatabaseData * data, dbReferenceObj * ref);
static u_int32_t ReferencePopulateDatabase(DatabaseData * data, dbReferenceObj * ref);
static u_int32_t ReferenceLookupDatabase(DatabaseData * data, dbReferenceObj * lookup);
int main(int argc, char **argv) { // compiler complains about unused function without these linese (void)kh_clear_ghash; (void)kh_del_ghash; if(argc < 2) print_usage(usage, NULL); char swap_alleles = 0; int c; while((c = getopt(argc, argv, "s")) >= 0) { switch (c) { case 's': swap_alleles = 1; break; default: die("Unknown option: %c", c); } } if(optind == argc) print_usage(usage, "Not enough arguments"); char *inputpath = argv[optind]; char **refpaths = argv + optind + 1; size_t num_refs = argc - optind - 1; gzFile gzin = gzopen(inputpath, "r"); if(gzin == NULL) die("Cannot read file: %s", inputpath); size_t i, nchroms = 0, capacity = 1024; khash_t(ghash) *genome = kh_init(ghash); read_t *reads = malloc(capacity * sizeof(read_t)), *r; int hret; khiter_t k; for(i = 0; i < num_refs; i++) { fprintf(stderr, "Loading %s\n", refpaths[i]); load_reads(refpaths[i], &reads, &capacity, &nchroms); } if(num_refs == 0) { fprintf(stderr, "Loading from stdin\n"); load_reads("-", &reads, &capacity, &nchroms); } if(nchroms == 0) die("No chromosomes loaded"); for(i = 0; i < nchroms; i++) { r = reads + i; fprintf(stderr, "Loaded: '%s'\n", r->name.b); k = kh_put(ghash, genome, r->name.b, &hret); if(hret == 0) warn("Duplicate read name (taking first): %s", r->name.b); else kh_value(genome, k) = r; } // Now read VCF StrBuf line; strbuf_alloc(&line, 1024); char *fields[9]; char *chr; int pos, reflen, altlen; while(strbuf_reset_gzreadline(&line, gzin) > 0) { if(line.b[0] == '#') fputs(line.b, stdout); else { strbuf_chomp(&line); vcf_columns(line.b, fields); fields[1][-1] = fields[2][-1] = '\0'; chr = line.b; pos = atoi(fields[1])-1; k = kh_get(ghash, genome, chr); r = kh_value(genome, k); fields[1][-1] = fields[2][-1] = '\t'; reflen = fields[4] - fields[3] - 1; altlen = fields[5] - fields[4] - 1; if(k == kh_end(genome)) warn("Cannot find chrom: %s", chr); else if(pos < 0) warn("Bad line: %s\n", line.b); else if((reflen == 1 && altlen == 1) || fields[3][0] == fields[4][0]) { 
if((unsigned)pos + reflen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[3],reflen) == 0) { fputs(line.b, stdout); fputc('\n', stdout); } else if(swap_alleles && (unsigned)pos + altlen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[4],altlen) == 0) { // swap alleles char tmp[altlen], *ref = fields[3], *alt = fields[4]; memcpy(tmp, alt, altlen); memmove(ref+altlen+1, ref, reflen); memcpy(ref, tmp, altlen); ref[altlen] = '\t'; fputs(line.b, stdout); fputc('\n', stdout); } // else printf("FAIL0\n"); } // else printf("FAIL1\n"); } } kh_destroy(ghash, genome); strbuf_dealloc(&line); gzclose(gzin); for(i = 0; i < nchroms; i++) seq_read_dealloc(reads+i); free(reads); fprintf(stderr, " Done.\n"); return 0; }
/*
 * Find (or create) the vertex buffer that batches geometry sharing the same
 * blend function, texture/font atlas, Z value and shader program.
 *
 * Lookup goes through three nested tables:
 *   g_bgs[blend] -> blend group
 *   bg->tgs[atlas name] -> texture group
 *   tg->sgs["<z*10000, 5 digits><program name>"] -> shader group
 * Missing groups are created on the way, and each group's use counter (num)
 * is incremented.
 *
 * buff     source buffer; its first vertex supplies Z, and its format is
 *          used when the shader group's buffer has to be created
 * tex      texture whose atlas names the group, or NULL
 * font     font whose atlas names the group when tex is NULL, or NULL
 *          (NO_ATLAS is used when both are NULL)
 * program  shader program; its name is part of the shader-group key
 * blend    blend function, used as the blend-group key
 *
 * Returns the shader group's vertex buffer.
 */
vertex_buffer_t* get_group_buffer(vertex_buffer_t* buff, const texture_t* tex, const font_t* font, const shader_t* program, blend_func blend)
{
  tex_group_t* tg;
  blend_group_t* bg;
  unsigned int bkey;
  khiter_t i,j;
  khash_t(hmsp)* tgs;
  const char* atlas_name = NO_ATLAS;
  char buffer [128];
  vec4* v;
  int z;
  shader_group_t* sg;
  khash_t(hmsp)* sgs;

  if (tex)
    atlas_name = sen_texture_atlas(tex);
  else if (font)
    atlas_name = sen_font_atlas(font);

  /* blend group */
  bkey = (unsigned int)blend;
  i = kh_get(hmip, g_bgs, bkey);
  if (i != kh_end(g_bgs))
    bg = kh_val(g_bgs, i);
  else {
    bg = blend_group_new(bkey);
    kh_insert(hmip, g_bgs, bkey, bg);
  }
  bg->num++;

  /* texture group */
  tgs = bg->tgs;
  j = kh_get(hmsp, tgs, atlas_name);
  if (j != kh_end(tgs))
    tg = kh_val(tgs, j);
  else {
    tg = tex_group_new(tex,font);
    kh_insert(hmsp, tgs, atlas_name, tg);
  }
  tg->num++;

  /* shader group, keyed by quantized Z of the first vertex + program name */
  v = (vec4*) buff->vertices->items;
  z = (int) (v->z * 10000);
  /* Bounded write: program->name has no length limit visible here, so an
     unbounded sprintf() could run past the 128-byte key buffer. */
  snprintf (buffer, sizeof buffer, "%05d%s",z,program->name);

  sgs = tg->sgs;
  i = kh_get(hmsp, sgs, buffer);
  if (i != kh_end(sgs))
    sg = kh_val(sgs, i);
  else {
    sg = shader_group_new(program, buff,buffer,z,tg,bg);
    kh_insert(hmsp, sgs, sg->name, sg);
  }

  if (sg->buff == NULL) {
    sg->buff = vertex_buffer_new(vertex_buffer_format(buff));
  }
  sg->num++;

  return sg->buff;
}
ERR_VALUE kmer_freq_distribution(const PROGRAM_OPTIONS *Options, const uint32_t KMerSize, const ONE_READ *Reads, const size_t ReadCount) { int err; size_t maxValue = 0; khiter_t it; size_t kmerCount = 0; char *kmerString = NULL; khash_t(kc) *table = kh_init(kc); ERR_VALUE ret = ERR_INTERNAL_ERROR; ret = utils_calloc(KMerSize + 1, sizeof(char), &kmerString); if (ret == ERR_SUCCESS) { const ONE_READ *r = Reads; kmerString[KMerSize] = '\0'; for (size_t i = 0; i < ReadCount; ++i) { const READ_PART *p = &r->Part; read_split(r); if (p->ReadSequenceLength >= KMerSize) { for (size_t j = 0; j < p->ReadSequenceLength - KMerSize + 1; ++j) { char *s = NULL; memcpy(kmerString, p->ReadSequence + j, KMerSize*sizeof(char)); ret = utils_copy_string(kmerString, &s); if (ret == ERR_SUCCESS) { it = kh_put(kc, table, s, &err); switch (err) { case 0: kh_value(table, it) += 1; if (kh_value(table, it) > maxValue) maxValue = kh_value(table, it); utils_free(s); break; case 1: case 2: kh_value(table, it) = 1; break; default: ret = ERR_OUT_OF_MEMORY; break; } ++kmerCount; if (ret != ERR_SUCCESS) utils_free(s); } if (ret != ERR_SUCCESS) break; } } if (ret != ERR_SUCCESS) break; ++r; } if (ret == ERR_SUCCESS) { size_t *freqArray = NULL; ++maxValue; ret = utils_calloc(maxValue, sizeof(size_t), &freqArray); if (ret == ERR_SUCCESS) { memset(freqArray, 0, maxValue*sizeof(size_t)); for (it = kh_begin(table); it != kh_end(table); ++it) { if (kh_exist(table, it)) ++freqArray[kh_value(table, it)]; } for (size_t i = 0; i < maxValue; ++i) { if (freqArray[i] > 0) fprintf(stdout, "%Iu, %Iu, %lf\n", i, freqArray[i], (double)freqArray[i]*100/ (double)kmerCount); } utils_free(freqArray); } } utils_free(kmerString); } for (size_t i = kh_begin(table); i < kh_end(table); ++i) { if (kh_exist(table, i)) utils_free(kh_key(table, i)); } kh_destroy(kc, table); return ret; }
int stk_maskseq(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; int l, i, j, c, is_complement = 0, is_lower = 0; khint_t k; while ((c = getopt(argc, argv, "cl")) >= 0) { switch (c) { case 'c': is_complement = 1; break; case 'l': is_lower = 1; break; } } if (argc - optind < 2) { fprintf(pysamerr, "Usage: seqtk maskseq [-cl] <in.fa> <in.bed>\n\n"); fprintf(pysamerr, "Options: -c mask the complement regions\n"); fprintf(pysamerr, " -l soft mask (to lower cases)\n"); return 1; } h = stk_reg_read(argv[optind+1]); // maskseq fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { k = kh_get(reg, h, seq->name.s); if (k == kh_end(h)) { // not found in the hash table if (is_complement) { for (j = 0; j < l; ++j) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; } } else { reglist_t *p = &kh_val(h, k); if (!is_complement) { for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (beg >= seq->seq.l) { fprintf(pysamerr, "[maskseq] start position >= the sequence length.\n"); continue; } if (end >= seq->seq.l) end = seq->seq.l; if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; } } else { int8_t *mask = calloc(seq->seq.l, 1); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (end >= seq->seq.l) end = seq->seq.l; for (j = beg; j < end; ++j) mask[j] = 1; } for (j = 0; j < l; ++j) if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; free(mask); } } printf(">%s", seq->name.s); for (j = 0; j < seq->seq.l; ++j) { if (j%60 == 0) putchar('\n'); putchar(seq->seq.s[j]); } putchar('\n'); }
/*
 * Build the pairing state in srt for the records located at (chr, min_pos)
 * in the buffers of the active readers.
 *
 * Steps, in order:
 *   1. On the first call (srt->grp_str2int not yet set) choose the pairing
 *      logic, initialise the scores and create the two string->int hashes.
 *   2. Free the keys left in both hashes, clear them and reset the
 *      group / variant / variant-set counters.
 *   3. For every active reader, turn each buffered record at the position
 *      into a variant string ("REF>ALT[,REF>ALT...]", or "REF>." when there
 *      is no ALT), register it in srt->var, and collect the reader's
 *      variants into a group keyed by grp_create_key().
 *   4. Set, for every variant, the bitmask of groups that contain it.
 *   5. Create one variant set per variant and zero the pairing matrix.
 *   6. Repeatedly take the variant set with the highest count, merge it with
 *      the best-scoring set whose group mask does not overlap, or push it
 *      when no such set exists, until no variant sets remain.
 *
 * Finally records chr and min_pos in srt.
 */
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
    if ( !srt->grp_str2int )
    {
        // first time here, initialize
        if ( !srt->pair )
        {
            if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
            bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
        }
        bcf_sr_init_scores(srt);
        srt->grp_str2int = khash_str2int_init();
        srt->var_str2int = khash_str2int_init();
    }

    // Release the key strings stored by the previous call before clearing the hashes
    int k;
    khash_t(str2int) *hash;
    hash = srt->grp_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    hash = srt->var_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    kh_clear(str2int, srt->grp_str2int);
    kh_clear(str2int, srt->var_str2int);
    srt->ngrp = srt->nvar = srt->nvset = 0;

    // Scratch group, filled per reader and copied into srt->grp when it is new
    grp_t grp;
    memset(&grp,0,sizeof(grp_t));

    // group VCFs into groups, each with a unique combination of variants in the duplicate lines
    int ireader,ivar,irec,igrp,ivset,iact;
    for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
    for (iact=0; iact<srt->nactive; iact++)
    {
        ireader = srt->active[iact];
        bcf_sr_t *reader = &readers->readers[ireader];
        int rid = bcf_hdr_name2id(reader->header, chr);
        grp.nvar = 0;
        hts_expand(int,reader->nbuffer,srt->moff,srt->off);
        srt->noff = 0;
        srt->str.l = 0;
        // Buffered records are visited from index 1 up to and including nbuffer
        for (irec=1; irec<=reader->nbuffer; irec++)
        {
            bcf1_t *line = reader->buffer[irec];
            if ( line->rid!=rid || line->pos!=min_pos ) break;

            // Variant strings of one reader are ';'-separated in srt->str;
            // srt->off remembers where each one starts
            if ( srt->str.l ) kputc(';',&srt->str);
            srt->off[srt->noff++] = srt->str.l;
            size_t beg = srt->str.l;
            for (ivar=1; ivar<line->n_allele; ivar++)
            {
                if ( ivar>1 ) kputc(',',&srt->str);
                kputs(line->d.allele[0],&srt->str);
                kputc('>',&srt->str);
                kputs(line->d.allele[ivar],&srt->str);
            }
            if ( line->n_allele==1 )
            {
                kputs(line->d.allele[0],&srt->str);
                kputsn(">.",2,&srt->str);
            }

            // Create new variant or attach to existing one. But careful, there can be duplicate
            // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
            char *var_str = beg + srt->str.s;
            int ret, var_idx = 0, var_end = srt->str.l;
            while ( 1 )
            {
                ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
                if ( ret==-1 ) break;

                // The variant exists; reuse it unless this same reader already
                // contributed to it, in which case a numeric suffix is appended
                // to the key and the lookup is retried
                var_t *var = &srt->var[ivar];
                if ( var->vcf[var->nvcf-1] != ireader ) break;

                srt->str.l = var_end;
                kputw(var_idx, &srt->str);
                // NOTE(review): var_str is recomputed, presumably because kputw may move srt->str.s
                var_str = beg + srt->str.s;
                var_idx++;
            }
            if ( ret==-1 )
            {
                ivar = srt->nvar++;
                hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
                srt->var[ivar].nvcf = 0;
                khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
                free(srt->var[ivar].str);   // possible left-over from the previous position
            }
            var_t *var = &srt->var[ivar];
            var->nalt = line->n_allele - 1;
            var->type = bcf_get_variant_types(line);
            // Cut off any numeric suffix so that var->str holds the plain variant string
            srt->str.s[var_end] = 0;
            if ( ret==-1 )
                var->str = strdup(var_str);

            // Keep var->rec the same capacity as var->vcf
            int mvcf = var->mvcf;
            var->nvcf++;
            hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
            if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
            var->vcf[var->nvcf-1] = ireader;
            var->rec[var->nvcf-1] = line;

            grp.nvar++;
            hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
            grp.var[grp.nvar-1] = ivar;
        }
        char *grp_key = grp_create_key(srt);
        int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
        if ( ret==-1 )
        {
            // New group: srt->grp takes over the scratch group's array and the
            // key string, and the scratch group is reset
            igrp = srt->ngrp++;
            hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
            free(srt->grp[igrp].var);
            srt->grp[igrp] = grp;
            srt->grp[igrp].key = grp_key;
            khash_str2int_set(srt->grp_str2int, grp_key, igrp);
            memset(&grp,0,sizeof(grp_t));
        }
        else
            free(grp_key);
        srt->grp[igrp].nvcf++;
    }
    free(grp.var);

    // initialize bitmask - which groups is the variant present in
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
        kbs_clear(srt->var[ivar].mask);
    }
    for (igrp=0; igrp<srt->ngrp; igrp++)
    {
        for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
        {
            int i = srt->grp[igrp].var[ivar];
            kbs_insert(srt->var[i].mask, igrp);
        }
    }

    // create the initial list of variant sets
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        ivset = srt->nvset++;
        hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);

        varset_t *vset = &srt->vset[ivset];
        vset->nvar = 1;
        hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
        vset->var[vset->nvar-1] = ivar;
        var_t *var  = &srt->var[ivar];
        vset->cnt   = var->nvcf;
        vset->mask  = kbs_resize(vset->mask, srt->ngrp);
        kbs_clear(vset->mask);
        kbs_bitwise_or(vset->mask, var->mask);

        // Translate the VCF_* variant type bits into the SR_* bits; MNPs are treated as SNPs
        int type = 0;
        if ( var->type==VCF_REF ) type |= SR_REF;
        else
        {
            if ( var->type & VCF_SNP ) type |= SR_SNP;
            if ( var->type & VCF_MNP ) type |= SR_SNP;
            if ( var->type & VCF_INDEL ) type |= SR_INDEL;
            if ( var->type & VCF_OTHER ) type |= SR_OTHER;
        }
        var->type = type;
    }
#if DEBUG_VSETS
    debug_vsets(srt);
#endif

    // initialize the pairing matrix
    hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
    hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
    memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);

    for (ivset=0; ivset<srt->nvset; ivset++)
    {
        varset_t *vset = &srt->vset[ivset];
        for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
        srt->cnt[ivset] = vset->cnt;
    }

    // pair the lines
    while ( srt->nvset )
    {
#if DEBUG_VSETS
    fprintf(stderr,"\n");
    debug_vsets(srt);
#endif

        // Pick the variant set with the highest count (first one wins on ties)
        int imax = 0;
        for (ivset=1; ivset<srt->nvset; ivset++)
            if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;

        // Find the best-scoring partner among the sets whose mask does not overlap imax's
        int ipair = -1;
        uint32_t max_score = 0;
        for (ivset=0; ivset<srt->nvset; ivset++)
        {
            if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
            uint32_t score = pairing_score(srt, imax, ivset);
            // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
            if ( max_score < score ) { max_score = score; ipair = ivset; }
        }

        // merge rows creating a new variant set this way
        if ( ipair!=-1 && ipair!=imax )
        {
            imax = merge_vsets(srt, imax, ipair);
            continue;
        }

        // No partner: emit the set as it is
        push_vset(srt, imax);
    }

    srt->chr = chr;
    srt->pos = min_pos;
}
/*
 * Mark phase of the collector.
 *
 * Marks everything reachable from the roots (context chain, arena, globals,
 * dynamic environment, top continuation, features), then repeatedly sweeps
 * the weak maps on pic->heap->weaks: whenever an entry's key is alive but its
 * object value is not yet marked, the value is marked. The sweep is repeated
 * until a full pass marks nothing new.
 */
static void
gc_mark_phase(pic_state *pic)
{
  struct context *ctx;
  size_t idx;
  size_t newly_marked;

  assert(pic->heap->weaks == NULL);

  /* roots: context chain */
  for (ctx = pic->cxt; ctx != NULL; ctx = ctx->prev) {
    if (ctx->fp)
      gc_mark_object(pic, (struct object *)ctx->fp);
    if (ctx->sp)
      gc_mark_object(pic, (struct object *)ctx->sp);
    if (ctx->irep)
      gc_mark_object(pic, (struct object *)ctx->irep);
  }

  /* roots: arena */
  for (idx = 0; idx < pic->ai; ++idx)
    gc_mark_object(pic, (struct object *)pic->arena[idx]);

  /* roots: globals, dynamic environment, top continuation, features */
  gc_mark(pic, pic->globals);
  gc_mark(pic, pic->dyn_env);
  gc_mark(pic, pic->halt);
  gc_mark(pic, pic->features);

  /* weak maps: iterate to a fixed point */
  do {
    struct weak *w;

    newly_marked = 0;

    for (w = pic->heap->weaks; w != NULL; w = w->prev) {
      khash_t(weak) *tbl = &w->hash;
      int slot;

      for (slot = kh_begin(tbl); slot != kh_end(tbl); ++slot) {
        struct object *key;
        pic_value val;

        if (! kh_exist(tbl, slot))
          continue;

        key = kh_key(tbl, slot);
        val = kh_val(tbl, slot);

        if (! is_alive(key))
          continue;

        /* live key keeps its (object) value alive */
        if (obj_p(pic, val) && ! is_alive(obj_ptr(pic, val))) {
          gc_mark(pic, val);
          ++newly_marked;
        }
      }
    }
  } while (newly_marked > 0);
}
char *fai_fetch(const faidx_t *fai, const char *str, int *len) { char *s, c; int i, l, k, name_end; khiter_t iter; faidx1_t val; khash_t(s) *h; int beg, end; beg = end = -1; h = fai->hash; name_end = l = strlen(str); s = (char*)malloc(l+1); // remove space for (i = k = 0; i < l; ++i) if (!isspace(str[i])) s[k++] = str[i]; s[k] = 0; l = k; // determine the sequence name for (i = l - 1; i >= 0; --i) if (s[i] == ':') break; // look for colon from the end if (i >= 0) name_end = i; if (name_end < l) { // check if this is really the end int n_hyphen = 0; for (i = name_end + 1; i < l; ++i) { if (s[i] == '-') ++n_hyphen; else if (!isdigit(s[i]) && s[i] != ',') break; } if (i < l || n_hyphen > 1) name_end = l; // malformated region string; then take str as the name s[name_end] = 0; iter = kh_get(s, h, s); if (iter == kh_end(h)) { // cannot find the sequence name iter = kh_get(s, h, str); // try str as the name if (iter == kh_end(h)) { *len = 0; free(s); return 0; } else s[name_end] = ':', name_end = l; } } else iter = kh_get(s, h, str); if(iter == kh_end(h)) { fprintf(pysamerr, "[fai_fetch] Warning - Reference %s not found in FASTA file, returning empty sequence\n", str); free(s); return 0; }; val = kh_value(h, iter); // parse the interval if (name_end < l) { for (i = k = name_end + 1; i < l; ++i) if (s[i] != ',') s[k++] = s[i]; s[k] = 0; beg = atoi(s + name_end + 1); for (i = name_end + 1; i != k; ++i) if (s[i] == '-') break; end = i < k? atoi(s + i + 1) : val.len; if (beg > 0) --beg; } else beg = 0, end = val.len; if (beg >= val.len) beg = val.len; if (end >= val.len) end = val.len; if (beg > end) beg = end; free(s); // now retrieve the sequence l = 0; s = (char*)malloc(end - beg + 2); razf_seek(fai->rz, val.offset + beg / val.line_blen * val.line_len + beg % val.line_blen, SEEK_SET); while (razf_read(fai->rz, &c, 1) == 1 && l < end - beg && !fai->rz->z_err) if (isgraph(c)) s[l++] = c; s[l] = '\0'; *len = l; return s; }
/*
 * Remove duplicates among the first `upper` elements of buf (single-end mode).
 *
 * Elements are bucketed by position and strand: key = rpos<<1|1 when
 * rpos >= 0, otherwise core.pos<<1. Elements already processed (score < 0)
 * or lying beyond the position limit mpos are skipped. In every bucket the
 * element with the highest score (> 0) keeps its record; the records of all
 * others are destroyed, and every element of the bucket gets score = -1 to
 * mark it as processed.
 *
 * upper is buf->x, or buf->n when buf->x is negative. mpos is the position
 * of the last element when buf->x == buf->n, otherwise INT_MAX.
 */
static void rmdupse_buf(buffer_t *buf)
{
	khash_t(32) *h;
	uint32_t key;
	khint_t k;
	int mpos, i, upper;
	listelem_t *p;

	// Guard x > 0: with an empty buffer (x == n == 0) there is no last
	// element, and buf->buf[-1] must not be read.
	mpos = (buf->x == buf->n && buf->x > 0)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
	upper = (buf->x < 0)? buf->n : buf->x;

	// fill the hash table
	h = kh_init(32);
	for (i = 0; i < upper; ++i) {
		elem_t *e = buf->buf + i;
		int ret;
		if (e->score < 0) continue;
		if (e->rpos >= 0) {
			if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
			else continue;
		} else {
			if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
			else continue;
		}
		k = kh_put(32, h, key, &ret);
		p = &kh_val(h, k);
		if (ret == 0) { // present in the hash table
			if (p->n == p->m) {
				p->m <<= 1;
				p->a = (int*)realloc(p->a, p->m * sizeof(int));
			}
			p->a[p->n++] = i;
		} else {
			// first element of this bucket
			p->m = p->n = 1;
			p->a = (int*)calloc(p->m, sizeof(int));
			p->a[0] = i;
		}
	}

	// rmdup
	for (k = kh_begin(h); k < kh_end(h); ++k) {
		if (kh_exist(h, k)) {
			int max, maxi;
			p = &kh_val(h, k);
			// get the max
			for (i = max = 0, maxi = -1; i < p->n; ++i) {
				if (buf->buf[p->a[i]].score > max) {
					max = buf->buf[p->a[i]].score;
					maxi = i;
				}
			}
			// mark the elements
			for (i = 0; i < p->n; ++i) {
				buf->buf[p->a[i]].score = -1;
				if (i != maxi) {
					bam_destroy1(buf->buf[p->a[i]].b);
					buf->buf[p->a[i]].b = 0;
				}
			}
			// free
			free(p->a);
		}
	}
	kh_destroy(32, h);
}
/*
 * Merge the header `translate` into the accumulated header `out` and fill
 * `tbl` with the mappings needed to rewrite records of `translate`:
 *
 *   tbl->tid_trans[i]  target id i of `translate` -> target id in `out`
 *   tbl->rg_trans      @RG ID of `translate`      -> ID used in `out`
 *   tbl->pg_trans      @PG ID of `translate`      -> ID used in `out`
 *
 * Targets missing from `out` are appended together with their @SQ text line.
 * An @RG/@PG ID that already exists in out->text gets a random suffix unless
 * the matching merge_rg / merge_pg flag is set, in which case the ID is kept
 * and the line is not copied.  PP tags of copied @PG lines and PG tags of
 * copied @RG lines are rewritten through tbl->pg_trans.  tbl->lost_coord_sort
 * is set when the translated target ids are not strictly increasing.
 *
 * out->text / out->l_text are replaced by the merged text.  Allocation
 * failures and an @SQ entry missing from the text header terminate the process.
 */
static void trans_tbl_init(bam_hdr_t* out, bam_hdr_t* translate, trans_tbl_t* tbl, bool merge_rg, bool merge_pg)
{
    tbl->n_targets = translate->n_targets;
    tbl->tid_trans = (int*)calloc(translate->n_targets, sizeof(int));
    tbl->rg_trans = kh_init(c2c);
    tbl->pg_trans = kh_init(c2c);
    if (!tbl->tid_trans || !tbl->rg_trans || !tbl->pg_trans) { perror("out of memory"); exit(-1); }

    int32_t out_len = out->l_text;
    while (out_len > 0 && out->text[out_len-1] == '\n') {--out_len; } // strip trailing \n's
    kstring_t out_text = { 0, 0, NULL };
    kputsn(out->text, out_len, &out_text);

    int i, min_tid = -1;
    tbl->lost_coord_sort = false;

    // Index the target names already present in out
    khash_t(c2i) *out_tid = kh_init(c2i);
    for (i = 0; i < out->n_targets; ++i) {
        int ret;
        khiter_t iter = kh_put(c2i, out_tid, out->target_name[i], &ret);
        if (ret <= 0) abort();
        kh_value(out_tid, iter) = i;
    }

    for (i = 0; i < translate->n_targets; ++i) {
        khiter_t iter = kh_get(c2i, out_tid, translate->target_name[i]);

        if (iter == kh_end(out_tid)) { // Append missing entries to out
            tbl->tid_trans[i] = out->n_targets++;
            out->target_name = (char**)realloc(out->target_name, sizeof(char*)*out->n_targets);
            out->target_name[out->n_targets-1] = strdup(translate->target_name[i]);
            out->target_len = (uint32_t*)realloc(out->target_len, sizeof(uint32_t)*out->n_targets);
            out->target_len[out->n_targets-1] = translate->target_len[i];
            // grep line with regex '^@SQ.*\tSN:%s(\t.*$|$)', translate->target_name[i]
            // from translate->text
            regex_t sq_id;
            regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
            if (matches == NULL) { perror("out of memory"); exit(-1); }
            kstring_t seq_regex = { 0, 0, NULL };
            ksprintf(&seq_regex, "^@SQ.*\tSN:%s(\t.*$|$)", translate->target_name[i]);
            regcomp(&sq_id, seq_regex.s, REG_EXTENDED|REG_NEWLINE);
            free(seq_regex.s);
            if (regexec(&sq_id, translate->text, 1, matches, 0) != 0)
            {
                fprintf(pysamerr, "[trans_tbl_init] @SQ SN (%s) found in binary header but not text header.\n",translate->target_name[i]);
                exit(1);
            }
            regfree(&sq_id);

            // Produce our output line and append it to out_text
            kputc('\n', &out_text);
            kputsn(translate->text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &out_text);

            free(matches);
        } else {
            tbl->tid_trans[i] = kh_value(out_tid, iter);
        }
        if (tbl->tid_trans[i] > min_tid) {
            min_tid = tbl->tid_trans[i];
        } else {
            tbl->lost_coord_sort = true;
        }
    }
    kh_destroy(c2i, out_tid);

    // grep @RG id's
    regex_t rg_id;
    regmatch_t* matches = (regmatch_t*)calloc(2, sizeof(regmatch_t));
    if (matches == NULL) { perror("out of memory"); exit(-1); }
    regcomp(&rg_id, "^@RG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    char* text = translate->text;
    klist_t(hdrln) *rg_list = kl_init(hdrln);
    while(1) { // foreach rg id in translate's header
        if (regexec(&rg_id, text, 2, matches, 0) != 0) break;
        // matches[0] is the whole @RG line; matches[1] is the ID field value
        kstring_t match_id = { 0, 0, NULL };
        kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);

        // is our matched ID in our output list already
        regex_t rg_id_search;
        kstring_t rg_regex = { 0, 0, NULL };
        ksprintf(&rg_regex, "^@RG.*\tID:%s(\t.*$|$)", match_id.s);
        regcomp(&rg_id_search, rg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
        free(rg_regex.s);
        kstring_t transformed_id = { 0, 0, NULL };
        bool transformed_equals_match;
        if (regexec(&rg_id_search, out->text, 0, NULL, 0) != 0 || merge_rg) {
            // Not in there so can add it as 1-1 mapping
            kputs(match_id.s, &transformed_id);
            transformed_equals_match = true;
        } else {
            // It's in there so we need to transform it by appending random number to id
            ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
            transformed_equals_match = false;
        }
        regfree(&rg_id_search);

        // Insert it into our translation map
        int in_there = 0;
        char *match_id_s = ks_release(&match_id);
        khiter_t iter = kh_put(c2c, tbl->rg_trans, match_id_s, &in_there);
        if (in_there < 0) { perror("out of memory"); exit(-1); }
        if (in_there == 0) {
            // ID repeated in translate's header: the table keeps its first key,
            // so release the duplicate key and the value about to be replaced
            free(match_id_s);
            free(kh_value(tbl->rg_trans, iter));
        }
        char *transformed_id_s = ks_release(&transformed_id);
        kh_value(tbl->rg_trans,iter) = transformed_id_s;
        // take matched line and replace ID with transformed_id
        kstring_t transformed_line = { 0, 0, NULL };
        if (transformed_equals_match) {
            kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
        } else {
            kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id_s, &transformed_line);
            kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        }

        if (!(transformed_equals_match && merge_rg)) {
            // append line to linked list for PG processing
            char** ln = kl_pushp(hdrln, rg_list);
            *ln = ks_release(&transformed_line);  // Give away to linked list
        }
        else free(transformed_line.s);

        text += matches[0].rm_eo; // next!
    }
    regfree(&rg_id);

    // Do same for PG id's
    regex_t pg_id;
    regcomp(&pg_id, "^@PG.*\tID:([!-)+-<>-~][ !-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    text = translate->text;
    klist_t(hdrln) *pg_list = kl_init(hdrln);
    while(1) { // foreach pg id in translate's header
        if (regexec(&pg_id, text, 2, matches, 0) != 0) break;
        kstring_t match_id = { 0, 0, NULL };
        kputsn(text+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &match_id);

        // is our matched ID in our output list already
        regex_t pg_id_search;
        kstring_t pg_regex = { 0, 0, NULL };
        ksprintf(&pg_regex, "^@PG.*\tID:%s(\t.*$|$)", match_id.s);
        regcomp(&pg_id_search, pg_regex.s, REG_EXTENDED|REG_NEWLINE|REG_NOSUB);
        free(pg_regex.s);
        kstring_t transformed_id = { 0, 0, NULL };
        bool transformed_equals_match;
        if (regexec(&pg_id_search, out->text, 0, NULL, 0) != 0 || merge_pg) {
            // Not in there so can add it as 1-1 mapping
            kputs(match_id.s, &transformed_id);
            transformed_equals_match = true;
        } else {
            // It's in there so we need to transform it by appending random number to id
            ksprintf(&transformed_id, "%s-%0lX", match_id.s, lrand48());
            transformed_equals_match = false;
        }
        regfree(&pg_id_search);

        // Insert it into our translation map
        int in_there = 0;
        char *match_id_s = ks_release(&match_id);
        khiter_t iter = kh_put(c2c, tbl->pg_trans, match_id_s, &in_there);
        if (in_there < 0) { perror("out of memory"); exit(-1); }
        if (in_there == 0) {
            // duplicate @PG ID: see the @RG loop for the ownership reasoning
            free(match_id_s);
            free(kh_value(tbl->pg_trans, iter));
        }
        char *transformed_id_s = ks_release(&transformed_id);
        kh_value(tbl->pg_trans,iter) = transformed_id_s;
        // take matched line and replace ID with transformed_id
        kstring_t transformed_line = { 0, 0, NULL };
        if (transformed_equals_match) {
            kputsn(text+matches[0].rm_so, matches[0].rm_eo-matches[0].rm_so, &transformed_line);
        } else {
            kputsn(text+matches[0].rm_so, matches[1].rm_so-matches[0].rm_so, &transformed_line);
            kputs(transformed_id_s, &transformed_line);
            kputsn(text+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
        }

        if (!(transformed_equals_match && merge_pg)) {
            // append line to linked list for PP processing
            char** ln = kl_pushp(hdrln, pg_list);
            *ln = ks_release(&transformed_line);  // Give away to linked list
        }
        else free(transformed_line.s);
        text += matches[0].rm_eo; // next!
    }
    regfree(&pg_id);

    // PP tags are translated in a second pass: they may reference @PG lines
    // that appear later in the header, so the complete tbl->pg_trans is needed.
    regex_t pg_pp;
    regcomp(&pg_pp, "^@PG.*\tPP:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    kliter_t(hdrln) *iter = kl_begin(pg_list);
    while (iter != kl_end(pg_list)) {
        char* data = kl_val(iter);

        kstring_t transformed_line = { 0, 0, NULL };
        // Find PP tag
        if (regexec(&pg_pp, data, 2, matches, 0) == 0) {
            // Lookup in hash table
            kstring_t pp_id = { 0, 0, NULL };
            kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pp_id);

            khiter_t k = kh_get(c2c, tbl->pg_trans, pp_id.s);
            if (k != kh_end(tbl->pg_trans)) {
                char* transformed_id = kh_value(tbl->pg_trans,k);
                // Replace
                kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
                kputs(transformed_id, &transformed_line);
                kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
            } else {
                // Dangling PP reference: there is nothing to translate to, so
                // keep the line as it is instead of reading past the table
                fprintf(pysamerr, "[trans_tbl_init] @PG PP tag (%s) refers to an ID not present in the header; left untranslated.\n", pp_id.s);
                kputs(data, &transformed_line);
            }
            free(pp_id.s);
        } else { kputs(data, &transformed_line); }
        // Produce our output line and append it to out_text
        kputc('\n', &out_text);
        kputsn(transformed_line.s, transformed_line.l, &out_text);

        free(transformed_line.s);
        free(data);
        iter = kl_next(iter);
    }
    regfree(&pg_pp);

    // Need to also translate @RG PG's on the fly too
    regex_t rg_pg;
    regcomp(&rg_pg, "^@RG.*\tPG:([!-)+-<>-~][!-~]*)(\t.*$|$)", REG_EXTENDED|REG_NEWLINE);
    kliter_t(hdrln) *rg_iter = kl_begin(rg_list);
    while (rg_iter != kl_end(rg_list)) {
        char* data = kl_val(rg_iter);

        kstring_t transformed_line = { 0, 0, NULL };
        // Find PG tag
        if (regexec(&rg_pg, data, 2, matches, 0) == 0) {
            // Lookup in hash table
            kstring_t pg_id = { 0, 0, NULL };
            kputsn(data+matches[1].rm_so, matches[1].rm_eo-matches[1].rm_so, &pg_id);

            khiter_t k = kh_get(c2c, tbl->pg_trans, pg_id.s);
            if (k != kh_end(tbl->pg_trans)) {
                char* transformed_id = kh_value(tbl->pg_trans,k);
                // Replace
                kputsn(data, matches[1].rm_so-matches[0].rm_so, &transformed_line);
                kputs(transformed_id, &transformed_line);
                kputsn(data+matches[1].rm_eo, matches[0].rm_eo-matches[1].rm_eo, &transformed_line);
            } else {
                // @RG names a program that has no @PG line: keep the line unchanged
                fprintf(pysamerr, "[trans_tbl_init] @RG PG tag (%s) refers to an ID not present in the header; left untranslated.\n", pg_id.s);
                kputs(data, &transformed_line);
            }
            free(pg_id.s);
        } else { kputs(data, &transformed_line); }
        // Produce our output line and append it to out_text
        kputc('\n', &out_text);
        kputsn(transformed_line.s, transformed_line.l, &out_text);

        free(transformed_line.s);
        free(data);
        rg_iter = kl_next(rg_iter);
    }

    regfree(&rg_pg);
    kl_destroy(hdrln,pg_list);
    kl_destroy(hdrln,rg_list);
    free(matches);

    // Add trailing \n and write back to header
    free(out->text);
    kputc('\n', &out_text);
    out->l_text = out_text.l;
    out->text = ks_release(&out_text);
}
have_phrase = true; break; } } if (!have_phrase) { add_normalized_strings_token(strings, str, token, options); } string_tree_finalize_token(tree); } } void expand_alternative(cstring_array *strings, khash_t(str_set) *unique_strings, char *str, normalize_options_t options) { size_t len = strlen(str); token_array *tokens = tokenize_keep_whitespace(str); string_tree_t *token_tree = string_tree_new_size(len); add_normalized_strings_tokenized(token_tree, str, tokens, options); string_tree_iterator_t *tokenized_iter = string_tree_iterator_new(token_tree); string_tree_iterator_t *iter; char_array *temp_string = char_array_new_size(len); char *token; char *lang;
/* Create an empty si32 hash table and hand back whatever kh_init returns. */
khash_t (si32) *
new_si32_ht (void)
{
  return kh_init (si32);
}
static int output_stats_and_reset(struct bgpcorsaro_pfxmonitor_state_t *state, uint32_t interval_start) { khiter_t k; khiter_t p; khiter_t a; int khret; uint8_t pfx_visible; uint32_t unique_pfxs = 0; khash_t(peer_asn_map) * pam; /* origin_asn -> num peer ASns*/ khash_t(asn_count_map) *asn_np = NULL; if ((asn_np = kh_init(asn_count_map)) == NULL) { return -1; } /* for each prefix go through all peers */ for (k = kh_begin(state->pfx_info); k != kh_end(state->pfx_info); ++k) { if (kh_exist(state->pfx_info, k) == 0) { continue; } /* reset counters */ kh_clear(asn_count_map, asn_np); /* get peer-asn map for this prefix */ pam = kh_value(state->pfx_info, k); /* save the origin asn visibility (i.e. how many peers' ASns * observe such information */ /* for each peer, go through all origins */ for (p = kh_begin(pam); p != kh_end(pam); ++p) { if (kh_exist(pam, p) == 0) { continue; } /* increment the counter for this ASN */ if ((a = kh_get(asn_count_map, asn_np, kh_value(pam, p))) == kh_end(asn_np)) { a = kh_put(asn_count_map, asn_np, kh_value(pam, p), &khret); kh_value(asn_np, a) = 1; } else { kh_value(asn_np, a)++; } } /* now asn_np has a complete count of the number of peers' ASns that observed each origin ASN */ /* count the prefix and origins if their visibility * is above the threshold */ pfx_visible = 0; for (a = kh_begin(asn_np); a != kh_end(asn_np); ++a) { if (kh_exist(asn_np, a) == 0) { continue; } /* the information is accounted only if it is * consistent on at least threshold peers' ASns */ if (kh_value(asn_np, a) >= state->peer_asns_th) { pfx_visible = 1; bgpstream_id_set_insert(state->unique_origins, kh_key(asn_np, a)); } } /* updating counters */ unique_pfxs += pfx_visible; } DUMP_METRIC(unique_pfxs, state->interval_start, "%s.%s.%s.%s", state->metric_prefix, PLUGIN_NAME, state->ip_space_name, "prefixes_cnt"); DUMP_METRIC(bgpstream_id_set_size(state->unique_origins), state->interval_start, "%s.%s.%s.%s", state->metric_prefix, PLUGIN_NAME, state->ip_space_name, 
"origin_ASns_cnt"); bgpstream_id_set_clear(state->unique_origins); kh_destroy(asn_count_map, asn_np); return 0; }
ii->ap_prior = .01 * (n_ap + .01) / tot; if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; fprintf(stderr, "[infer_isize] %s: qu(%d, %d, %d)", rg?rg:"(null)", p25, p50, p75); if (isnan(ii->std) || p75 > MAX_ISIZE) { ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = -1.0; fprintf(stderr, " -- not useable\n"); return -1; } fprintf(stderr, " bound(%d,%d), num/avg/std/kur/skw %d/%.3lf/%.3lf/%.3lf/%.3lf, ap %.2e, max %d, %.2lf sigma\n", ii->low, ii->high, n, ii->avg, ii->std, skewness, kurtosis, ii->ap_prior, ii->high_bayesian, y); return 0; } void improve_isize_est(khash_t(isize_infos) *iinfos, bam_pair_t *p, double ap_prior, int64_t L) { bwa_seq_t *s = p->bwa_seq ; if( p->kind<1 || s[0].mapQ < 20 ) return ; if( p->kind>1 && s[1].mapQ < 20 ) return ; int len = p->kind==1 ? s[0].len : // single read: isize is length (trimming/merging is assumed) s[0].pos < s[1].pos ? s[1].pos + s[1].len - s[0].pos : s[0].pos + s[0].len - s[1].pos; if( len < 0 || len >= MAX_ISIZE ) return ; int ret = 0 ; const char *rg = bam_get_rg(p->bam_rec); khiter_t it = kh_get(isize_infos, iinfos, rg) ; isize_info_t *ii = &kh_value(iinfos, it) ; if( it==kh_end(iinfos) ) {
this_coord = unclipped_end(bam); orientation = O_RR; } else { this_coord = unclipped_start(bam); orientation = O_FF; } key->single = 1; key->this_ref = this_ref; key->this_coord = this_coord; key->orientation = orientation; } /* Add the duplicate name to a hash if it does not exist. */ static int add_duplicate(khash_t(duplicates) *d_hash, bam1_t *dupe) { khiter_t d; int ret; d = kh_get(duplicates, d_hash, bam_get_qname(dupe)); if (d == kh_end(d_hash)) { d = kh_put(duplicates, d_hash, strdup(bam_get_qname(dupe)), &ret); if (ret > 0) { kh_value(d_hash, d) = 1; } else if (ret == 0) { kh_value(d_hash, d)++; } else { fprintf(stderr, "[markdup] error: unable to store supplementary duplicates.\n"); return 1;
size_t sen_render_flush(int clear_buff) { // gl_check_error(); //_logfi("1"); blend_group_t* bg; khint_t i,k,j; size_t total = 0; khash_t(hmsp)* tgs; camera_t* cam = sen_camera(); tex_group_t* tg; khash_t(hmsp)* sgs; shader_group_t* sg; vector_clear(zsorter); for (k = kh_begin(g_bgs); k != kh_end(g_bgs); ++k) { if (!kh_exist(g_bgs,k)) continue; bg = kh_val(g_bgs, k); if (bg->num == 0) { kh_del(hmip,g_bgs,k); continue; } tgs = bg->tgs; // set_blending( (blend_func) (kh_key(g_bgs, k)) ); for (i = kh_begin(tgs); i != kh_end(tgs); ++i) { if (!kh_exist(tgs,i)) continue; tg = kh_val(tgs, i); if (tg->num == 0) { kh_del(hmsp,tgs,i); continue; } /* if (tg->tex) sen_texture_bind(tg->tex); else if (tg->font) sen_font_bind(tg->font); */ sgs = tg->sgs; for (j = kh_begin(sgs); j != kh_end(sgs); ++j) { if (!kh_exist(sgs,j)) continue; sg = kh_val(sgs, j); if (sg->num == 0 || !sg->buff) { kh_del(hmsp,sgs,j); continue; } if (sg->buff) { /* sen_shader_use(sg->program); { if (tg->tex || tg->font) sen_uniform_1iN(sg->program, "u_tex0", 0); sen_uniform_m4fN(sg->program, "u_mvp", cam->view_proj.data); vertex_buffer_render( sg->buff, GL_TRIANGLES); total+=vertex_buffer_size(sg->buff); if (clear_buff) vertex_buffer_clear( sg->buff ); //sen_shader_use(NULL); }*/ vector_push_back( zsorter, &sg ); } sg->num = 0; } tg->num = 0; } bg->num = 0; } if (zsorter->size > 0) vector_sort(zsorter, zcmp); for (j = 0; j < zsorter->size; j++) { shader_group_t* sg = *(shader_group_t**)vector_get(zsorter, j); // _logfi("%s %d",sg->name, sg->z); set_blending( (blend_func) (sg->bg->key) ); if (sg->tg->tex) sen_texture_bind(sg->tg->tex); else if (sg->tg->font) sen_font_bind(sg->tg->font); sen_shader_use(sg->program); { if (sg->tg->tex || sg->tg->font) sen_uniform_1iN(sg->program, "u_tex0", 0); sen_uniform_m4fN(sg->program, "u_mvp", sg->z > 9500 ? 
cam->proj.data : cam->view_proj.data); vertex_buffer_render( sg->buff, GL_TRIANGLES); total+=vertex_buffer_size(sg->buff); if (clear_buff) vertex_buffer_clear( sg->buff ); //sen_shader_use(NULL); } } // _logfi("-------------------------------------------------"); return total; }
bam1_t *b; } elem_t, *elem_p; #define __free_elem(p) bam_destroy1((p)->data.b) KLIST_INIT(q, elem_t, __free_elem) typedef klist_t(q) queue_t; KHASH_MAP_INIT_INT(best, elem_p) typedef khash_t(best) besthash_t; typedef struct { uint64_t n_checked, n_removed; besthash_t *left, *rght; } lib_aux_t; KHASH_MAP_INIT_STR(lib, lib_aux_t) static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib) { khint_t k = kh_get(lib, aux, lib); if (k == kh_end(aux)) { int ret; char *p = strdup(lib); lib_aux_t *q; k = kh_put(lib, aux, p, &ret); q = &kh_val(aux, k); q->left = kh_init(best); q->rght = kh_init(best); q->n_checked = q->n_removed = 0; return q; } else return &kh_val(aux, k); }
#include "haywire.h"
#include "hw_string.h"
#include "khash.h"
#include "http_response_cache.h"
#include "http_server.h"

#define CRLF "\r\n"
KHASH_MAP_INIT_STR(string_hashmap, hw_string*)

static uv_timer_t cache_invalidation_timer;
static uv_key_t thread_cache_key;

void initialize_http_request_cache();
void free_http_request_cache();
void http_request_cache_timer(uv_timer_t* handle, int status);
void create_cached_http_request(khash_t(string_hashmap)* http_request_cache, char* http_status);
void set_cached_request(khash_t(string_hashmap)* http_request_cache, char* http_status, hw_string* cache_entry);
hw_string* get_cached_request(char* http_status);

/* Create the key under which each thread's response cache is stored. */
void initialize_http_request_cache()
{
    uv_key_create(&thread_cache_key);
}

/*
 * Arm a repeating timer on `loop` that calls http_request_cache_timer every
 * 500 ms, starting 500 ms from now.  The timer is unref'd, so it does not by
 * itself keep the loop alive.  `handle` is not used.
 *
 * The timer is heap-allocated per call.  If the allocation fails, no timer is
 * armed and the function returns without touching the loop.
 */
void http_request_cache_configure_listener(uv_loop_t* loop, uv_async_t* handle)
{
    /* named `timer` so it no longer shadows the file-scope cache_invalidation_timer */
    uv_timer_t* timer = malloc(sizeof(*timer));

    (void) handle;
    if (timer == NULL)
    {
        return;
    }

    uv_timer_init(loop, timer);
    uv_timer_start(timer, http_request_cache_timer, 500, 500);
    uv_unref((uv_handle_t*) timer);
}
return h; }

/* Create an empty su64 hash table and return what kh_init yields.
 * NOTE(review): the earlier comment described this as "int key - uint64_t
 * value"; the si32 table below is documented as string-keyed, so the `s`
 * prefix may mean string keys here as well. Confirm against the su64
 * declaration. */
static khash_t (su64) *
new_su64_ht (void)
{
  return kh_init (su64);
}

/* Release a string key - int value hash: every key still stored in the
 * table is freed, then the table itself is destroyed. A NULL table is
 * ignored. */
static void
des_si32_free (khash_t (si32) * hash)
{
  khint_t slot;

  if (hash == NULL)
    return;

  for (slot = kh_begin (hash); slot != kh_end (hash); ++slot) {
    if (!kh_exist (hash, slot))
      continue;
    free ((char *) kh_key (hash, slot));
  }

  kh_destroy (si32, hash);
}

/* Destroys both the hash structure and its string values */
int split_pe(int argc, char *argv[], char *progname) { int c, bc_len = -1, ret, i, j, bc_idx, only_count = 0; unsigned num_mismatches = DEFAULT_NUM_MISMATCHES, num_spacer_bases = DEFAULT_NUM_SPACER_BASES, dna_alpha_len = strlen(DNA_ALPHA), num_undetermined = 0; char *out_prefix = NULL, *fn, **sptr, bc_id[1024], bc_seq[1024], bc_seq_cpy[1024]; // hello, buffer overflow clock_t t = clock(); BcRec bc; ArrayBcRec bcs; FILE *fp; kseq_t *seq1, *seq2; khash_t(str) *h = kh_init(str); khint_t k, k2; gzFile *fp1, *fp2; ARRAY_INIT(&bcs, BcRec, 1000); while ((c = getopt(argc, argv, "m:s:o:c")) >= 0) { switch (c) { case 'm': if (sscanf(optarg, "%u", &num_mismatches) != 1) { fprintf(stderr, "Error: option -m expects unsigned int\n"); return -1; } break; case 's': if (sscanf(optarg, "%u", &num_spacer_bases) != 1) { fprintf(stderr, "Error: option -s expects unsigned int\n"); return -1; } break; case 'o': out_prefix = strdup(optarg); break; case 'c': only_count = 1; break; } } if (optind + 3 != argc) { print_pe_usage(progname); return -1; } if (num_mismatches != 0 && num_mismatches != 1) { fprintf(stderr, "Error: argument -m has to be 0 or 1\n"); return -1; } if (out_prefix == NULL) { out_prefix = strdup(DEFAULT_OUTPUT_PREFIX); } for (sptr = argv+optind; sptr-argv<argc; sptr++) { if (access(*sptr, F_OK) == -1) { fprintf(stderr, "Error: file %s does not exist\n", *sptr); return -1; } } fprintf(stderr, "[barcode file: %s]\n", argv[optind]); fprintf(stderr, "[fastq file1: %s]\n", argv[optind+1]); fprintf(stderr, "[fastq file2: %s]\n", argv[optind+2]); fprintf(stderr, "[number of mismatches allowed: %u]\n", num_mismatches); fprintf(stderr, "[number of spacer bases: %u]\n", num_spacer_bases); fprintf(stderr, "[output prefix: %s]\n", out_prefix); fprintf(stderr, "[only count: %s]\n", only_count ? 
"true" : "false"); /* read barcode file */ if ((fp = fopen(argv[optind], "r")) == NULL) { fprintf(stderr, "Error: cannot open barcode file %s\n", argv[optind]); return -1; } while (fscanf(fp, "%s %s", bc_id, bc_seq) == 2) { bc_len = strlen(bc_seq); bc.id = strdup(bc_id); bc.seq = strdup(bc_seq); bc.num_found = 0; if (!only_count) { fn = (char*)calloc(strlen(out_prefix) + 3 + strlen(bc_id) + 6 + 1, sizeof(char)); strcpy(fn, out_prefix); strcat(fn, "_1_"); strcat(fn, bc_id); strcat(fn, ".fq.gz"); bc.fp1 = gzopen(fn, "w"); fn[strlen(out_prefix)+1] = '2'; bc.fp2 = gzopen(fn, "w"); free(fn); } else { bc.fp1 = NULL; bc.fp2 = NULL; } ARRAY_PUSH(&bcs, BcRec, bc); k = kh_put(str, h, strdup(bc_seq), &ret); if (num_mismatches == 0) { kh_val(h, k) = bcs.nextfree - 1; //printf("setting %s to %lu (%s %s)\n", bc_seq, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id); } else { for (i=0; i<strlen(bc_seq); i++) { strcpy(bc_seq_cpy, bc_seq); for (j=0; j<dna_alpha_len; j++) { bc_seq_cpy[i] = DNA_ALPHA[j]; k = kh_put(str, h, strdup(bc_seq_cpy), &ret); kh_val(h, k) = bcs.nextfree - 1; //printf("setting %s to %lu (%s %s)\n", bc_seq_cpy, bcs.nextfree - 1, bcs.elems[bcs.nextfree - 1].seq, bcs.elems[bcs.nextfree - 1].id); } } } } fclose(fp); if (bc_len == -1) { fprintf(stderr, "Error: could not find any barcodes in file %s\n", argv[optind]); return -1; } fp1 = gzopen(argv[optind+1], "r"); seq1 = kseq_init(fp1); fp2 = gzopen(argv[optind+2], "r"); seq2 = kseq_init(fp2); while (kseq_read(seq1) >= 0) { strncpy(bc_seq, seq1->seq.s, bc_len); k = kh_get(str, h, bc_seq); kseq_read(seq2); strncpy(bc_seq, seq2->seq.s, bc_len); k2 = kh_get(str, h, bc_seq); if (k != kh_end(h) || k2 != kh_end(h)) { bc_idx = k2 != kh_end(h) ? 
kh_val(h, k2) : kh_val(h, k); if (!only_count) { gzprintf(bcs.elems[bc_idx].fp1, "@%s %s\n%s\n+\n%s\n" , seq1->name.s , seq1->comment.s , seq1->seq.s+bc_len+num_spacer_bases , seq1->qual.s+bc_len+num_spacer_bases); gzprintf(bcs.elems[bc_idx].fp2, "@%s %s\n%s\n+\n%s\n" , seq2->name.s , seq2->comment.s , seq2->seq.s+bc_len+num_spacer_bases , seq2->qual.s+bc_len+num_spacer_bases); } bcs.elems[bc_idx].num_found += 2; } else { num_undetermined += 2; } } gzclose(fp1); gzclose(fp2); kseq_destroy(seq1); kseq_destroy(seq2); for (i=0; i<bcs.nextfree; i++) { printf("%s\t%s\t%u\n", bcs.elems[i].id, bcs.elems[i].seq, bcs.elems[i].num_found); if (!only_count) { gzclose(bcs.elems[i].fp1); gzclose(bcs.elems[i].fp2); } } printf("UNDETERMINED\tNONE\t%u\n", num_undetermined); ARRAY_FREE(&bcs); kh_destroy(str, h); fprintf(stderr, "[CPU time: %.2f sec]\n", (float)(clock() - t) / CLOCKS_PER_SEC); return 0; }
/* Create an empty igsl hash table and hand back whatever kh_init returns. */
khash_t (igsl) *
new_igsl_ht (void)
{
  return kh_init (igsl);
}
typedef struct { int32_t n, m; uint64_t *offset; } bam_lidx_t; KHASH_MAP_INIT_INT(i, bam_binlist_t) struct __bam_index_t { int32_t n; uint64_t n_no_coor; // unmapped reads without coordinate khash_t(i) **index; bam_lidx_t *index2; }; // requirement: len <= LEN_MASK static inline void insert_offset(khash_t(i) *h, int bin, uint64_t beg, uint64_t end) { khint_t k; bam_binlist_t *l; int ret; k = kh_put(i, h, bin, &ret); l = &kh_value(h, k); if (ret) { // not present l->m = 1; l->n = 0; l->list = (pair64_t*)calloc(l->m, 16); } if (l->n == l->m) { l->m <<= 1; l->list = (pair64_t*)realloc(l->list, l->m * 16); } l->list[l->n].u = beg; l->list[l->n++].v = end;
/* Create an empty is32 hash table and hand back whatever kh_init returns. */
khash_t (is32) *
new_is32_ht (void)
{
  return kh_init (is32);
}
// Definitions/prototypes/initializations for data structures, functions, etc. //------------------------------------------------------------------------------ #define MAX_LINE_LENGTH 8192 #define MAX_ID_LENGTH 1024 KHASH_MAP_INIT_STR(m32, unsigned) typedef struct { char delim; const char *outfile; FILE *outstream; unsigned numfiles; } SmrOptions; void smr_init_options(SmrOptions *options); khash_t(m32) *smr_collect_molids(SmrOptions *options, khash_t(m32) **maps); khash_t(m32) *smr_load_file(const char *filename); void smr_parse_options(SmrOptions *options, int argc, char **argv); void smr_print_matrix(SmrOptions *options, khash_t(m32) **maps); void smr_print_usage(FILE *outstream); void smr_terminate(SmrOptions *options, khash_t(m32) **maps); //------------------------------------------------------------------------------ // Main method //------------------------------------------------------------------------------ int main(int argc, char **argv) { SmrOptions options; smr_init_options(&options); smr_parse_options(&options, argc, argv);
/* Create an empty iu64 hash table and hand back whatever kh_init returns. */
khash_t (iu64) *
new_iu64_ht (void)
{
  return kh_init (iu64);
}
#else #define mrb_usascii_str_new2 mrb_str_new_cstr #endif KHASH_MAP_INIT_INT(mt, struct RProc*); KHASH_MAP_INIT_INT(iv, mrb_value); typedef struct fc_result { mrb_sym name; struct RClass * klass; mrb_value path; struct RClass * track; struct fc_result *prev; } fcresult_t; int kiv_lookup(khash_t(iv) *table, mrb_sym key, mrb_value *value); extern struct kh_iv *mrb_class_tbl; void mrb_gc_mark_mt(mrb_state *mrb, struct RClass *c) { khiter_t k; khash_t(mt) *h = c->mt; if (!h) return; for (k = kh_begin(h); k != kh_end(h); k++) { if (kh_exist(h, k)){ struct RProc *m = kh_value(h, k); if (m) { paint_black(m); }
/** * See Copyright Notice in picrin.h */ #include "picrin.h" KHASH_DECLARE(m, void *, int) KHASH_DEFINE2(m, void *, int, 0, kh_ptr_hash_func, kh_ptr_hash_equal) static bool internal_equal_p(pic_state *pic, pic_value x, pic_value y, size_t depth, khash_t(m) *h) { pic_value local = pic_nil_value(); size_t c = 0; if (depth > 10) { if (depth > 200) { pic_errorf(pic, "Stack overflow in equal\n"); } if (pic_pair_p(x) || pic_vec_p(x)) { int ret; kh_put(m, h, pic_obj_ptr(x), &ret); if (ret != 0) { return true; /* `x' was seen already. */ } } } LOOP: if (pic_eqv_p(x, y)) {