/** * @return 返回NULL表示到达末尾. * @note 可能返回英文单词. */ char * get_ch_word() { create_chunks(); if (g_nchunks > 1) choose_best_chunk(mm_cmp); if (g_nchunks > 1) choose_best_chunk(lawl_cmp); if (g_nchunks > 1) choose_best_chunk(svwl_cmp); if (g_nchunks > 1) choose_best_chunk(lsdmfocw_cmp); if (g_nchunks == 0) { g_pos += 2; /* no chinese words found, call next_token() recursively */ return next_token(); } else { g_pos += strlen(chunks[0].words[0]); return chunks[0].words[0]; } }
/**
 * Extract the next word starting at m_pos.
 *
 * Candidate chunks are produced by create_chunks() and then passed through
 * the four filters in a fixed order; a filter only runs while more than one
 * candidate is left. The first word of the first remaining chunk is returned
 * and m_pos is advanced by that word's byte length.
 *
 * @param len  Not used by this implementation.
 * @return     Token over the selected word, or Token(NULL, 0) when
 *             create_chunks() yields no candidates.
 */
Token Algorithm::get_cjk_word(int len)
{
    vector<Chunk> candidates = create_chunks();

    if (candidates.size() > 1)
        mm_filter(candidates);
    if (candidates.size() > 1)
        lawl_filter(candidates);
    if (candidates.size() > 1)
        svwl_filter(candidates);
    if (candidates.size() > 1)
        lsdmfocw_filter(candidates);

    // Nothing to segment at this position.
    if (candidates.empty())
        return Token(NULL, 0);

    Token result(m_text + m_pos, candidates[0].words[0]->nbytes);
    m_pos += candidates[0].words[0]->nbytes;
    return result;
}
/*
 * Walk the URL list starting at download_ptr and hand out the next element
 * that can be downloaded.
 *
 * Only elements that are not yet assigned and whose err_code is
 * ERR_CODE_NOT_ASSIGNED are considered. On success the element is returned
 * and *uri is set; when nothing is available NULL is returned and *uri is
 * set to NULL.
 *
 * NOTE(review): chunk, resource and header, used below under ENABLE_METALINK,
 * are not parameters of the signature visible here; presumably the other arm
 * of the preprocessor conditional closed by the #endif below declares them.
 * Confirm against the full file.
 */
url_list_t *search_next_url(UriUriA **uri)
#endif
{
	url_list_t *elem;
	/* Cleared as soon as a still-pending element is skipped, so that
	 * download_ptr is not moved past it. */
	int update_pointer = 1;

	for (elem = download_ptr; elem; elem = elem->next) {
		if (!elem->assigned && elem->err_code == ERR_CODE_NOT_ASSIGNED) {
			if (elem->uri) {
				/* Plain URL entry */
				if (is_uri_compatible(elem->uri, -1)) {
					elem->assigned = 1;
					*uri = elem->uri;
#ifdef ENABLE_METALINK
					*resource = NULL;
					*chunk = NULL;
					*header = 0;
#endif
					if (update_pointer)
						download_ptr = elem->next;
					return elem;
				}
				else
					update_pointer = 0;
			}
#ifdef ENABLE_METALINK
			else if (elem->metalink_uri) {
				/* Metalink entry */
				if (is_valid_metalink(elem->metalink_uri->file)) {
					/* Chunks are created only once: when none exist yet
					 * and the size is non-negative. */
					if (!elem->metalink_uri->chunk && elem->metalink_uri->size >= 0) {
						char *newfilename = NULL;

						if (create_chunks(elem->metalink_uri) != MULK_RET_OK) {
							elem->err_code = METALINK_RES_INVALID_METALINK;
							continue;
						}
#ifdef ENABLE_CHECKSUM
						/* load a resume file if present */
						if (init_chunks(elem->metalink_uri, &newfilename) == MULK_RET_OK) {
							/* The element is marked METALINK_RES_OK and skipped. */
							elem->filename = newfilename;
							elem->err_code = METALINK_RES_OK;
							continue;
						}
#endif /* ENABLE_CHECKSUM */
						string_free(newfilename);
					}

					*uri = find_next_url(elem->metalink_uri, chunk, resource, header);
					if ((*chunk || *header) && *uri) {
						/* Unlike the plain URL case, download_ptr stays on
						 * this element rather than moving to elem->next. */
						if (update_pointer)
							download_ptr = elem;
						return elem;
					}
					update_pointer = 0;
				}
				else
					elem->err_code = METALINK_RES_INVALID_METALINK;
			}
#endif /* ENABLE_METALINK */
			else
				elem->err_code = ERR_CODE_EMPTY_URL;
		}
	}

	/* No pending element was skipped: nothing is left from download_ptr on. */
	if (update_pointer)
		download_ptr = NULL;

	*uri = NULL;
#ifdef ENABLE_METALINK
	*chunk = NULL;
	*resource = NULL;
	*header = 0;
#endif

	return NULL;
}
/**
 * Run the "effect" tool over a VCF file.
 *
 * Three OpenMP sections cooperate:
 *   - a reader that parses the VCF file into batches,
 *   - a processor that filters each batch, splits the passed records into
 *     chunks and invokes the web services for every chunk (retrying up to
 *     three times on error),
 *   - a writer that drains output_list into the output files.
 * After the sections finish, the summary, genes-with-variants and result
 * files are written and the job status file is set to 100.
 *
 * @param urls                 urls[0] is used for the effect web service;
 *                             urls[1] and urls[2] for the SNP and mutation
 *                             phenotype services
 * @param shared_options_data  Options shared by all tools (input/output
 *                             paths, batch sizes, threads, filter chain);
 *                             entries_per_thread is overwritten here
 * @param options_data         Effect-specific options (excludes,
 *                             no_phenotypes)
 * @return The value left in ret_code: a non-zero code from an early
 *         initialization failure, otherwise the result of vcf_read()
 */
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;

    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }

    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);

    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }

    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }

    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }

    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);

    // Create job.status file.
    // sizeof("/job.status") accounts for the 11 characters of the suffix plus
    // the terminating NUL, so the formatted path always fits.
    char job_status_filename[output_directory_len + sizeof("/job.status")];
    snprintf(job_status_filename, sizeof(job_status_filename), "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());

            start = omp_get_wtime();

            // Batches are sized in bytes when batch_bytes is positive, in lines otherwise
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }

#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);

            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);

            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;

            // Filename structure outdir/vcfname.errors
            // The +1 leaves room for the terminator when the extracted name is
            // as long as the whole path (a path without directory components).
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename) + 1, sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            // 9 = strlen("/") + strlen(".errors") + terminating NUL
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            free(prefix_filename);

            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY,
                            ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);

            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;

            start = omp_get_wtime();

            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }

                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }

                    LOG_DEBUG("VCF header written\n");

                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }

                LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                           i, omp_get_thread_num(),
                           batch->records->size, batch->records->capacity);

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);

                    do {
                        // OpenMP: Launch a thread for each range
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);

                            // On the first attempt every service is called; on
                            // retries only the ones whose last result was an error
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }

                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }

                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }

                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);

                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }

                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));

                    // Release the chunk descriptors on every exit path of the
                    // retry loop, including the one where all retries failed
                    free(chunk_starts);
                    free(chunk_sizes);
                }

                // If the maximum number of reconnections was reached still with errors,
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                #pragma omp critical
                    {
                        // The .errors file may have failed to open
                        if (non_processed_file != NULL) {
                            write_vcf_batch(batch, non_processed_file);
                        }
                    }
                }

                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);

                // Free batch and its contents
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }

            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);

            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }

#pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;

            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");

            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;

                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }

                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }

                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }

                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }

                free(line);
                list_item_free(item);
            }
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);

    update_job_status_file(100, job_status);
    close_job_status_file(job_status);

    return ret_code;
}
int main(int ac, char **av) { char *file; struct btrfs_root *root; struct btrfs_trans_handle *trans; char *label = NULL; char *first_file; u64 block_count = 0; u64 dev_block_count = 0; u64 blocks[7]; u64 alloc_start = 0; u64 metadata_profile = 0; u64 data_profile = 0; u32 leafsize = sysconf(_SC_PAGESIZE); u32 sectorsize = 4096; u32 nodesize = leafsize; u32 stripesize = 4096; int zero_end = 1; int option_index = 0; int fd; int ret; int i; int mixed = 0; int data_profile_opt = 0; int metadata_profile_opt = 0; int discard = 1; int ssd = 0; int force_overwrite = 0; char *source_dir = NULL; int source_dir_set = 0; u64 num_of_meta_chunks = 0; u64 size_of_data = 0; u64 source_dir_size = 0; int dev_cnt = 0; int saved_optind; char estr[100]; u64 features = 0; while(1) { int c; c = getopt_long(ac, av, "A:b:fl:n:s:m:d:L:O:r:VMK", long_options, &option_index); if (c < 0) break; switch(c) { case 'A': alloc_start = parse_size(optarg); break; case 'f': force_overwrite = 1; break; case 'd': data_profile = parse_profile(optarg); data_profile_opt = 1; break; case 'l': case 'n': nodesize = parse_size(optarg); leafsize = parse_size(optarg); break; case 'L': label = parse_label(optarg); break; case 'm': metadata_profile = parse_profile(optarg); metadata_profile_opt = 1; break; case 'M': mixed = 1; break; case 'O': { char *orig = strdup(optarg); char *tmp = orig; tmp = parse_fs_features(tmp, &features); if (tmp) { fprintf(stderr, "Unrecognized filesystem feature '%s'\n", tmp); free(orig); exit(1); } free(orig); if (features & BTRFS_FEATURE_LIST_ALL) { list_all_fs_features(); exit(0); } break; } case 's': sectorsize = parse_size(optarg); break; case 'b': block_count = parse_size(optarg); if (block_count <= 1024*1024*1024) { printf("SMALL VOLUME: forcing mixed " "metadata/data groups\n"); mixed = 1; } zero_end = 0; break; case 'V': print_version(); break; case 'r': source_dir = optarg; source_dir_set = 1; break; case 'K': discard = 0; break; default: print_usage(); } } sectorsize = 
max(sectorsize, (u32)sysconf(_SC_PAGESIZE)); if (check_leaf_or_node_size(leafsize, sectorsize)) exit(1); if (check_leaf_or_node_size(nodesize, sectorsize)) exit(1); saved_optind = optind; dev_cnt = ac - optind; if (dev_cnt == 0) print_usage(); if (source_dir_set && dev_cnt > 1) { fprintf(stderr, "The -r option is limited to a single device\n"); exit(1); } while (dev_cnt-- > 0) { file = av[optind++]; if (is_block_device(file)) if (test_dev_for_mkfs(file, force_overwrite, estr)) { fprintf(stderr, "Error: %s", estr); exit(1); } } optind = saved_optind; dev_cnt = ac - optind; file = av[optind++]; ssd = is_ssd(file); if (is_vol_small(file)) { printf("SMALL VOLUME: forcing mixed metadata/data groups\n"); mixed = 1; if (metadata_profile != data_profile) { if (metadata_profile_opt || data_profile_opt) { fprintf(stderr, "With mixed block groups data and metadata profiles must be the same\n"); exit(1); } } } /* * Set default profiles according to number of added devices. * For mixed groups defaults are single/single. */ if (!mixed) { if (!metadata_profile_opt) { if (dev_cnt == 1 && ssd) printf("Detected a SSD, turning off metadata " "duplication. Mkfs with -m dup if you want to " "force metadata duplication.\n"); metadata_profile = (dev_cnt > 1) ? BTRFS_BLOCK_GROUP_RAID1 : (ssd) ? 0: BTRFS_BLOCK_GROUP_DUP; } if (!data_profile_opt) { data_profile = (dev_cnt > 1) ? BTRFS_BLOCK_GROUP_RAID0 : 0; /* raid0 or single */ } } else { metadata_profile = 0; data_profile = 0; } ret = test_num_disk_vs_raid(metadata_profile, data_profile, dev_cnt, mixed, estr); if (ret) { fprintf(stderr, "Error: %s\n", estr); exit(1); } /* if we are here that means all devs are good to btrfsify */ printf("\nWARNING! - %s IS EXPERIMENTAL\n", BTRFS_BUILD_VERSION); printf("WARNING! - see http://btrfs.wiki.kernel.org before using\n\n"); dev_cnt--; if (!source_dir_set) { /* * open without O_EXCL so that the problem should not * occur by the following processing. 
* (btrfs_register_one_device() fails if O_EXCL is on) */ fd = open(file, O_RDWR); if (fd < 0) { fprintf(stderr, "unable to open %s: %s\n", file, strerror(errno)); exit(1); } first_file = file; ret = btrfs_prepare_device(fd, file, zero_end, &dev_block_count, block_count, &mixed, discard); if (block_count && block_count > dev_block_count) { fprintf(stderr, "%s is smaller than requested size\n", file); exit(1); } } else { fd = open_target(file); if (fd < 0) { fprintf(stderr, "unable to open the %s\n", file); exit(1); } first_file = file; source_dir_size = size_sourcedir(source_dir, sectorsize, &num_of_meta_chunks, &size_of_data); if(block_count < source_dir_size) block_count = source_dir_size; ret = zero_output_file(fd, block_count, sectorsize); if (ret) { fprintf(stderr, "unable to zero the output file\n"); exit(1); } /* our "device" is the new image file */ dev_block_count = block_count; } /* To create the first block group and chunk 0 in make_btrfs */ if (dev_block_count < BTRFS_MKFS_SYSTEM_GROUP_SIZE) { fprintf(stderr, "device is too small to make filesystem\n"); exit(1); } blocks[0] = BTRFS_SUPER_INFO_OFFSET; for (i = 1; i < 7; i++) { blocks[i] = BTRFS_SUPER_INFO_OFFSET + 1024 * 1024 + leafsize * i; } /* * FS features that can be set by other means than -O * just set the bit here */ if (mixed) features |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS; if ((data_profile | metadata_profile) & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { features |= BTRFS_FEATURE_INCOMPAT_RAID56; } process_fs_features(features); ret = make_btrfs(fd, file, label, blocks, dev_block_count, nodesize, leafsize, sectorsize, stripesize, features); if (ret) { fprintf(stderr, "error during mkfs: %s\n", strerror(-ret)); exit(1); } root = open_ctree(file, 0, OPEN_CTREE_WRITES); if (!root) { fprintf(stderr, "Open ctree failed\n"); close(fd); exit(1); } root->fs_info->alloc_start = alloc_start; ret = make_root_dir(root, mixed); if (ret) { fprintf(stderr, "failed to setup the root directory\n"); 
exit(1); } trans = btrfs_start_transaction(root, 1); if (dev_cnt == 0) goto raid_groups; btrfs_register_one_device(file); zero_end = 1; while (dev_cnt-- > 0) { int old_mixed = mixed; file = av[optind++]; /* * open without O_EXCL so that the problem should not * occur by the following processing. * (btrfs_register_one_device() fails if O_EXCL is on) */ fd = open(file, O_RDWR); if (fd < 0) { fprintf(stderr, "unable to open %s: %s\n", file, strerror(errno)); exit(1); } ret = btrfs_device_already_in_root(root, fd, BTRFS_SUPER_INFO_OFFSET); if (ret) { fprintf(stderr, "skipping duplicate device %s in FS\n", file); close(fd); continue; } ret = btrfs_prepare_device(fd, file, zero_end, &dev_block_count, block_count, &mixed, discard); mixed = old_mixed; BUG_ON(ret); ret = btrfs_add_to_fsid(trans, root, fd, file, dev_block_count, sectorsize, sectorsize, sectorsize); BUG_ON(ret); btrfs_register_one_device(file); } raid_groups: if (!source_dir_set) { ret = create_raid_groups(trans, root, data_profile, data_profile_opt, metadata_profile, metadata_profile_opt, mixed, ssd); BUG_ON(ret); } ret = create_data_reloc_tree(trans, root); BUG_ON(ret); printf("fs created label %s on %s\n\tnodesize %u leafsize %u " "sectorsize %u size %s\n", label, first_file, nodesize, leafsize, sectorsize, pretty_size(btrfs_super_total_bytes(root->fs_info->super_copy))); printf("%s\n", BTRFS_BUILD_VERSION); btrfs_commit_transaction(trans, root); if (source_dir_set) { trans = btrfs_start_transaction(root, 1); ret = create_chunks(trans, root, num_of_meta_chunks, size_of_data); BUG_ON(ret); btrfs_commit_transaction(trans, root); ret = make_image(source_dir, root, fd); BUG_ON(ret); } ret = close_ctree(root); BUG_ON(ret); free(label); return 0; }
/**
 * Compute variant and/or sample statistics for a VCF file.
 *
 * After the optional PED file has been configured and read, three OpenMP
 * sections cooperate:
 *   - a reader that parses the VCF file into batches,
 *   - a processor that splits each batch into chunks and computes the
 *     statistics for each chunk in a nested parallel loop, queueing one
 *     token per record in next_token_list,
 *   - a writer that consumes the tokens and the per-thread output lists and
 *     writes the statistics files (and the database when save_db is set).
 *
 * @param shared_options_data  Options shared by all tools (input/output
 *                             paths, batch sizes, number of threads)
 * @param options_data         Stats-specific options (variant_stats,
 *                             sample_stats, save_db, variable,
 *                             variable_groups, phenotype)
 * @return 0 on completion; DUPLICATED_VARIABLE, MORE_THAN_TWO_PHENOTYPES or
 *         VARIABLE_FIELD_NOT_FOUND when the PED configuration is rejected
 */
int run_stats(shared_options_data_t *shared_options_data, stats_options_data_t *options_data) {
    file_stats_t *file_stats = file_stats_new();
    // Allocated by the processing section when the first batch arrives;
    // stays NULL if no batch is ever fetched
    sample_stats_t **sample_stats = NULL;

    // List that stores the batches of records filtered by each thread
    list_t *output_list[shared_options_data->num_threads];
    // List that stores which thread filtered the next batch to save
    list_t *next_token_list = malloc(sizeof(list_t));

    // Initialized because the reader section assigns it only when one of the
    // batch size options is positive, and the stats loop may OR into it
    int ret_code = 0;
    double start, stop, total;

    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }

    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        if(options_data->variable) {
            set_variable_field(options_data->variable, 0, ped_file);
        } else {
            set_variable_field("PHENO", 6, ped_file);
        }

        if(options_data->variable_groups) {
            // Groups are separated by ':' and the values inside a group by ','
            int n, m;
            char *variable_groups = strdup(options_data->variable_groups);
            char **groups;
            char **phenos_in_group;
            groups = split(variable_groups, ":", &n);
            for(int i = 0; i < n; i++){
                phenos_in_group = split(groups[i], ",", &m);
                if(set_phenotype_group(phenos_in_group, m, ped_file) < 0) {
                    LOG_ERROR("Variable can't appear in two groups\n");
                    return DUPLICATED_VARIABLE;
                }
                free(phenos_in_group);
            }
            ped_file->accept_new_values = 0;

            free(variable_groups);
            free(groups);
        } else {
            ped_file->accept_new_values = 1;
        }
        if(options_data->phenotype) {
            int n;
            char* phenotypes = strdup(options_data->phenotype);
            char** pheno_values = split(phenotypes, ",", &n);
            if(n != 2) {
                LOG_ERROR("To handle case-control test, only two phenotypes are supported\n");
                return MORE_THAN_TWO_PHENOTYPES;
            } else {
                set_unaffected_phenotype(pheno_values[0],ped_file);
                set_affected_phenotype(pheno_values[1],ped_file);
            }
        } else {
            set_unaffected_phenotype("1", ped_file);
            set_affected_phenotype("2", ped_file);
        }

        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
        if(!ped_file->num_field) {
            LOG_ERROR_F("Can't find the specified field \"%s\" in file: %s \n", options_data->variable, ped_file->filename);
            return VARIABLE_FIELD_NOT_FOUND;
        }
    }

    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }

    // Initialize variables related to the different threads
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        output_list[i] = (list_t*) malloc(sizeof(list_t));
        list_init("input", 1, shared_options_data->num_threads * shared_options_data->batch_lines, output_list[i]);
    }
    list_init("next_token", shared_options_data->num_threads, INT_MAX, next_token_list);

    LOG_INFO("About to retrieve statistics from VCF file...\n");

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, vcf_file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, vcf_file);
            }

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }

#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);

            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());

            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            khash_t(str) *phenotype_ids = NULL;
            // Only set when a PED file is present, but always passed to
            // get_variants_stats, so it needs a defined value
            int num_phenotypes = 0;

            start = omp_get_wtime();

            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    sample_stats = malloc (get_num_vcf_samples(vcf_file) * sizeof(sample_stats_t*));
                    for (int j = 0; j < get_num_vcf_samples(vcf_file); j++) {
                        sample_stats[j] = sample_stats_new(array_list_get(j, vcf_file->samples_names));
                    }

                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                        // Get the khash of the phenotypes in PED file
                        phenotype_ids = get_phenotypes(ped_file);
                        num_phenotypes = get_num_variables(ped_file);
                    }
                }

                if (i % 50 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n",
                               i, omp_get_thread_num(),
                               batch->records->size, batch->records->capacity);
                }

                // Divide the list of passed records in ranges of size defined in config file
                int num_chunks;
                int *chunk_sizes = NULL;
                array_list_t *input_records = batch->records;
                int *chunk_starts = create_chunks(input_records->size,
                                                  ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads),
                                                  &num_chunks, &chunk_sizes);

                // OpenMP: Launch a thread for each range
                #pragma omp parallel for num_threads(shared_options_data->num_threads)
                for (int j = 0; j < num_chunks; j++) {
                    LOG_DEBUG_F("[%d] Stats invocation\n", omp_get_thread_num());
                    // Invoke variant stats and/or sample stats when applies
                    if (options_data->variant_stats) {
                        int index = omp_get_thread_num() % shared_options_data->num_threads;
                        ret_code = get_variants_stats((vcf_record_t**) (input_records->items + chunk_starts[j]),
                                                      chunk_sizes[j], individuals, sample_ids, num_phenotypes,
                                                      output_list[index], file_stats);
                    }

                    if (options_data->sample_stats) {
                        ret_code |= get_sample_stats((vcf_record_t**) (input_records->items + chunk_starts[j]),
                                                     chunk_sizes[j], individuals, sample_ids,
                                                     sample_stats, file_stats);
                    }
                }

                if (options_data->variant_stats) {
                    // Insert as many tokens as elements correspond to each thread
                    for (int t = 0; t < num_chunks; t++) {
                        for (int s = 0; s < chunk_sizes[t]; s++) {
                            list_item_t *token_item = list_item_new(t, 0, NULL);
                            list_insert_item(token_item, next_token_list);
                        }
                    }
                }

                free(chunk_starts);
                free(chunk_sizes);
                vcf_batch_free(batch);

                i++;
            }

            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(next_token_list);
                list_decr_writers(output_list[i]);
            }

            if (sample_ids) { kh_destroy(ids, sample_ids); }
            if (individuals) { free(individuals); }
        }

#pragma omp section
        {
            LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num());

            char *stats_prefix = get_vcf_stats_filename_prefix(shared_options_data->vcf_filename,
                                                               shared_options_data->output_filename,
                                                               shared_options_data->output_directory);

            // File names and descriptors for output to plain text files
            char *stats_filename, *summary_filename, *phenotype_filename;
            FILE *stats_fd, *summary_fd, **phenotype_fd = NULL;

            char *stats_db_name;
            sqlite3 *db = NULL;
            // Passed to report_vcf_variant_stats even when save_db is off, so
            // it must not be left uninitialized
            khash_t(stats_chunks) *hash = NULL;

            khash_t(str) *phenotype_ids = NULL;
            int num_phenotypes = 0;
            if(ped_file){
                phenotype_ids = get_phenotypes(ped_file);
                num_phenotypes = get_num_variables(ped_file);
            }

            if (options_data->save_db) {
                delete_files_by_extension(shared_options_data->output_directory, "db");
                stats_db_name = calloc(strlen(stats_prefix) + strlen(".db") + 2, sizeof(char));
                sprintf(stats_db_name, "%s.db", stats_prefix);
                create_stats_db(stats_db_name, VCF_CHUNKSIZE, create_vcf_query_fields, &db);
                hash = kh_init(stats_chunks);
            }

            // Write variant (and global) statistics
            if (options_data->variant_stats) {
                stats_filename = get_variant_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of variants: %s\n", stats_filename);
                }

                //Open one file for each phenotype
                if(ped_file){
                    // Zero-initialized so that the NULL check before fclose
                    // below is meaningful for entries that were never opened
                    phenotype_fd = calloc(num_phenotypes, sizeof(FILE*));
                    if(options_data->variable_groups){
                        int n;
                        char *variable_groups = strdup(options_data->variable_groups);
                        char ** names = split(variable_groups, ":", &n);
                        for(int i = 0; i < n; i++) {
                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, names[i]);
                            if(!(phenotype_fd[i] = fopen(phenotype_filename, "w"))) {
                                // Report the file that actually failed to open
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", phenotype_filename);
                            }
                            free(phenotype_filename);
                        }
                        free(names);
                        free(variable_groups);
                    } else {
                        for (khint_t i = kh_begin(phenotype_ids); i != kh_end(phenotype_ids); ++i) {
                            if (!kh_exist(phenotype_ids,i)) continue;

                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, kh_key(phenotype_ids,i));
                            if(!(phenotype_fd[kh_val(phenotype_ids,i)] = fopen(phenotype_filename, "w"))) {
                                // Report the file that actually failed to open
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", phenotype_filename);
                            }
                            free(phenotype_filename);
                        }
                    }
                }

                // Write header
                report_vcf_variant_stats_header(stats_fd);
                if(ped_file){
                    for(int i = 0; i < num_phenotypes; i++)
                        report_vcf_variant_phenotype_stats_header(phenotype_fd[i]);
                }

                // For each variant, generate a new line
                int avail_stats = 0;
                variant_stats_t *var_stats_batch[VCF_CHUNKSIZE];
                list_item_t *token_item = NULL, *output_item = NULL;
                while ((token_item = list_remove_item(next_token_list)) != NULL) {
                    // The token id selects the output list holding the next result
                    output_item = list_remove_item(output_list[token_item->id]);
                    assert(output_item);
                    var_stats_batch[avail_stats] = output_item->data_p;
                    avail_stats++;

                    // Run only when certain amount of stats is available
                    if (avail_stats >= VCF_CHUNKSIZE) {
                        report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);

                        if(ped_file)
                            for(int i = 0; i < num_phenotypes; i++)
                                report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                        // Free all stats from the "batch"
                        for (int i = 0; i < avail_stats; i++) {
                            variant_stats_free(var_stats_batch[i]);
                        }
                        avail_stats = 0;
                    }

                    // Free resources
                    list_item_free(output_item);
                    list_item_free(token_item);
                }

                // Flush the statistics left over from the last partial batch
                if (avail_stats > 0) {
                    report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);

                    if(ped_file)
                        for(int i = 0; i < num_phenotypes; i++)
                            report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                    // Free all stats from the "batch"
                    for (int i = 0; i < avail_stats; i++) {
                        variant_stats_free(var_stats_batch[i]);
                    }
                    avail_stats = 0;
                }

                // Write whole file stats (data only got when launching variant stats)
                summary_filename = get_vcf_file_stats_output_filename(stats_prefix);
                if (!(summary_fd = fopen(summary_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics summary: %s\n", summary_filename);
                }
                report_vcf_summary_stats(summary_fd, db, file_stats);

                free(stats_filename);
                free(summary_filename);

                // Close variant stats file
                if (stats_fd) { fclose(stats_fd); }
                if (summary_fd) { fclose(summary_fd); }
                if(ped_file){
                    for(int i = 0; i < num_phenotypes; i++)
                        if(phenotype_fd[i]) fclose(phenotype_fd[i]);
                    free(phenotype_fd);
                }
            }

            // Write sample statistics
            if (options_data->sample_stats) {
                stats_filename = get_sample_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of samples: %s\n", stats_filename);
                }

                report_vcf_sample_stats_header(stats_fd);
                report_vcf_sample_stats(stats_fd, NULL, vcf_file->samples_names->size, sample_stats);

                // Close sample stats file
                free(stats_filename);
                if (stats_fd) { fclose(stats_fd); }
            }

            free(stats_prefix);

            if (db) {
                insert_chunk_hash(VCF_CHUNKSIZE, hash, db);
                create_stats_index(create_vcf_index, db);
                close_stats_db(db, hash);
            }
        }
    }

    // sample_stats is only allocated once a first batch has been processed
    if (sample_stats) {
        for (int i = 0; i < get_num_vcf_samples(vcf_file); i++) {
            sample_stats_free(sample_stats[i]);
        }
        free(sample_stats);
    }
    free(file_stats);

    free(next_token_list);
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        free(output_list[i]);
    }
    vcf_close(vcf_file);
    if (ped_file) { ped_close(ped_file, 1,1); }

    return 0;
}