Example #1
0
/**
 * @return 返回NULL表示到达末尾.
 * @note 可能返回英文单词.
 */
char *
get_ch_word()
{
	create_chunks();

	if (g_nchunks > 1)
		choose_best_chunk(mm_cmp);
	
	if (g_nchunks > 1)
		choose_best_chunk(lawl_cmp);
	
	if (g_nchunks > 1)
		choose_best_chunk(svwl_cmp);
	
	if (g_nchunks > 1)
		choose_best_chunk(lsdmfocw_cmp);
	
	if (g_nchunks == 0) {
		g_pos += 2;
		/* no chinese words found, call next_token() recursively */
		return next_token();
	} else {
		g_pos += strlen(chunks[0].words[0]);
		return chunks[0].words[0];
	}
}
Example #2
0
    Token Algorithm::get_cjk_word(int len)
    {
        vector<Chunk> chunks = create_chunks();

        if (chunks.size() > 1)
            mm_filter(chunks);
        if (chunks.size() > 1)
            lawl_filter(chunks);
        if (chunks.size() > 1)
            svwl_filter(chunks);
        if (chunks.size() > 1)
            lsdmfocw_filter(chunks);

        if (chunks.size() < 1)
            return Token(NULL, 0);

        Token token(m_text+m_pos, chunks[0].words[0]->nbytes);
        m_pos += chunks[0].words[0]->nbytes;
        return token;
    }
Example #3
0
url_list_t *search_next_url(UriUriA **uri)
#endif
{
	url_list_t *elem;
	int update_pointer = 1;

	for (elem = download_ptr; elem; elem = elem->next) {
		if (!elem->assigned && elem->err_code == ERR_CODE_NOT_ASSIGNED) {
			if (elem->uri) {
				if (is_uri_compatible(elem->uri, -1)) {
					elem->assigned = 1;
					*uri = elem->uri;
#ifdef ENABLE_METALINK
					*resource = NULL;
					*chunk = NULL;
					*header = 0;
#endif
					if (update_pointer)
						download_ptr = elem->next;
					return elem;
				}
				else 
					update_pointer = 0;
			}
#ifdef ENABLE_METALINK
			else if (elem->metalink_uri) {
				if (is_valid_metalink(elem->metalink_uri->file))
				{
					if (!elem->metalink_uri->chunk && elem->metalink_uri->size >= 0) {
						char *newfilename = NULL;

						if (create_chunks(elem->metalink_uri) != MULK_RET_OK) {
							elem->err_code = METALINK_RES_INVALID_METALINK;
							continue;
						}

#ifdef ENABLE_CHECKSUM
						/* load a resume file if present */
						if (init_chunks(elem->metalink_uri, &newfilename) == MULK_RET_OK) {
							elem->filename = newfilename;
							elem->err_code = METALINK_RES_OK;
							continue;
						}
#endif /* ENABLE_CHECKSUM */
						string_free(newfilename);
					}

					*uri = find_next_url(elem->metalink_uri, chunk, resource, header);

					if ((*chunk || *header) && *uri) {
						if (update_pointer)
							download_ptr = elem;
						return elem;
					}

					update_pointer = 0;
				}
				else 
					elem->err_code = METALINK_RES_INVALID_METALINK;
			}
#endif /* ENABLE_METALINK */
			else 
				elem->err_code = ERR_CODE_EMPTY_URL;
		}
	}

	if (update_pointer)
		download_ptr = NULL;
	*uri = NULL;
#ifdef ENABLE_METALINK
	*chunk = NULL;
	*resource = NULL;
	*header = 0;
#endif
	return NULL;
}
Example #4
0
int run_effect(char **urls, shared_options_data_t *shared_options_data, effect_options_data_t *options_data) {
    int ret_code = 0;
    double start, stop, total;
    
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
    }
    
    char *output_directory = shared_options_data->output_directory;
    size_t output_directory_len = strlen(output_directory);
    
    ret_code = create_directory(output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", output_directory);
    }
    
    // Remove all .txt files in folder
    ret_code = delete_files_by_extension(output_directory, "txt");
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Initialize environment for connecting to the web service
    ret_code = init_http_environment(0);
    if (ret_code != 0) {
        return ret_code;
    }
    
    // Output file descriptors
    static cp_hashtable *output_files = NULL;
    // Lines of the output data in the main .txt files
    static list_t *output_list = NULL;
    // Consequence type counters (for summary, must be kept between web service calls)
    static cp_hashtable *summary_count = NULL;
    // Gene list (for genes-with-variants, must be kept between web service calls)
    static cp_hashtable *gene_list = NULL;

    // Initialize collections of file descriptors and summary counters
    ret_code = initialize_output_files(output_directory, output_directory_len, &output_files);
    if (ret_code != 0) {
        return ret_code;
    }
    initialize_output_data_structures(shared_options_data, &output_list, &summary_count, &gene_list);
    initialize_ws_buffers(shared_options_data->num_threads);
    
    // Create job.status file
    char job_status_filename[output_directory_len + 10];
    sprintf(job_status_filename, "%s/job.status", output_directory);
    FILE *job_status = new_job_status_file(job_status_filename);
    if (!job_status) {
        LOG_FATAL("Can't create job status file\n");
    } else {
        update_job_status_file(0, job_status);
    }
    
 
#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            
            start = omp_get_wtime();
            
            ret_code = vcf_read(vcf_file, 1,
                                (shared_options_data->batch_bytes > 0) ? shared_options_data->batch_bytes : shared_options_data->batch_lines,
                                shared_options_data->batch_bytes <= 0);

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) {
                LOG_ERROR_F("Error %d while reading the file %s\n", ret_code, vcf_file->filename);
            }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }
        
#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            // Filters and files for filtering output
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
            FILE *passed_file = NULL, *failed_file = NULL, *non_processed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            
            // Pedigree information (used in some filters)
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            
            // Filename structure outdir/vcfname.errors
            char *prefix_filename = calloc(strlen(shared_options_data->vcf_filename), sizeof(char));
            get_filename_from_path(shared_options_data->vcf_filename, prefix_filename);
            char *non_processed_filename = malloc((strlen(shared_options_data->output_directory) + strlen(prefix_filename) + 9) * sizeof(char));
            sprintf(non_processed_filename, "%s/%s.errors", shared_options_data->output_directory, prefix_filename);
            non_processed_file = fopen(non_processed_filename, "w");
            free(non_processed_filename);
            
            // Maximum size processed by each thread (never allow more than 1000 variants per query)
            if (shared_options_data->batch_lines > 0) {
                shared_options_data->entries_per_thread = MIN(MAX_VARIANTS_PER_QUERY, 
                            ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads));
            } else {
                shared_options_data->entries_per_thread = MAX_VARIANTS_PER_QUERY;
            }
            LOG_DEBUG_F("entries-per-thread = %d\n", shared_options_data->entries_per_thread);
    
            int i = 0;
            vcf_batch_t *batch = NULL;
            int ret_ws_0 = 0, ret_ws_1 = 0, ret_ws_2 = 0;
            
            start = omp_get_wtime();

            while (batch = fetch_vcf_batch(vcf_file)) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], vcf_file);
                    }
                        
                    // Write file format, header entries and delimiter
                    if (passed_file != NULL) { write_vcf_header(vcf_file, passed_file); }
                    if (failed_file != NULL) { write_vcf_header(vcf_file, failed_file); }
                    if (non_processed_file != NULL) { write_vcf_header(vcf_file, non_processed_file); }
                    
                    LOG_DEBUG("VCF header written\n");
                    
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                    }
                }
                
//                     printf("batch loaded = '%.*s'\n", 50, batch->text);
//                     printf("batch text len = %zu\n", strlen(batch->text));

//                 if (i % 10 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                            i, omp_get_thread_num(),
                            batch->records->size, batch->records->capacity);
//                 }

                int reconnections = 0;
                int max_reconnections = 3; // TODO allow to configure?

                // Write records that passed to a separate file, and query the WS with them as args
                array_list_t *failed_records = NULL;
                int num_variables = ped_file? get_num_variables(ped_file): 0;
                array_list_t *passed_records = filter_records(filters, num_filters, individuals, sample_ids, num_variables, batch->records, &failed_records);
                if (passed_records->size > 0) {
                    // Divide the list of passed records in ranges of size defined in config file
                    int num_chunks;
                    int *chunk_sizes;
                    int *chunk_starts = create_chunks(passed_records->size, shared_options_data->entries_per_thread, &num_chunks, &chunk_sizes);
                    
                    do {
                        // OpenMP: Launch a thread for each range
                        #pragma omp parallel for num_threads(shared_options_data->num_threads)
                        for (int j = 0; j < num_chunks; j++) {
                            int tid = omp_get_thread_num();
                            LOG_DEBUG_F("[%d] WS invocation\n", tid);
                            LOG_DEBUG_F("[%d] -- effect WS\n", tid);
                            if (!reconnections || ret_ws_0) {
                                ret_ws_0 = invoke_effect_ws(urls[0], (vcf_record_t**) (passed_records->items + chunk_starts[j]), 
                                                            chunk_sizes[j], options_data->excludes);
                                parse_effect_response(tid, output_directory, output_directory_len, output_files, output_list, summary_count, gene_list);
                                free(effect_line[tid]);
                                effect_line[tid] = (char*) calloc (max_line_size[tid], sizeof(char));
                            }
                            
                            if (!options_data->no_phenotypes) {
                                if (!reconnections || ret_ws_1) {
                                    LOG_DEBUG_F("[%d] -- snp WS\n", omp_get_thread_num());
                                    ret_ws_1 = invoke_snp_phenotype_ws(urls[1], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_snp_phenotype_response(tid, output_list);
                                    free(snp_line[tid]);
                                    snp_line[tid] = (char*) calloc (snp_max_line_size[tid], sizeof(char));
                                }
                                 
                                if (!reconnections || ret_ws_2) {
                                    LOG_DEBUG_F("[%d] -- mutation WS\n", omp_get_thread_num());
                                    ret_ws_2 = invoke_mutation_phenotype_ws(urls[2], (vcf_record_t**) (passed_records->items + chunk_starts[j]), chunk_sizes[j]);
                                    parse_mutation_phenotype_response(tid, output_list);
                                    free(mutation_line[tid]);
                                    mutation_line[tid] = (char*) calloc (mutation_max_line_size[tid], sizeof(char));
                                }
                            }
                        }
                        
                        LOG_DEBUG_F("*** %dth web services invocation finished\n", i);
                        
                        if (ret_ws_0 || ret_ws_1 || ret_ws_2) {
                            if (ret_ws_0) {
                                LOG_ERROR_F("Effect web service error: %s\n", get_last_http_error(ret_ws_0));
                            }
                            if (ret_ws_1) {
                                LOG_ERROR_F("SNP phenotype web service error: %s\n", get_last_http_error(ret_ws_1));
                            }
                            if (ret_ws_2) {
                                LOG_ERROR_F("Mutations phenotype web service error: %s\n", get_last_http_error(ret_ws_2));
                            }
                            
                            // In presence of errors, wait 4 seconds before retrying
                            reconnections++;
                            LOG_ERROR_F("Some errors ocurred, reconnection #%d\n", reconnections);
                            sleep(4);
                        } else {
                            free(chunk_starts);
                            free(chunk_sizes);
                        }
                    } while (reconnections < max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2));
                }
                
                // If the maximum number of reconnections was reached still with errors, 
                // write the non-processed batch to the corresponding file
                if (reconnections == max_reconnections && (ret_ws_0 || ret_ws_1 || ret_ws_2)) {
                #pragma omp critical
                    {
                        write_vcf_batch(batch, non_processed_file);
                    }
                }
                
                // Write records that passed and failed filters to separate files, and free them
                write_filtering_output_files(passed_records, failed_records, passed_file, failed_file);
                free_filtered_records(passed_records, failed_records, batch->records);
                
                // Free batch and its contents
                vcf_batch_free(batch);
                
                i++;
            }

            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) { fclose(passed_file); }
            if (failed_file) { fclose(failed_file); }
            if (non_processed_file) { fclose(non_processed_file); }
            
            // Free filters
            for (i = 0; i < num_filters; i++) {
                filter_t *filter = filters[i];
                filter->free_func(filter);
            }
            free(filters);
            
            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(output_list);
            }
        }
        
#pragma omp section
        {
            // Thread which writes the results to all_variants, summary and one file per consequence type
            int ret = 0;
            char *line;
            list_item_t* item = NULL;
            FILE *fd = NULL;
            
            FILE *all_variants_file = cp_hashtable_get(output_files, "all_variants");
            FILE *snp_phenotype_file = cp_hashtable_get(output_files, "snp_phenotypes");
            FILE *mutation_phenotype_file = cp_hashtable_get(output_files, "mutation_phenotypes");
            
            while ((item = list_remove_item(output_list)) != NULL) {
                line = item->data_p;
                
                // Type greater than 0: consequence type identified by its SO code
                // Type equals to -1: SNP phenotype
                // Type equals to -2: mutation phenotype
                if (item->type > 0) {
                    // Write entry in the consequence type file
                    fd = cp_hashtable_get(output_files, &(item->type));
                    int ret = fprintf(fd, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to file: '%s'\n", line);
                    }
                    
                    // Write in all_variants
                    ret = fprintf(all_variants_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to all_variants: '%s'\n", line);
                    }
                    
                } else if (item->type == SNP_PHENOTYPE) {
                    ret = fprintf(snp_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to snp_phenotypes: '%s'\n", line);
                    }
                    
                } else if (item->type == MUTATION_PHENOTYPE) {
                    ret = fprintf(mutation_phenotype_file, "%s\n", line);
                    if (ret < 0) {
                        LOG_ERROR_F("Error writing to mutation_phenotypes: '%s'\n", line);
                    }
                }
                
                free(line);
                list_item_free(item);
            }
            
        }
    }

    write_summary_file(summary_count, cp_hashtable_get(output_files, "summary"));
    write_genes_with_variants_file(gene_list, output_directory);
    write_result_file(shared_options_data, options_data, summary_count, output_directory);

    free_output_data_structures(output_files, summary_count, gene_list);
    free_ws_buffers(shared_options_data->num_threads);
    free(output_list);
    vcf_close(vcf_file);
    
    update_job_status_file(100, job_status);
    close_job_status_file(job_status);
    
    return ret_code;
}
Example #5
0
int main(int ac, char **av)
{
	char *file;
	struct btrfs_root *root;
	struct btrfs_trans_handle *trans;
	char *label = NULL;
	char *first_file;
	u64 block_count = 0;
	u64 dev_block_count = 0;
	u64 blocks[7];
	u64 alloc_start = 0;
	u64 metadata_profile = 0;
	u64 data_profile = 0;
	u32 leafsize = sysconf(_SC_PAGESIZE);
	u32 sectorsize = 4096;
	u32 nodesize = leafsize;
	u32 stripesize = 4096;
	int zero_end = 1;
	int option_index = 0;
	int fd;
	int ret;
	int i;
	int mixed = 0;
	int data_profile_opt = 0;
	int metadata_profile_opt = 0;
	int discard = 1;
	int ssd = 0;
	int force_overwrite = 0;

	char *source_dir = NULL;
	int source_dir_set = 0;
	u64 num_of_meta_chunks = 0;
	u64 size_of_data = 0;
	u64 source_dir_size = 0;
	int dev_cnt = 0;
	int saved_optind;
	char estr[100];
	u64 features = 0;

	while(1) {
		int c;
		c = getopt_long(ac, av, "A:b:fl:n:s:m:d:L:O:r:VMK",
				long_options, &option_index);
		if (c < 0)
			break;
		switch(c) {
			case 'A':
				alloc_start = parse_size(optarg);
				break;
			case 'f':
				force_overwrite = 1;
				break;
			case 'd':
				data_profile = parse_profile(optarg);
				data_profile_opt = 1;
				break;
			case 'l':
			case 'n':
				nodesize = parse_size(optarg);
				leafsize = parse_size(optarg);
				break;
			case 'L':
				label = parse_label(optarg);
				break;
			case 'm':
				metadata_profile = parse_profile(optarg);
				metadata_profile_opt = 1;
				break;
			case 'M':
				mixed = 1;
				break;
			case 'O': {
				char *orig = strdup(optarg);
				char *tmp = orig;

				tmp = parse_fs_features(tmp, &features);
				if (tmp) {
					fprintf(stderr,
						"Unrecognized filesystem feature '%s'\n",
							tmp);
					free(orig);
					exit(1);
				}
				free(orig);
				if (features & BTRFS_FEATURE_LIST_ALL) {
					list_all_fs_features();
					exit(0);
				}
				break;
				}
			case 's':
				sectorsize = parse_size(optarg);
				break;
			case 'b':
				block_count = parse_size(optarg);
				if (block_count <= 1024*1024*1024) {
					printf("SMALL VOLUME: forcing mixed "
					       "metadata/data groups\n");
					mixed = 1;
				}
				zero_end = 0;
				break;
			case 'V':
				print_version();
				break;
			case 'r':
				source_dir = optarg;
				source_dir_set = 1;
				break;
			case 'K':
				discard = 0;
				break;
			default:
				print_usage();
		}
	}
	sectorsize = max(sectorsize, (u32)sysconf(_SC_PAGESIZE));
	if (check_leaf_or_node_size(leafsize, sectorsize))
		exit(1);
	if (check_leaf_or_node_size(nodesize, sectorsize))
		exit(1);
	saved_optind = optind;
	dev_cnt = ac - optind;
	if (dev_cnt == 0)
		print_usage();

	if (source_dir_set && dev_cnt > 1) {
		fprintf(stderr,
			"The -r option is limited to a single device\n");
		exit(1);
	}
	while (dev_cnt-- > 0) {
		file = av[optind++];
		if (is_block_device(file))
			if (test_dev_for_mkfs(file, force_overwrite, estr)) {
				fprintf(stderr, "Error: %s", estr);
				exit(1);
			}
	}

	optind = saved_optind;
	dev_cnt = ac - optind;

	file = av[optind++];
	ssd = is_ssd(file);

	if (is_vol_small(file)) {
		printf("SMALL VOLUME: forcing mixed metadata/data groups\n");
		mixed = 1;
		if (metadata_profile != data_profile) {
			if (metadata_profile_opt || data_profile_opt) {
				fprintf(stderr,
	"With mixed block groups data and metadata profiles must be the same\n");
				exit(1);
			}
		}
	}
	/*
	* Set default profiles according to number of added devices.
	* For mixed groups defaults are single/single.
	*/
	if (!mixed) {
		if (!metadata_profile_opt) {
			if (dev_cnt == 1 && ssd)
				printf("Detected a SSD, turning off metadata "
				"duplication.  Mkfs with -m dup if you want to "
				"force metadata duplication.\n");

			metadata_profile = (dev_cnt > 1) ?
					BTRFS_BLOCK_GROUP_RAID1 : (ssd) ?
					0: BTRFS_BLOCK_GROUP_DUP;
		}
		if (!data_profile_opt) {
			data_profile = (dev_cnt > 1) ?
				BTRFS_BLOCK_GROUP_RAID0 : 0; /* raid0 or single */
		}
	} else {
		metadata_profile = 0;
		data_profile = 0;
	}

	ret = test_num_disk_vs_raid(metadata_profile, data_profile,
			dev_cnt, mixed, estr);
	if (ret) {
		fprintf(stderr, "Error: %s\n", estr);
		exit(1);
	}

	/* if we are here that means all devs are good to btrfsify */
	printf("\nWARNING! - %s IS EXPERIMENTAL\n", BTRFS_BUILD_VERSION);
	printf("WARNING! - see http://btrfs.wiki.kernel.org before using\n\n");

	dev_cnt--;

	if (!source_dir_set) {
		/*
		 * open without O_EXCL so that the problem should not
		 * occur by the following processing.
		 * (btrfs_register_one_device() fails if O_EXCL is on)
		 */
		fd = open(file, O_RDWR);
		if (fd < 0) {
			fprintf(stderr, "unable to open %s: %s\n", file,
				strerror(errno));
			exit(1);
		}
		first_file = file;
		ret = btrfs_prepare_device(fd, file, zero_end, &dev_block_count,
					   block_count, &mixed, discard);
		if (block_count && block_count > dev_block_count) {
			fprintf(stderr, "%s is smaller than requested size\n", file);
			exit(1);
		}
	} else {
		fd = open_target(file);
		if (fd < 0) {
			fprintf(stderr, "unable to open the %s\n", file);
			exit(1);
		}

		first_file = file;
		source_dir_size = size_sourcedir(source_dir, sectorsize,
					     &num_of_meta_chunks, &size_of_data);
		if(block_count < source_dir_size)
			block_count = source_dir_size;
		ret = zero_output_file(fd, block_count, sectorsize);
		if (ret) {
			fprintf(stderr, "unable to zero the output file\n");
			exit(1);
		}
		/* our "device" is the new image file */
		dev_block_count = block_count;
	}

	/* To create the first block group and chunk 0 in make_btrfs */
	if (dev_block_count < BTRFS_MKFS_SYSTEM_GROUP_SIZE) {
		fprintf(stderr, "device is too small to make filesystem\n");
		exit(1);
	}

	blocks[0] = BTRFS_SUPER_INFO_OFFSET;
	for (i = 1; i < 7; i++) {
		blocks[i] = BTRFS_SUPER_INFO_OFFSET + 1024 * 1024 +
			leafsize * i;
	}

	/*
	 * FS features that can be set by other means than -O
	 * just set the bit here
	 */
	if (mixed)
		features |= BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS;

	if ((data_profile | metadata_profile) &
	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
		features |= BTRFS_FEATURE_INCOMPAT_RAID56;
	}

	process_fs_features(features);

	ret = make_btrfs(fd, file, label, blocks, dev_block_count,
			 nodesize, leafsize,
			 sectorsize, stripesize, features);
	if (ret) {
		fprintf(stderr, "error during mkfs: %s\n", strerror(-ret));
		exit(1);
	}

	root = open_ctree(file, 0, OPEN_CTREE_WRITES);
	if (!root) {
		fprintf(stderr, "Open ctree failed\n");
		close(fd);
		exit(1);
	}
	root->fs_info->alloc_start = alloc_start;

	ret = make_root_dir(root, mixed);
	if (ret) {
		fprintf(stderr, "failed to setup the root directory\n");
		exit(1);
	}

	trans = btrfs_start_transaction(root, 1);

	if (dev_cnt == 0)
		goto raid_groups;

	btrfs_register_one_device(file);

	zero_end = 1;
	while (dev_cnt-- > 0) {
		int old_mixed = mixed;

		file = av[optind++];

		/*
		 * open without O_EXCL so that the problem should not
		 * occur by the following processing.
		 * (btrfs_register_one_device() fails if O_EXCL is on)
		 */
		fd = open(file, O_RDWR);
		if (fd < 0) {
			fprintf(stderr, "unable to open %s: %s\n", file,
				strerror(errno));
			exit(1);
		}
		ret = btrfs_device_already_in_root(root, fd,
						   BTRFS_SUPER_INFO_OFFSET);
		if (ret) {
			fprintf(stderr, "skipping duplicate device %s in FS\n",
				file);
			close(fd);
			continue;
		}
		ret = btrfs_prepare_device(fd, file, zero_end, &dev_block_count,
					   block_count, &mixed, discard);
		mixed = old_mixed;
		BUG_ON(ret);

		ret = btrfs_add_to_fsid(trans, root, fd, file, dev_block_count,
					sectorsize, sectorsize, sectorsize);
		BUG_ON(ret);
		btrfs_register_one_device(file);
	}

raid_groups:
	if (!source_dir_set) {
		ret = create_raid_groups(trans, root, data_profile,
				 data_profile_opt, metadata_profile,
				 metadata_profile_opt, mixed, ssd);
		BUG_ON(ret);
	}

	ret = create_data_reloc_tree(trans, root);
	BUG_ON(ret);

	printf("fs created label %s on %s\n\tnodesize %u leafsize %u "
	    "sectorsize %u size %s\n",
	    label, first_file, nodesize, leafsize, sectorsize,
	    pretty_size(btrfs_super_total_bytes(root->fs_info->super_copy)));

	printf("%s\n", BTRFS_BUILD_VERSION);
	btrfs_commit_transaction(trans, root);

	if (source_dir_set) {
		trans = btrfs_start_transaction(root, 1);
		ret = create_chunks(trans, root,
				    num_of_meta_chunks, size_of_data);
		BUG_ON(ret);
		btrfs_commit_transaction(trans, root);

		ret = make_image(source_dir, root, fd);
		BUG_ON(ret);
	}

	ret = close_ctree(root);
	BUG_ON(ret);
	free(label);
	return 0;
}
Example #6
0
int run_stats(shared_options_data_t *shared_options_data, stats_options_data_t *options_data) {
    file_stats_t *file_stats = file_stats_new();
    sample_stats_t **sample_stats;
    
    // List that stores the batches of records filtered by each thread
    list_t *output_list[shared_options_data->num_threads];
    // List that stores which thread filtered the next batch to save
    list_t *next_token_list = malloc(sizeof(list_t));

    int ret_code;
    double start, stop, total;
    
    vcf_file_t *vcf_file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!vcf_file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ped_file_t *ped_file = NULL;
    if (shared_options_data->ped_filename) {
        ped_file = ped_open(shared_options_data->ped_filename);
        if (!ped_file) {
            LOG_FATAL("PED file does not exist!\n");
        }
        if(options_data->variable) {
            set_variable_field(options_data->variable, 0, ped_file);
        } else {
            set_variable_field("PHENO", 6, ped_file);
        }
        
        if(options_data->variable_groups) {
            int n, m;
            char *variable_groups = strdup(options_data->variable_groups);
            char **groups;
            char **phenos_in_group;
            groups = split(variable_groups, ":", &n);
            for(int i = 0; i < n; i++){
                phenos_in_group = split(groups[i], ",", &m);
                if(set_phenotype_group(phenos_in_group, m, ped_file) < 0) {
                    LOG_ERROR("Variable can't appear in two groups\n");
                    return DUPLICATED_VARIABLE;
                }
                free(phenos_in_group);
            }
            ped_file->accept_new_values = 0;
            
            free(variable_groups);
            free(groups);
        } else {
            ped_file->accept_new_values = 1;
        }
        if(options_data->phenotype) {
            int n;
            char* phenotypes = strdup(options_data->phenotype);
            char** pheno_values = split(phenotypes, ",", &n);
            if(n != 2) {
                LOG_ERROR("To handle case-control test, only two phenotypes are supported\n");
                return MORE_THAN_TWO_PHENOTYPES;
            } else {
                set_unaffected_phenotype(pheno_values[0],ped_file);
                set_affected_phenotype(pheno_values[1],ped_file);
            }
        } else {
            set_unaffected_phenotype("1", ped_file);
            set_affected_phenotype("2", ped_file);
        }
        
        LOG_INFO("About to read PED file...\n");
        // Read PED file before doing any processing
        ret_code = ped_read(ped_file);
        if (ret_code != 0) {
            LOG_FATAL_F("Can't read PED file: %s\n", ped_file->filename);
        }
        if(!ped_file->num_field) {
            LOG_ERROR_F("Can't find the specified field \"%s\" in file: %s \n", options_data->variable, ped_file->filename);
            return VARIABLE_FIELD_NOT_FOUND;
        }
    }
    
    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }
    
    // Initialize variables related to the different threads
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        output_list[i] = (list_t*) malloc(sizeof(list_t));
        list_init("input", 1, shared_options_data->num_threads * shared_options_data->batch_lines, output_list[i]);
    }
    list_init("next_token", shared_options_data->num_threads, INT_MAX, next_token_list);
    
    LOG_INFO("About to retrieve statistics from VCF file...\n");

#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, vcf_file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, vcf_file);
            }

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(vcf_file);
        }
        
#pragma omp section
        {
            // Enable nested parallelism and set the number of threads the user has chosen
            omp_set_nested(1);
            LOG_DEBUG_F("Thread %d processes data\n", omp_get_thread_num());
            
            individual_t **individuals = NULL;
            khash_t(ids) *sample_ids = NULL;
            khash_t(str) *phenotype_ids = NULL;
            int num_phenotypes;
            
            start = omp_get_wtime();
            
            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(vcf_file)) != NULL) {
                if (i == 0) {
                    sample_stats = malloc (get_num_vcf_samples(vcf_file) * sizeof(sample_stats_t*));
                    for (int j = 0; j < get_num_vcf_samples(vcf_file); j++) {
                        sample_stats[j] = sample_stats_new(array_list_get(j, vcf_file->samples_names));
                    }
                    
                    if (ped_file) {
                        // Create map to associate the position of individuals in the list of samples defined in the VCF file
                        sample_ids = associate_samples_and_positions(vcf_file);
                        // Sort individuals in PED as defined in the VCF file
                        individuals = sort_individuals(vcf_file, ped_file);
                        // Get the khash of the phenotypes in PED file
                        phenotype_ids = get_phenotypes(ped_file);
                        num_phenotypes = get_num_variables(ped_file);
                    }
                }
                
                if (i % 50 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                                i, omp_get_thread_num(),
                                batch->records->size, batch->records->capacity);
                }

                // Divide the list of passed records in ranges of size defined in config file
                int num_chunks;
                int *chunk_sizes = NULL;
                array_list_t *input_records = batch->records;
                int *chunk_starts = create_chunks(input_records->size, 
                                                  ceil((float) shared_options_data->batch_lines / shared_options_data->num_threads), 
                                                  &num_chunks, &chunk_sizes);
                
                // OpenMP: Launch a thread for each range
                #pragma omp parallel for num_threads(shared_options_data->num_threads)
                for (int j = 0; j < num_chunks; j++) {
                    LOG_DEBUG_F("[%d] Stats invocation\n", omp_get_thread_num());
                    // Invoke variant stats and/or sample stats when applies
                    if (options_data->variant_stats) {
                        int index = omp_get_thread_num() % shared_options_data->num_threads;
                        ret_code = get_variants_stats((vcf_record_t**) (input_records->items + chunk_starts[j]),
                                                      chunk_sizes[j], individuals, sample_ids,num_phenotypes, output_list[index], file_stats); 
                    }
                    
                    if (options_data->sample_stats) {
                        ret_code |= get_sample_stats((vcf_record_t**) (input_records->items + chunk_starts[j]), 
                                                      chunk_sizes[j], individuals, sample_ids, sample_stats, file_stats);
                    }
                }
                
                if (options_data->variant_stats) {
                    // Insert as many tokens as elements correspond to each thread
                    for (int t = 0; t < num_chunks; t++) {
                        for (int s = 0; s < chunk_sizes[t]; s++) {
                            list_item_t *token_item = list_item_new(t, 0, NULL);
                            list_insert_item(token_item, next_token_list);
                        }
                    }
                }
                
                free(chunk_starts);
                free(chunk_sizes);
                vcf_batch_free(batch);
                
                i++;
            }
            
            stop = omp_get_wtime();
            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            
            // Decrease list writers count
            for (i = 0; i < shared_options_data->num_threads; i++) {
                list_decr_writers(next_token_list);
                list_decr_writers(output_list[i]);
            }
            
            if (sample_ids) { kh_destroy(ids, sample_ids); }
            if (individuals) { free(individuals); }
        }
        
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d writes the output\n", omp_get_thread_num());
            
            char *stats_prefix = get_vcf_stats_filename_prefix(shared_options_data->vcf_filename, 
                                                               shared_options_data->output_filename, 
                                                               shared_options_data->output_directory);
            
            // File names and descriptors for output to plain text files
            char *stats_filename, *summary_filename, *phenotype_filename;
            FILE *stats_fd, *summary_fd, **phenotype_fd;
            
            char *stats_db_name;
            sqlite3 *db = NULL;
            khash_t(stats_chunks) *hash;
            
            khash_t(str) *phenotype_ids;
            int num_phenotypes;
            if(ped_file){
                phenotype_ids = get_phenotypes(ped_file);
                num_phenotypes = get_num_variables(ped_file);
            }
            
            if (options_data->save_db) {
                delete_files_by_extension(shared_options_data->output_directory, "db");
                stats_db_name = calloc(strlen(stats_prefix) + strlen(".db") + 2, sizeof(char));
                sprintf(stats_db_name, "%s.db", stats_prefix);
                create_stats_db(stats_db_name, VCF_CHUNKSIZE, create_vcf_query_fields, &db);
                hash = kh_init(stats_chunks);
            }
            
            // Write variant (and global) statistics
            if (options_data->variant_stats) {
                stats_filename = get_variant_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of variants: %s\n", stats_filename);
                }
                
                //Open one file for each phenotype
                if(ped_file){
                    phenotype_fd = malloc(sizeof(FILE*)*num_phenotypes);
                    if(options_data->variable_groups){
                        int n;
                        char *variable_groups = strdup(options_data->variable_groups);
                        char ** names = split(variable_groups, ":", &n);
                        for(int i = 0; i < n; i++) {
                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, names[i]);
                            if(!(phenotype_fd[i] = fopen(phenotype_filename, "w"))) {
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename);
                            }
                            free(phenotype_filename);
                        }
                        free(names);
                        free(variable_groups);
                    } else {
                 
                        for (khint_t i = kh_begin(phenotype_ids); i != kh_end(phenotype_ids); ++i) {
                            if (!kh_exist(phenotype_ids,i)) continue;
                            
                            phenotype_filename = get_variant_phenotype_stats_output_filename(stats_prefix, kh_key(phenotype_ids,i));
                            if(!(phenotype_fd[kh_val(phenotype_ids,i)] = fopen(phenotype_filename, "w"))) {
                                LOG_FATAL_F("Can't open file for writing statistics of variants per phenotype: %s\n", stats_filename);
                            }
                            free(phenotype_filename);
                        }
                    }
                }
                // Write header
                report_vcf_variant_stats_header(stats_fd);
                if(ped_file){
                    for(int i = 0; i < num_phenotypes; i++)
                        report_vcf_variant_phenotype_stats_header(phenotype_fd[i]);
                }
                
                // For each variant, generate a new line
                int avail_stats = 0;
                variant_stats_t *var_stats_batch[VCF_CHUNKSIZE];
                list_item_t *token_item = NULL, *output_item = NULL;
                while ( token_item = list_remove_item(next_token_list) ) {
                    output_item = list_remove_item(output_list[token_item->id]);
                    assert(output_item);
                    var_stats_batch[avail_stats] = output_item->data_p;
                    avail_stats++;
                    
                    // Run only when certain amount of stats is available
                    if (avail_stats >= VCF_CHUNKSIZE) {
                        report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);
                        
                        if(ped_file)
                            for(int i = 0; i < num_phenotypes; i++)
                                report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                        // Free all stats from the "batch"
                        for (int i = 0; i < avail_stats; i++) {
                            variant_stats_free(var_stats_batch[i]);
                        }
                        avail_stats = 0;
                    }
                    
                    // Free resources
                    list_item_free(output_item);
                    list_item_free(token_item);
                }
                
                if (avail_stats > 0) {
                    report_vcf_variant_stats(stats_fd, db, hash, avail_stats, var_stats_batch);
                    
                    if(ped_file)
                        for(int i = 0; i < num_phenotypes; i++)
                            report_vcf_variant_phenotype_stats(phenotype_fd[i], avail_stats, var_stats_batch, i);

                    // Free all stats from the "batch"
                    for (int i = 0; i < avail_stats; i++) {
                        variant_stats_free(var_stats_batch[i]);
                    }
                    avail_stats = 0;
                }
                
                // Write whole file stats (data only got when launching variant stats)
                summary_filename = get_vcf_file_stats_output_filename(stats_prefix);
                if (!(summary_fd = fopen(summary_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics summary: %s\n", summary_filename);
                }
                report_vcf_summary_stats(summary_fd, db, file_stats);
                
                free(stats_filename);
                free(summary_filename);
                
                // Close variant stats file
                if (stats_fd) { fclose(stats_fd); }
                if (summary_fd) { fclose(summary_fd); }
				if(ped_file){
		            for(int i = 0; i < num_phenotypes; i++)
		                if(phenotype_fd[i]) fclose(phenotype_fd[i]);
					free(phenotype_fd);
				}
            }
            
            // Write sample statistics
            if (options_data->sample_stats) {
                stats_filename = get_sample_stats_output_filename(stats_prefix);
                if (!(stats_fd = fopen(stats_filename, "w"))) {
                    LOG_FATAL_F("Can't open file for writing statistics of samples: %s\n", stats_filename);
                }
                
                report_vcf_sample_stats_header(stats_fd);
                report_vcf_sample_stats(stats_fd, NULL, vcf_file->samples_names->size, sample_stats);
                
                // Close sample stats file
                free(stats_filename);
                if (stats_fd) { fclose(stats_fd); }
            }
            
            free(stats_prefix);
            
            if (db) {
                insert_chunk_hash(VCF_CHUNKSIZE, hash, db);
                create_stats_index(create_vcf_index, db);
                close_stats_db(db, hash);
            }
            
        }
    }
    
    for (int i = 0; i < get_num_vcf_samples(vcf_file); i++) {
        sample_stats_free(sample_stats[i]);
    }
    free(sample_stats);
    free(file_stats);
    
    free(next_token_list);
    for (int i = 0; i < shared_options_data->num_threads; i++) {
        free(output_list[i]);
    }
    
    vcf_close(vcf_file);
    if (ped_file) { ped_close(ped_file, 1,1); }
    
    return 0;
}