region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) { gff_file_t *file = gff_open(filename); if (file == NULL) { return NULL; } region_table_t *regions_table = create_table(url, species, version); int ret_code = 0; size_t max_batches = 20; size_t batch_size = 2000; list_t *read_list = (list_t*) malloc (sizeof(list_t)); list_init("batches", 1, max_batches, read_list); #pragma omp parallel sections { // The producer reads the GFF file #pragma omp section { LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num()); ret_code = gff_read_batches(read_list, batch_size, file); list_decr_writers(read_list); if (ret_code) { LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code); } } // The consumer inserts regions in the structure #pragma omp section { list_item_t *item = NULL, *batch_item = NULL; gff_batch_t *batch; gff_record_t *record; while ( (item = list_remove_item(read_list)) != NULL ) { batch = item->data_p; // For each record in the batch, generate a new region for (batch_item = batch->first_p; batch_item != NULL; batch_item = batch_item->next_p) { record = batch_item->data_p; region_t *region = (region_t*) malloc (sizeof(region_t)); region->chromosome = (char*) calloc ((strlen(record->sequence)+1), sizeof(char)); strncat(region->chromosome, record->sequence, strlen(record->sequence)); region->start_position = record->start; region->end_position = record->end; LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position); insert_region(region, regions_table); } gff_batch_free(item->data_p); list_item_free(item); } } } gff_close(file, 0); return regions_table; }
region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) { gff_file_t *file = gff_open(filename); if (file == NULL) { return NULL; } region_table_t *regions_table = new_region_table_from_ws(url, species, version); int ret_code = 0; size_t max_batches = 20, batch_size = 2000; list_t *read_list = (list_t*) malloc (sizeof(list_t)); list_init("batches", 1, max_batches, read_list); #pragma omp parallel sections { // The producer reads the GFF file #pragma omp section { LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num()); ret_code = gff_read_batches(read_list, batch_size, file); list_decr_writers(read_list); if (ret_code) { LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code); } } // The consumer inserts regions in the structure #pragma omp section { list_item_t *item = NULL; gff_batch_t *batch; gff_record_t *record; region_t *regions_batch[REGIONS_CHUNKSIZE]; int avail_regions = 0; while ( item = list_remove_item(read_list) ) { batch = item->data_p; // For each record in the batch, generate a new region for (int i = 0; i < batch->records->size; i++) { record = batch->records->items[i]; region_t *region = region_new(strndup(record->sequence, record->sequence_len), record->start, record->end, record->strand ? strndup(&record->strand, 1) : NULL, record->feature ? strndup(record->feature, record->feature_len) : NULL); LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position); regions_batch[avail_regions++] = region; // Save when the recommended size is reached if (avail_regions == REGIONS_CHUNKSIZE) { insert_regions(regions_batch, avail_regions, regions_table); for (int i = 0; i < avail_regions; i++) { free(regions_batch[i]); } avail_regions = 0; } } gff_batch_free(batch); list_item_free(item); } // Save the remaining regions that did not fill a batch if (avail_regions > 0) { insert_regions(regions_batch, avail_regions, regions_table); for (int i = 0; i < avail_regions; i++) { free(regions_batch[i]); } avail_regions = 0; } } } finish_region_table_loading(regions_table); list_free_deep(read_list, NULL); gff_close(file, 1); return regions_table; }
int main (int argc, char *argv[]) { size_t max_batches = 20; size_t batch_size = 2000; list_t *read_list = (list_t*) malloc (sizeof(list_t)); list_init("batches", 1, max_batches, read_list); int ret_code; double start, stop, total; char *filename = (char*) malloc ((strlen(argv[1])+1) * sizeof(char)); strncat(filename, argv[1], strlen(argv[1])); gff_file_t* file; init_log_custom(LOG_LEVEL_DEBUG, 1, NULL, "w"); #pragma omp parallel sections private(start, stop, total) lastprivate(file) { #pragma omp section { LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num()); // Reading start = omp_get_wtime(); file = gff_open(filename); ret_code = gff_read_batches(read_list, batch_size, file); stop = omp_get_wtime(); total = (stop - start); if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); } LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); // Writing to a new file if (argc == 3) { start = omp_get_wtime(); ret_code = gff_write(file, argv[2]); stop = omp_get_wtime(); total = (stop - start); if (ret_code) { LOG_ERROR_F("[%dW] Error code = %d\n", omp_get_thread_num(), ret_code); } LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); } list_decr_writers(read_list); gff_close(file, 0); } #pragma omp section { printf("1st log debug\n"); LOG_DEBUG_F("OMP num threads = %d\n", omp_get_num_threads()); LOG_DEBUG_F("Thread %d prints info\n", omp_get_thread_num()); printf("after 1st log debug\n"); start = omp_get_wtime(); int i = 0; list_item_t* item = NULL; FILE *out = fopen("result.gff", "w"); while ( (item = list_remove_item(read_list)) != NULL ) { if (i % 200 == 0) { int debug = 1; LOG_DEBUG_F("Batch %d reached by thread %d - %zu/%zu records \n", i, omp_get_thread_num(), ((gff_batch_t*) item->data_p)->length, ((gff_batch_t*) item->data_p)->max_length); } // gff_write_to_file(file, out); // gff_batch_print(stdout, item->data_p); write_gff_batch(item->data_p, out); gff_batch_free(item->data_p); list_item_free(item); i++; } fclose(out); stop = omp_get_wtime(); total = (stop - start); LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total); LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000); } } free(read_list); return 0; }