Example #1
0
region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) {
    gff_file_t *file = gff_open(filename);
    if (file == NULL) {
        return NULL;
    } 
    
    region_table_t *regions_table = create_table(url, species, version);
    
    int ret_code = 0;
    size_t max_batches = 20;
    size_t batch_size = 2000;
    list_t *read_list = (list_t*) malloc (sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);
    
    #pragma omp parallel sections
    {
        // The producer reads the GFF file
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            ret_code = gff_read_batches(read_list, batch_size, file);
            list_decr_writers(read_list);
            
            if (ret_code) {
                LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code);
            }
        }
        
        // The consumer inserts regions in the structure 
        #pragma omp section
        {    
            list_item_t *item = NULL, *batch_item = NULL;
            gff_batch_t *batch;
            gff_record_t *record;
            while ( (item = list_remove_item(read_list)) != NULL ) {
                batch = item->data_p;
                // For each record in the batch, generate a new region
                for (batch_item = batch->first_p; batch_item != NULL; batch_item = batch_item->next_p) {
                    record = batch_item->data_p;
                    
                    region_t *region = (region_t*) malloc (sizeof(region_t));
                    region->chromosome = (char*) calloc ((strlen(record->sequence)+1), sizeof(char));
                    strncat(region->chromosome, record->sequence, strlen(record->sequence));
                    region->start_position = record->start;
                    region->end_position = record->end;
                    LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position);
                    
                    insert_region(region, regions_table);
                }
               
                gff_batch_free(item->data_p);
                list_item_free(item);
            }
        }
    }
    
    gff_close(file, 0);
    
    return regions_table;
}
Example #2
0
region_table_t *parse_regions_from_gff_file(char *filename, const char *url, const char *species, const char *version) {
    gff_file_t *file = gff_open(filename);
    if (file == NULL) {
        return NULL;
    }

    region_table_t *regions_table = new_region_table_from_ws(url, species, version);

    int ret_code = 0;
    size_t max_batches = 20, batch_size = 2000;
    list_t *read_list = (list_t*) malloc (sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);

    #pragma omp parallel sections
    {
        // The producer reads the GFF file
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            ret_code = gff_read_batches(read_list, batch_size, file);
            list_decr_writers(read_list);

            if (ret_code) {
                LOG_FATAL_F("Error while reading GFF file %s (%d)\n", filename, ret_code);
            }
        }

        // The consumer inserts regions in the structure
        #pragma omp section
        {
            list_item_t *item = NULL;
            gff_batch_t *batch;
            gff_record_t *record;

            region_t *regions_batch[REGIONS_CHUNKSIZE];
            int avail_regions = 0;

            while ( item = list_remove_item(read_list) ) {
                batch = item->data_p;
                // For each record in the batch, generate a new region
                for (int i = 0; i < batch->records->size; i++) {
                    record = batch->records->items[i];

                    region_t *region = region_new(strndup(record->sequence, record->sequence_len),
                                                  record->start, record->end,
                                                  record->strand ? strndup(&record->strand, 1) : NULL,
                                                  record->feature ? strndup(record->feature, record->feature_len) : NULL);

                    LOG_DEBUG_F("region '%s:%u-%u'\n", region->chromosome, region->start_position, region->end_position);

                    regions_batch[avail_regions++] = region;

                    // Save when the recommended size is reached
                    if (avail_regions == REGIONS_CHUNKSIZE) {
                        insert_regions(regions_batch, avail_regions, regions_table);
                        for (int i = 0; i < avail_regions; i++) {
                            free(regions_batch[i]);
                        }
                        avail_regions = 0;
                    }
                }

                gff_batch_free(batch);
                list_item_free(item);
            }

            // Save the remaining regions that did not fill a batch
            if (avail_regions > 0) {
                insert_regions(regions_batch, avail_regions, regions_table);
                for (int i = 0; i < avail_regions; i++) {
                    free(regions_batch[i]);
                }
                avail_regions = 0;
            }
        }
    }

    finish_region_table_loading(regions_table);

    list_free_deep(read_list, NULL);

    gff_close(file, 1);

    return regions_table;
}
Example #3
0
int main (int argc, char *argv[])
{
    size_t max_batches = 20;
    size_t batch_size = 2000;
    list_t *read_list = (list_t*) malloc (sizeof(list_t));
    list_init("batches", 1, max_batches, read_list);

    int ret_code;
    double start, stop, total;
    char *filename = (char*) malloc ((strlen(argv[1])+1) * sizeof(char));
    strncat(filename, argv[1], strlen(argv[1]));
    gff_file_t* file;

    init_log_custom(LOG_LEVEL_DEBUG, 1, NULL, "w");

    #pragma omp parallel sections private(start, stop, total) lastprivate(file)
    {
        #pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the GFF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            file = gff_open(filename);
            ret_code = gff_read_batches(read_list, batch_size, file);

            stop = omp_get_wtime();
            total = (stop - start);

            if (ret_code) {
                LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code);
            }
            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Writing to a new file
            if (argc == 3)
            {
                start = omp_get_wtime();

                ret_code = gff_write(file, argv[2]);

                stop = omp_get_wtime();
                total = (stop - start);

                if (ret_code) {
                    LOG_ERROR_F("[%dW] Error code = %d\n", omp_get_thread_num(), ret_code);
                }
                LOG_INFO_F("[%dW] Time elapsed = %f s\n", omp_get_thread_num(), total);
                LOG_INFO_F("[%dW] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
            }

            list_decr_writers(read_list);

            gff_close(file, 0);
        }
        #pragma omp section
        {
            printf("1st log debug\n");
            LOG_DEBUG_F("OMP num threads = %d\n", omp_get_num_threads());
            LOG_DEBUG_F("Thread %d prints info\n", omp_get_thread_num());
            printf("after 1st log debug\n");

            start = omp_get_wtime();

            int i = 0;
            list_item_t* item = NULL;
            FILE *out = fopen("result.gff", "w");
            while ( (item = list_remove_item(read_list)) != NULL ) {
                if (i % 200 == 0)
                {
                    int debug = 1;
                    LOG_DEBUG_F("Batch %d reached by thread %d - %zu/%zu records \n", i, omp_get_thread_num(),
                    ((gff_batch_t*) item->data_p)->length, ((gff_batch_t*) item->data_p)->max_length);
                }

//             gff_write_to_file(file, out);
//             gff_batch_print(stdout, item->data_p);
                write_gff_batch(item->data_p, out);
                gff_batch_free(item->data_p);
                list_item_free(item);
                i++;
            }
            fclose(out);

            stop = omp_get_wtime();
            total = (stop - start);

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);
        }
    }

    free(read_list);

    return 0;
}