예제 #1
0
파일: import.c 프로젝트: jamie-pate/jstruct
struct jstruct_result
_jstruct_import(struct json_object *obj, const void *data,
        const struct jstruct_object_property *properties, struct json_object *errors) {
    _init_importers();
    if (errors != NULL && json_object_get_type(errors) != json_type_array) {
        return jstruct_error_new(jstruct_error_errors_not_array_or_null, NULL, json_object_get_type(errors));
    }
    const struct jstruct_object_property *property;
    struct json_object *prop;
    struct jstruct_result result = JSTRUCT_OK;
    result.allocated = array_list_new(jstruct_allocated_free);
    for (property = properties; property->name; ++property) {
        void *ptr = jstruct_prop_ptr(data, property, JSTRUCT_PROP_PTR_GET_NO_DEREF);
        struct jstruct_result err;
        if (json_object_object_get_ex(obj, property->name, &prop)) {
            if (json_object_get_type(prop) != property->type.json) {
                err = jstruct_error_new(jstruct_error_incorrect_type, property->name, json_object_get_type(prop));
            } else {
                jstruct_import_importer import = importers[json_type_index(property->type.json)];
                err = import(prop, data, ptr, property);
            }
        } else {
            if (!set_null(ptr, property)) {
                err = jstruct_error_array_add(errors, jstruct_error_not_nullable, property->name, 0);
            }
        }
        jstruct_error_consume(&result, &err, errors, property->name, -1);
    }
    if (result.allocated->length == 0) {
        array_list_free(result.allocated);
        result.allocated = NULL;
    }
    return result;
}
예제 #2
0
파일: gff_batch.c 프로젝트: mrG7/hpg-libs
/*
 * Release a GFF batch: its text buffer, its record list (each record is
 * handed to gff_record_free) and finally the batch structure itself.
 * `batch` must not be NULL.
 */
void gff_batch_free(gff_batch_t* batch) {
    assert(batch);

    /* free() accepts NULL, so the text buffer needs no guard */
    free(batch->text);
    array_list_free(batch->records, (void *)gff_record_free);

    free(batch);
}
/*
 * Write every alignment stored in `array_list` to `bam_file`.
 *
 * Each non-NULL alignment is converted to a BAM record (base quality 33),
 * written, and then both the BAM record and the alignment are released.
 * A NULL entry is reported through LOG_FATAL_F. The list itself is freed at
 * the end (without an item callback) when it is non-NULL.
 */
void write_mapped_read(array_list_t *array_list, bam_file_t *bam_file) {
  const size_t num_items = array_list_size(array_list);

  for (size_t pos = 0; pos < num_items; pos++) {
    alignment_t *current = (alignment_t *) array_list_get(pos, array_list);

    LOG_DEBUG("writting bam..\n");

    if (current == NULL) {
      LOG_FATAL_F("alig is NULL, num_items = %lu\n", num_items);
    } else {
      bam1_t *record = convert_to_bam(current, 33);
      bam_fwrite(record, bam_file);
      bam_destroy1(record);
      alignment_free(current);
    }
  }

  if (array_list != NULL) {
    array_list_free(array_list, NULL);
  }
}
예제 #4
0
/*
 * Resolve every component of `key_path` except the last one, starting at
 * thiz->obj, and return the JSON object reached that way (the parent of the
 * leaf).
 *
 * thiz     - preferences instance whose root object is searched.
 * key_path - path to split into keys via tg_shared_preferences_parse_keypath.
 * leaf_key - out: on success receives a newly allocated copy of the last
 *            path component; ownership passes to the caller.
 *
 * Returns NULL (and leaves *leaf_key untouched or NULL) when an argument is
 * missing, the path cannot be parsed, the path has no components, an
 * intermediate key does not exist, or the leaf key copy cannot be allocated.
 */
json_object * tg_shared_preferences_find_parent_of_leaf(SharedPreferences* thiz,const CHAR* key_path,CHAR** leaf_key)
{
    struct json_object *jso=NULL;
    struct array_list* key_list = NULL;
    INT32 key_list_len = 0;
    INT32 idx = 0;

    return_val_if_fail((thiz&&key_path&&leaf_key),NULL);
    key_list = array_list_new(tg_shared_preferences_key_free);
    return_val_if_fail((key_list),NULL);

    /* The key list is owned here, so it must be released on every exit. */
    if (!tg_shared_preferences_parse_keypath(key_path,key_list,&key_list_len) || key_list_len < 1)
    {
        array_list_free(key_list);
        return NULL;
    }

    for (jso=thiz->obj; idx<key_list_len-1; idx++)
    {
        jso = json_object_object_get(jso,(CHAR*)array_list_get_idx(key_list,idx));
        if (jso==NULL)
            break;
    }

    if (jso!=NULL)
    {
        CHAR* key = (CHAR*)array_list_get_idx(key_list,key_list_len-1);
        CHAR* copy = NULL;

        if (key!=NULL)
            copy = TG_CALLOC((strlen(key)+1),1);

        if (copy!=NULL)
        {
            strcpy(copy,key);
            *leaf_key = copy;
        }
        else
        {
            /* no leaf key available: report failure instead of crashing */
            jso = NULL;
        }
    }
    array_list_free(key_list);
    return jso;

}
예제 #5
0
/*
 * Merge every position still stored in `positions_read`.
 *
 * For each occupied bucket the list of record/file links is passed to
 * merge_position(); when that reports no error the merged record is wrapped
 * in a list item and inserted into `output_list`. The per-position list is
 * then freed and its bucket removed from the table.
 *
 * The buckets are processed by an OpenMP parallel loop using
 * shared_options_data->num_threads threads.
 *
 * Returns the number of merged records inserted into `output_list`.
 */
int merge_remaining_interval(kh_pos_t* positions_read, vcf_file_t **files, shared_options_data_t *shared_options_data,
                              merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;

    #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);

            // Launch merge
            int err_code = 0;
            vcf_record_t *merged = merge_position((vcf_record_file_link **) records_in_position->items, records_in_position->size,
                                                  files, options_data->num_files, options_data, &err_code);

            if (!err_code) {
                // NOTE(review): relies on list_insert_item being safe to call from several threads - confirm
                list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                list_insert_item(item, output_list);
                num_entries += 1;
            }

            // Free empty nodes (lists of records in the same position)
            array_list_free(records_in_position, vcf_record_file_link_free);

            // kh_del updates flag words and the element count shared by all
            // buckets, so concurrent deletions must not interleave.
            #pragma omp critical (kh_pos_delete)
            {
                kh_del(pos, positions_read, k);
            }
        }
    }

    return num_entries;
}
예제 #6
0
/*
 * Release a VCF batch and the records it holds (each record goes through
 * vcf_record_free). The text buffer is only free()d when `mmap_vcf` is unset.
 * `batch` must not be NULL.
 */
void vcf_batch_free(vcf_batch_t* batch) {
    assert(batch);

    if (!mmap_vcf) {
        /* free() ignores NULL, so no separate check on the pointer */
        free(batch->text);
    }

    array_list_free(batch->records, vcf_record_free);
    free(batch);
}
예제 #7
0
/*
 * Release a VCF record together with every string field it points to
 * (chromosome, id, reference, alternate, filter, info, format) and its
 * sample list, whose items are released with free().
 * `record` must not be NULL.
 */
void vcf_record_free_deep(vcf_record_t *record) {
    assert(record);

    /* collect the field buffers, then release them in one pass */
    void *fields[] = {
        record->chromosome,
        record->id,
        record->reference,
        record->alternate,
        record->filter,
        record->info,
        record->format,
    };
    for (size_t f = 0; f < sizeof fields / sizeof fields[0]; f++) {
        free(fields[f]);
    }

    array_list_free(record->samples, free);
    free(record);
}
예제 #8
0
/*
 * Release one jstruct_allocated entry: first the payload, according to the
 * entry type (raw memory via free(), array list via array_list_free()), then
 * the entry itself. Entries of any other type only have the entry freed.
 */
void jstruct_allocated_free(void *data) {
    struct jstruct_allocated *entry = (struct jstruct_allocated *)data;

    if (entry->type == jstruct_allocated_type_raw) {
        free(entry->data);
    } else if (entry->type == jstruct_allocated_type_arraylist) {
        array_list_free((array_list *)entry->data);
    }

    free(entry);
}
void *fastq_reader(void *input) {
     struct timeval start, end;
     double time;
     extern size_t fd_read_bytes;
     size_t read_bytes;
     //if (time_on) { start_timer(start); }

     wf_input_t *wf_input = (wf_input_t *) input;
     batch_t *new_batch = NULL;
     batch_t *batch = wf_input->batch;
     fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input;
     array_list_t *reads = array_list_new(10000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

     if (fq_reader_input->gzip) {
       //Gzip fastq file
       if (fq_reader_input->flags == SINGLE_END_MODE) {
	 fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1);
       } else {
	 //printf("Gzip Reader for pair-end not implemented\n");;
	 fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2);
	 //fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, 
	 //		      fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2);
       }
     } else {
       //Fastq file
       if (fq_reader_input->flags == SINGLE_END_MODE) {
	 read_bytes = fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1);
       } else {
	 read_bytes = fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size, 
				      fq_reader_input->fq_file1, fq_reader_input->fq_file2);
       }
       fd_read_bytes += read_bytes;
     }

     size_t num_reads = array_list_size(reads);

     if (num_reads == 0) {
	  array_list_free(reads, (void *)fastq_read_free);
     } else {
	  mapping_batch_t *mapping_batch = mapping_batch_new(reads, 
							     batch->pair_input->pair_mng);

	  new_batch = batch_new(batch->bwt_input, batch->region_input, batch->cal_input, 
				batch->pair_input, batch->preprocess_rna, batch->sw_input, batch->writer_input, 
				batch->mapping_mode, mapping_batch);
     }

     //if (time_on) { stop_timer(start, end, time); timing_add(time, FASTQ_READER, timing); }
     //printf("Read batch %i\n", num_reads);
     
     return new_batch;
}
예제 #10
0
/*
 * Release a workflow and everything it owns: stage timings, the per-stage
 * pending-item lists, the completed-item list, the stage/producer/consumer
 * labels and the stage-times mutex storage. A NULL workflow is ignored.
 */
void workflow_free(workflow_t *wf) {
     if (!wf) return;

     // free() is a no-op on NULL, so plain buffers are released unguarded
     free(wf->stage_times);

     if (wf->pending_items != NULL) {
	  for (int stage = 0; stage < wf->num_stages; stage++) {
	       array_list_free(wf->pending_items[stage], NULL);
	  }
	  free(wf->pending_items);
     }

     if (wf->completed_items != NULL) {
	  array_list_free(wf->completed_items, NULL);
     }

     // The label table is only released when there is at least one stage
     if (wf->stage_labels && wf->num_stages) {
	  for (int stage = 0; stage < wf->num_stages; stage++) {
	       free(wf->stage_labels[stage]);
	  }
	  free(wf->stage_labels);
     }

     free(wf->producer_label);
     free(wf->consumer_label);
     free(wf->stage_times_mutex);

     free(wf);
}
예제 #11
0
/*
 * Workflow producer for the SA mapper: read the next chunk of FASTQ reads and
 * wrap it in a new sa_wf_batch_t.
 *
 * `input` is a sa_wf_input_t. Reads are taken from the gzip or plain FASTQ
 * files of its fq_reader_input, single-end or paired-end depending on `flags`.
 *
 * Returns the new workflow batch, or NULL when no reads were obtained (the
 * empty read list is freed in that case).
 */
void *sa_fq_reader(void *input) {
  sa_wf_input_t *wf_input = (sa_wf_input_t *) input;
  sa_wf_batch_t *curr_wf_batch = wf_input->wf_batch;
  fastq_batch_reader_input_t *fq_reader_input = wf_input->fq_reader_input;

  array_list_t *reads = array_list_new(fq_reader_input->batch_size, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  if (!fq_reader_input->gzip) {
    // Plain fastq file
    if (fq_reader_input->flags == SINGLE_END_MODE) {
      fastq_fread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_file1);
    } else {
      fastq_fread_bytes_aligner_pe(reads, fq_reader_input->batch_size,
				   fq_reader_input->fq_file1, fq_reader_input->fq_file2);
    }
  } else if (fq_reader_input->flags == SINGLE_END_MODE) {
    // Gzip fastq file, single-end
    fastq_gzread_bytes_se(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1);
  } else {
    // Gzip fastq file, paired-end
    fastq_gzread_bytes_pe(reads, fq_reader_input->batch_size, fq_reader_input->fq_gzip_file1, fq_reader_input->fq_gzip_file2);
  }

  if (array_list_size(reads) == 0) {
    // Nothing was read: drop the list and signal "no batch"
    array_list_free(reads, (void *)fastq_read_free);
    return NULL;
  }

  sa_mapping_batch_t *sa_mapping_batch = sa_mapping_batch_new(reads);
  sa_mapping_batch->bam_format = wf_input->bam_format;

  return sa_wf_batch_new(curr_wf_batch->options,
			 curr_wf_batch->sa_index,
			 curr_wf_batch->writer_input,
			 sa_mapping_batch,
			 NULL);
}
예제 #12
0
/*
 * Merge the positions stored in `positions_read` that lie at or before
 * max_chromosome_merged:max_position_merged.
 *
 * For each occupied bucket, the first record of the position is compared to
 * the limit (chromosomes via compare_chromosomes with `chromosome_order`,
 * positions via compare_positions). Positions within the limit are passed to
 * merge_position(); when that reports no error the merged record is wrapped
 * in a list item and inserted into `output_list`. The per-position list is
 * then freed and its bucket removed. Positions beyond the limit are left in
 * the table.
 *
 * The buckets are processed by an OpenMP parallel loop using
 * shared_options_data->num_threads threads.
 *
 * Returns the number of merged records inserted into `output_list`.
 */
int merge_interval(kh_pos_t* positions_read, char *max_chromosome_merged, unsigned long max_position_merged,
                    char **chromosome_order, int num_chromosomes, vcf_file_t **files,
                    shared_options_data_t *shared_options_data, merge_options_data_t *options_data, list_t *output_list) {
    int num_entries = 0;

    #pragma omp parallel for num_threads(shared_options_data->num_threads) reduction(+:num_entries)
    for (int k = kh_begin(positions_read); k < kh_end(positions_read); k++) {
        if (kh_exist(positions_read, k)) {
            array_list_t *records_in_position = kh_value(positions_read, k);
            assert(records_in_position);

            // NOTE(review): assumes every stored list holds at least one link - confirm
            vcf_record_t *record = ((vcf_record_file_link*) array_list_get(0, records_in_position))->record;
            vcf_record_file_link **links = NULL;
            int num_links = 0;

            // Remove positions prior to the last chromosome:position to merge
            int cmp_chrom = compare_chromosomes(record->chromosome, max_chromosome_merged, chromosome_order, num_chromosomes);
            if (cmp_chrom < 0 || (cmp_chrom == 0 && compare_positions(record->position, max_position_merged) <= 0)) {
                links = records_in_position->items;
                num_links = records_in_position->size;
            }

            // Launch merge
            if (num_links > 0) {
                int err_code = 0;
                vcf_record_t *merged = merge_position(links, num_links, files, options_data->num_files, options_data, &err_code);

                if (!err_code) {
                    // NOTE(review): relies on list_insert_item being safe to call from several threads - confirm
                    list_item_t *item = list_item_new(k, MERGED_RECORD, merged);
                    list_insert_item(item, output_list);
                    num_entries += 1;
                }

                // Free empty nodes (lists of records in the same position)
                array_list_free(records_in_position, vcf_record_file_link_free);

                // kh_del updates flag words and the element count shared by all
                // buckets, so concurrent deletions must not interleave.
                #pragma omp critical (kh_pos_delete)
                {
                    kh_del(pos, positions_read, k);
                }
            }
        } // End kh_exist
    }

    return num_entries;
}
예제 #13
0
/*
 * Store a batch of variant statistics in the SQLite database `db`.
 *
 * Each of the `num_variants` entries in `stats_batch` is turned into a
 * variant_stats_db_fields_t row; the rows are collected in a temporary list,
 * inserted with insert_variant_stats_db_fields_list(), and then released
 * together with the list.
 */
static void report_vcf_variant_stats_sqlite3(sqlite3 *db, int num_variants, variant_stats_t **stats_batch) {
    array_list_t *rows = array_list_new(num_variants + 1, 1.1, COLLECTION_MODE_ASYNCHRONIZED);

    for (int v = 0; v < num_variants; v++) {
        variant_stats_t *st = stats_batch[v];

        variant_stats_db_fields_t *row =
            variant_stats_db_fields_new(st->chromosome, st->position, st->ref_allele, st->alt_alleles,
                                        st->maf_allele, st->maf, st->mgf_genotype, st->mgf,
                                        st->missing_alleles, st->missing_genotypes,
                                        st->mendelian_errors, st->is_indel,
                                        st->cases_percent_dominant, st->controls_percent_dominant,
                                        st->cases_percent_recessive, st->controls_percent_recessive);
        array_list_insert(row, rows);
    }

    insert_variant_stats_db_fields_list(rows, db);

    array_list_free(rows, (void *)variant_stats_db_fields_free);
}
예제 #14
0
/*
 * Required interface.
 *
 * Resolve `key_path` against the preferences root (thiz->obj) and return the
 * JSON object it names. The path "/" denotes the root itself. Returns NULL
 * when an argument is missing, the path cannot be parsed, or any component
 * of the path does not exist.
 */
json_object * tg_shared_preferences_find_leaf_obj(SharedPreferences* thiz,const CHAR* key_path)
{
    struct json_object *jso=NULL;
    struct array_list* key_list = NULL;
    INT32 key_list_len = 0;
    INT32 idx = 0;

    return_val_if_fail((thiz&&key_path),NULL);
    if (strcmp(key_path,"/")==0)
        return thiz->obj;
    key_list = array_list_new(tg_shared_preferences_key_free);
    return_val_if_fail((key_list),NULL);

    /* The key list is owned here: release it when parsing fails too. */
    if (!tg_shared_preferences_parse_keypath(key_path,key_list,&key_list_len))
    {
        array_list_free(key_list);
        return NULL;
    }

    /* Descend one key at a time; stop as soon as a component is missing. */
    for (jso=thiz->obj; idx<key_list_len &&jso; idx++)
    {
        jso = json_object_object_get(jso,(CHAR*)array_list_get_idx(key_list,idx));

    }
    array_list_free(key_list);
    return jso;

}
예제 #15
0
파일: filesystem.c 프로젝트: jucs/musicfs
/*
 * getattr handler: fill `statbuf` for `path`.
 *
 * All timestamps are set to the current time, ownership to the calling
 * user/group, and the remaining numeric fields to 0. The mode (and, for
 * result files, the size) depends on the number of path components:
 *   0 - the root directory;
 *   1 - the "search" directory or the name of an existing search;
 *   2 - either "search/<name>", which registers a new search, or
 *       "<search>/<file>", which is looked up in that search's results.
 *
 * Returns 0 when the path is recognised and -1 otherwise, including paths
 * with more than two components.
 */
static int filesystem_getattr(const char *path, struct stat *statbuf) {
    /* Fail by default so unhandled path depths return a defined value. */
    int retstat = -1;

    logging_log("Filesystem", LOGGING_LEVEL_INFO,
                "filesystem_getattr(path=\"%s\", statbuf=%p)", path, (void *)statbuf);

    struct timeval tv;
    gettimeofday(&tv, NULL);

    struct timespec ts;
    ts.tv_nsec = tv.tv_usec * 1000;
    ts.tv_sec = tv.tv_sec;

    statbuf->st_uid = getuid();
    statbuf->st_size = 0;
    statbuf->st_rdev = 0;
    statbuf->st_nlink = 0;
    statbuf->st_mtime = ts.tv_sec;
    statbuf->st_mtimensec = ts.tv_nsec;
    statbuf->st_ino = 0;
    statbuf->st_gid = getgid();
    statbuf->st_dev = 0;
    statbuf->st_ctime = ts.tv_sec;
    statbuf->st_ctimensec = ts.tv_nsec;
    statbuf->st_blocks = 0;
    statbuf->st_blksize = 0;
    statbuf->st_atime = ts.tv_sec;
    statbuf->st_atimensec = ts.tv_nsec;

    path_t *path_parsed = path_parse(path);

    logging_log("Filesystem", LOGGING_LEVEL_INFO, "path_parsed->parts_length: %lu...",
                path_parsed->parts_length);

    switch(path_parsed->parts_length) {
    case 0: {
        /* root directory */
        logging_log("Filesystem", LOGGING_LEVEL_INFO, "filesystem_getattr() - /...");
        statbuf->st_mode = configuration->filesystem_directory_mode;
        retstat = 0;
        break;
    }
    case 1: {
        if(!strcmp(path_parsed->parts[0], "search")) {
            retstat = 0;
            statbuf->st_mode = configuration->filesystem_directory_mode;
            break;
        }

        /* Otherwise the single component must name an existing search. */
        pthread_mutex_lock(filesystem_io_mutex);
        ArrayList *search_dirs = searcher_get_searches();
        pthread_mutex_unlock(filesystem_io_mutex);
        if(search_dirs != NULL) {
            retstat = -1;
            for(size_t i = 0; i < array_list_get_length(search_dirs); i++) {
                char *name;
                array_list_get(search_dirs, (const void **)&name, i);

                if(!strcmp(path_parsed->parts[0], name)) {
                    retstat = 0;
                    statbuf->st_mode = configuration->filesystem_directory_mode;
                    break;
                }
            }
            array_list_free(search_dirs);
        } else {
            retstat = -1;
        }
        break;
    }
    case 2: {
        /* NOTE(review): the length test is on parts[0], which equals "search"
         * here and so always passes; parts[1] may have been intended - confirm. */
        if(!strcmp(path_parsed->parts[0], "search")
                && strlen(path_parsed->parts[0]) > 3) {
            ALDictionary *value = searcher_file_name_url_dictionary_get(path_parsed->parts[1]);
            pthread_mutex_lock(filesystem_io_mutex);
            searcher_add_search(path_parsed->parts[1], value);
            pthread_mutex_unlock(filesystem_io_mutex);
            retstat = 0;
            statbuf->st_mode = configuration->filesystem_file_mode;
            break;
        }

        /* "<search>/<file>": look the file up in the search results. */
        pthread_mutex_lock(filesystem_io_mutex);
        ALDictionary *results = searcher_get_search_results(path_parsed->parts[0]);
        if(results != NULL) {
            logging_log("Filesystem", LOGGING_LEVEL_INFO, "results != NULL...");

            char result;
            char *url = (char*)al_dictionary_get(results, &result, path_parsed->parts[1]);

            /* a non-zero status from the lookup is treated as "no such file" */
            if(result) {
                logging_log("Filesystem", LOGGING_LEVEL_INFO, "Invalid file %s...", path);
                retstat = -1;
            } else {
                logging_log("Filesystem", LOGGING_LEVEL_INFO, "Url is %s...", url);

                retstat = 0;
                statbuf->st_mode = configuration->filesystem_file_mode;
                statbuf->st_size = downloader_file_size_try_get(url);
            }
        } else {
            retstat = -1;
        }
        pthread_mutex_unlock(filesystem_io_mutex);
        break;
    }
    default: {
        /* deeper paths are not part of this filesystem; retstat stays -1 */
        break;
    }
    }
    path_free(path_parsed);

    logging_log("Filesystem", LOGGING_LEVEL_INFO, "filesystem_getattr() finished...");

    return retstat;
}
예제 #16
0
/*
 * Shallow release of a VCF record: frees the sample list (items released
 * with free()) and the record structure, but none of the record's other
 * fields. `record` must not be NULL.
 */
void vcf_record_free(vcf_record_t *record) {
    assert(record);

    /* samples first: the list pointer lives inside the record */
    array_list_free(record->samples, free);

    free(record);
}
예제 #17
0
/*
 * Seeding stage for the reads listed in batch->mapping_batch->targets.
 *
 * For every target read, the flag stored on its mapping list selects how the
 * list is rebuilt:
 *   - flag 0 or 1: the anchors currently in the list are moved into a scratch
 *     list and the mapping list is refilled by bwt_map_exact_seeds_seq(). The
 *     read stays a target (and its flag is set to 0) only when at least one
 *     mapping was produced.
 *   - any other flag (handled as "pair of anchors"): the anchor pairs are
 *     either pruned to the ones whose read/genome distance is below
 *     min_intron_size (flag set to 1), or converted into region_t seeds
 *     (flag set to 2). The read always stays a target.
 *
 * targets[] is compacted in place, mapping_batch->num_targets is updated and
 * mapping_batch->num_to_do is reset to 0.
 *
 * Always returns CAL_STAGE.
 */
int apply_seeding(region_seeker_input_t* input, batch_t *batch) {
  //printf("APPLY SEEDING...\n");


  //if (time_on) { start_timer(start); }


  mapping_batch_t *mapping_batch = batch->mapping_batch;

  size_t num_mappings;


  int seed_size = input->cal_optarg_p->seed_size;
  size_t min_seed_size = input->cal_optarg_p->min_seed_size;






  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;
  fastq_read_t *read;

  // Threshold on |genome distance - read distance| used in the double-anchor case
  int min_intron_size = 40;

  int target;
  bwt_anchor_t *bwt_anchor = NULL;
  region_t *region;
  int gap_nt;
  int start_search;
  int end_search;

  // set to zero
  mapping_batch->num_to_do = 0;
  
  //TODO: omp parallel for !!
  /*if (batch->mapping_mode == 1000) {
    for (size_t i = 0; i < num_targets; i++) {
      //printf("Seq (i=%i)(target=%i): %s\n", i, targets[i], read->sequence);
      read = array_list_get(targets[i], mapping_batch->fq_batch);
      num_mappings = bwt_map_exact_seeds_seq(padding_left,
					     padding_right,
					     read->sequence,
					     seed_size,
					     min_seed_size,
					     input->bwt_optarg_p, 
					     input->bwt_index_p, 
					     mapping_batch->mapping_lists[targets[i]],
					     mapping_batch->extra_stage_id[targets[i]]);
      
      //printf("Num mappings %i\n", num_mappings);
      if (num_mappings > 0) {
	array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]);
	targets[new_num_targets++] = targets[i];
	mapping_batch->num_to_do += num_mappings;
      }
    }
    } else {*/
  
  //size_t new_num_targets = 0;
  //size_t *new_targets = (size_t *)malloc(array_list_size(fq_batch)*sizeof(size_t));
  // Scratch list that receives the anchors removed from the mapping lists
  array_list_t *array_list_aux = array_list_new(256, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
  
  //Flag 0: The read has simple anchor or any, and need seeds and normal Cal_Seeker 
  //Flag 1: The read has double anchor and the gap is smaller than MIN_INTRON_SIZE. Cal_Seeker will be make one CAL
  //Flag 2: The read has double anchor but the gap is bigger than MIN_INTRON_SIZE. 
  for (size_t i = 0; i < num_targets; i++) {
    read = array_list_get(targets[i], mapping_batch->fq_batch);    
    //printf("Read Region %s: \n", read->id);
    /* if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 ||
	array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      array_list_clear(mapping_batch->mapping_lists[targets[i]], bwt_anchor_free);
      continue;
      }
    */
    // NOTE(review): this condition accepts flag 1 as well as flag 0, so the
    // "else if (... == 1)" branch further down can never be taken - confirm intent.
    if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 0 || 
	array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      //Flag 0 Case, Not anchors found, Make normal seeds      
      //      printf("***** Normal Case 0. Not anchors found!\n");
      // Empty the mapping list, parking its anchors in the scratch list
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
	bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	array_list_insert(bwt_anchor, array_list_aux);
      }
      num_mappings = 0;

      num_mappings = bwt_map_exact_seeds_seq(0,
					     0,
					     read->sequence,
					     seed_size,
					     min_seed_size,
					     input->bwt_optarg_p, 
					     input->bwt_index_p, 
					     mapping_batch->mapping_lists[targets[i]],
					     0);
      // Reads without any mapping are dropped from the target list
      if (num_mappings > 0) {
	array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
	targets[new_num_targets++] = targets[i];
	//mapping_batch->num_to_do += num_mappings;
      }
    } else if (array_list_get_flag(mapping_batch->mapping_lists[targets[i]]) == 1) {
      //Flag 1 Case, One anchor found, Make displacements seeds                  
      printf("***** Case 1. One anchor found!\n");
      for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j--) {
	bwt_anchor = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	array_list_insert(bwt_anchor, array_list_aux);
      }

      // bwt_anchor is whatever the loop above removed last (unchanged if the list was empty)
      int anchor_nt = bwt_anchor->end - bwt_anchor->start;
      int seed_id = 0;
      int seed_start, seed_end;
      int extra_seed;

      if ((bwt_anchor->type == FORWARD_ANCHOR && bwt_anchor->strand == 0) || 
	  (bwt_anchor->type == BACKWARD_ANCHOR && bwt_anchor->strand == 1 )) {
	start_search = anchor_nt + 1;
	end_search = read->length - 1;
	extra_seed = EXTRA_SEED_END;
      } else {
	start_search = 0;
	end_search = read->length - anchor_nt - 2;
	extra_seed = EXTRA_SEED_START;
      }

      printf("end_start %i - start_search %i = %i >= seed_size %i\n", end_search, start_search, end_search - start_search, seed_size);
      if (end_search - start_search >= seed_size) {
	printf("00 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
	/*
		num_mappings = bwt_map_exact_seeds_between_coords(start_search,
							  end_search,
							  read->sequence, 
							  seed_size, min_seed_size,
							  input->bwt_optarg_p, 
							  input->bwt_index_p, 
							  mapping_batch->mapping_lists[targets[i]],
							  extra_seed, &seed_id);
	*/
      }

      if (bwt_anchor->type == FORWARD_ANCHOR) {
	seed_id = 0;
	seed_start = 0;
	seed_end = anchor_nt;
      } else {
	seed_id += 1;
	seed_start = read->length - anchor_nt - 1;
	seed_end = read->length - 1;
      }

      // Turn every parked anchor into a region seed, then release the anchors
      for (int j = 0; j < array_list_size(array_list_aux); j++) {
	bwt_anchor_t *bwt_anchor = array_list_get(j, array_list_aux);
	//	printf("\tCreate seed Anchor [%i:%lu|%i-%i|%lu]\n", bwt_anchor->chromosome + 1, bwt_anchor->start, 
	//	       seed_start,seed_end,bwt_anchor->end);
	region = region_bwt_new(bwt_anchor->chromosome + 1,
				bwt_anchor->strand,
				bwt_anchor->start,
				bwt_anchor->end,
				seed_start,
				seed_end,
				read->length,
				seed_id);	  
	array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);
      } 
      array_list_clear(array_list_aux, (void *)bwt_anchor_free); 
      array_list_set_flag(0, mapping_batch->mapping_lists[targets[i]]);
      targets[new_num_targets++] = targets[i];
    } else {
      //Flag 2 Case, Pair of anchors found
      printf("***** Case 2. Double anchor found!\n");
      bwt_anchor_t *bwt_anchor;
      bwt_anchor_t *bwt_anchor_forw, *bwt_anchor_back;

      int read_nt, genome_nt;


      int distance;
      int found = 0;
      region_t *region;
      int seed_id = 0;
      //if (array_list_size(mapping_batch->mapping_lists[targets[i]]) > 2) {
      // Indices (of the forward anchor) of the pairs that fail the distance test
      int *anchors_targets = (int *)calloc(array_list_size(mapping_batch->mapping_lists[targets[i]]), sizeof(int));
      int num = 0;

      //min_intron_size = 0;

      //Search if one anchor is at the same distance from the reference and the read
      // The list is read as consecutive (forward, backward) pairs
      for (int b = 0; b < array_list_size(mapping_batch->mapping_lists[targets[i]]); b += 2) {
	bwt_anchor_forw = array_list_get(b, mapping_batch->mapping_lists[targets[i]]);
	bwt_anchor_back = array_list_get(b + 1, mapping_batch->mapping_lists[targets[i]]);
	//printf("FORW=%i:%lu-%lu BACK=%i:%lu-%lu\n", bwt_anchor_forw->chromosome, bwt_anchor_forw->start, bwt_anchor_forw->end,
	//     bwt_anchor_back->chromosome, bwt_anchor_back->start, bwt_anchor_back->end);
	read_nt = read->length - ((bwt_anchor_forw->end - bwt_anchor_forw->start) + (bwt_anchor_back->end - bwt_anchor_back->start));
	genome_nt = bwt_anchor_back->start - bwt_anchor_forw->end;	  
	distance = abs(genome_nt - read_nt);
	//printf("\t%i:Distance %i\n", b, distance);
	if (distance < min_intron_size) {
	  found = 1;
	} else {
	  anchors_targets[num++] = b;
	}
      }

      if (found) {
	//printf("\tFound Exact Case... Delete other anchors\n");
	// Remove from the back so the recorded indices stay valid
	for (int t = num - 1; t >= 0; t--) {
	  target = anchors_targets[t];
	  //printf("\tDelete %i, %i-->\n", target, target + 1);
	  bwt_anchor = array_list_remove_at(target + 1, mapping_batch->mapping_lists[targets[i]]);
	  bwt_anchor_free(bwt_anchor);
	  
	  bwt_anchor = array_list_remove_at(target, mapping_batch->mapping_lists[targets[i]]);
	  bwt_anchor_free(bwt_anchor);
	}
	array_list_set_flag(1, mapping_batch->mapping_lists[targets[i]]);
      } else {
	//Seeding between anchors
	//printf("\tFound gap between anchors \n");
	array_list_t *anchors_forward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]),
						       1.25f, COLLECTION_MODE_ASYNCHRONIZED);
	array_list_t *anchors_backward = array_list_new(array_list_size(mapping_batch->mapping_lists[targets[i]]),
							1.25f, COLLECTION_MODE_ASYNCHRONIZED);
	int big_gap = 0;
	int final_anchor_nt = 0;
	int anchor_nt;
	int anchor_type;
	int anchor_strand;
	// Split the pairs into two parallel lists and track the largest read gap
	for (int j = array_list_size(mapping_batch->mapping_lists[targets[i]]) - 1; j >= 0; j -= 2) {
	  bwt_anchor_back = array_list_remove_at(j, mapping_batch->mapping_lists[targets[i]]);
	  array_list_insert(bwt_anchor_back, anchors_backward);

	  bwt_anchor_forw = array_list_remove_at(j - 1, mapping_batch->mapping_lists[targets[i]]);
	  array_list_insert(bwt_anchor_forw, anchors_forward);

	  if (bwt_anchor_forw->strand == 0) {
	    anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
	    gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));
	  } else {
	    anchor_nt = bwt_anchor_back->end - bwt_anchor_back->start;
	    gap_nt = read->length - (anchor_nt + (bwt_anchor_forw->end - bwt_anchor_forw->start));
	  }
	  if (gap_nt < 0) { gap_nt = 0; }
	  //printf("Gap nt (%i - %i): %i\n", anchor_nt, bwt_anchor_back->end - bwt_anchor_back->start, gap_nt);
	  if (gap_nt > big_gap) {
	    big_gap = gap_nt;
	    final_anchor_nt = anchor_nt;
	    anchor_type = bwt_anchor_back->type;
	    anchor_strand = bwt_anchor_back->strand;
	  }
	}

	printf("%i, %i\n", big_gap - 2, seed_size);
	if (big_gap - 2 > seed_size) {
	  //if (anchor_type == FORWARD_ANCHOR && anchor_strand == 0 || 
	  //  anchor_type == BACKWARD_ANCHOR && anchor_strand == 1 ) {
	    start_search = final_anchor_nt + 1;
	    end_search = final_anchor_nt + big_gap - 1;
	    //} else {
	    // start_search = final_anchor_nt + big_gap - 1;
	    //end_search = final_anchor_nt + 1;
	    //}
	  
	    //printf("Seeding between anchors... gap=%i\n", big_gap);
	    printf("11 bwt_map_exact_seeds_between_coords --> searching from %i to %i\n", start_search, end_search);
	    /*
	    num_mappings = bwt_map_exact_seeds_between_coords(start_search,
							      end_search,
							      read->sequence, seed_size, min_seed_size,
							      input->bwt_optarg_p, 
							      input->bwt_index_p, 
							      mapping_batch->mapping_lists[targets[i]],
							      EXTRA_SEED_NONE,
							      &seed_id);
	    */
	}

	//printf("Making seeds anchors...\n");
	for (int a = 0; a < array_list_size(anchors_forward); a++) {
	  //Insert the last anchor. (Create new seed)
	  bwt_anchor_forw = array_list_get(a, anchors_forward);
	  bwt_anchor_back = array_list_get(a, anchors_backward);

	  anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
	  gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));

	  //printf("\t --> Big Seed: %i, gap_nt: %i, anchor_nt = %i\n", a, gap_nt, anchor_nt);
	  // Adjust the anchor ends when the two anchors overlap or touch in the read
	  if (gap_nt < 0) {
	    //gap_nt = 0;
	    bwt_anchor_forw->end   += gap_nt;
	    bwt_anchor_back->start -= gap_nt;
	    anchor_nt += gap_nt;
	    gap_nt = 0;
	  } else if (gap_nt == 0) {
	    bwt_anchor_forw->end -= 1;
	    bwt_anchor_back->start += 1;
	    anchor_nt -= 1;
	    gap_nt = 1;	    
	  }


	  region = region_bwt_new(bwt_anchor_forw->chromosome + 1,
				  bwt_anchor_forw->strand,
				  bwt_anchor_forw->start,
				  bwt_anchor_forw->end,
				  0,
				  anchor_nt,
				  read->length,
				  0);
	  //printf("Region: %i-%i\n", region->seq_start, region->seq_end);
	  array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

	  region = region_bwt_new(bwt_anchor_back->chromosome + 1,
				  bwt_anchor_back->strand,
				  bwt_anchor_back->start,
				  bwt_anchor_back->end,
				  anchor_nt + gap_nt,
				  read->length - 1,
				  read->length,
				  seed_id + 1);
	  //printf("Region: %i-%i\n", region->seq_start, region->seq_end);
	  array_list_insert(region, mapping_batch->mapping_lists[targets[i]]);

	  //printf("\tMaking seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]]));

	  bwt_anchor_free(bwt_anchor_back);
	  bwt_anchor_free(bwt_anchor_forw);
	}
	// The anchors were released one by one above, so no item callback here
	array_list_free(anchors_forward, NULL);
	array_list_free(anchors_backward, NULL);
	//printf("Making seeds anchors end, %i seeds\n", array_list_size(mapping_batch->mapping_lists[targets[i]]));

	array_list_set_flag(2, mapping_batch->mapping_lists[targets[i]]);
      }
      free(anchors_targets);
      targets[new_num_targets++] = targets[i];
    }
  }

  mapping_batch->num_targets = new_num_targets;

  // NOTE(review): the scratch list is freed with a NULL item callback, while the
  // flag 0/1 branch parks anchors in it without releasing them; presumably those
  // anchors are leaked - confirm against array_list_free's contract.
  array_list_free(array_list_aux, NULL);

  //if (time_on) { stop_timer(start, end, time); timing_add(time, REGION_SEEKER, timing); }

  //printf("APPLY SEEDING DONE!\n");
  
  return CAL_STAGE;

}
예제 #18
0
/**
 * Release a VCF header entry and everything it owns.
 *
 * The entry must not be NULL (checked with assert). The elements of the
 * 'values' list are released with free(), then the name string and the
 * entry itself are released.
 *
 * @param header_entry entry to destroy
 */
void vcf_header_entry_free(vcf_header_entry_t *header_entry) {
    assert(header_entry);

    // each stored value is handed to free() by the list destructor
    array_list_free(header_entry->values, free);
    free(header_entry->name);

    free(header_entry);
}
예제 #19
0
/**
 * Smith-Waterman stage for the bisulfite pipeline.
 *
 * Runs fill_gaps_bs / merge_seed_regions_bs / fill_end_gaps_bs over both
 * mapping lists of the batch, then, for every target read, converts each CAL
 * whose normalized cigar score is above input->min_score into an alignment.
 * The CAL list of each processed read is freed and replaced, in the same slot
 * of the mapping list, by the list of alignments that were created.
 *
 * @param input  stage parameters (genomes, SW options, minimum score)
 * @param batch  batch whose mapping_batch is processed in place
 * @return BS_POST_PAIR_STAGE, the identifier of the next stage
 */
int apply_sw_bs_4nt(sw_server_input_t* input, batch_t *batch) {

  mapping_batch_t *mapping_batch = batch->mapping_batch;
  genome_t *genome1 = input->genome1_p;
  genome_t *genome2 = input->genome2_p;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  // NOTE(review): this looks like a debugging leftover - it reads a
  // hard-coded position from both genomes and prints it on every call.
  // Kept unchanged so the stage output stays the same; confirm whether
  // it can be removed.
  {
    char r[1024];
    size_t start = 169312417;
    size_t end = start + 99;
    genome_read_sequence_by_chr_index(r, 0,
                                      0, &start, &end, genome2);
    printf("+++++++++++++ genome2 = %s \n", r);
    genome_read_sequence_by_chr_index(r, 0,
                                      0, &start, &end, genome1);
    printf("+++++++++++++ genome1 = %s \n", r);

  }

  // fill gaps between seeds (note the genome argument order differs
  // between the two passes and between gap filling and end-gap filling)
  fill_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 5, 1);
  merge_seed_regions_bs(mapping_batch, 1);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 400, 1);
  
  fill_gaps_bs(mapping_batch, sw_optarg, genome1, genome2, 20, 5, 0);
  merge_seed_regions_bs(mapping_batch, 0);
  fill_end_gaps_bs(mapping_batch, sw_optarg, genome2, genome1, 20, 400, 0);

  // now we can create the alignments
  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;
  
  char *match_seq, *match_qual;
  size_t read_index, read_len, match_len, match_start;
  
  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals;
  
  seed_region_t *s;
  cigar_code_t *cigar_code;
  cigar_op_t *first_op;

  float norm_score, min_score = input->min_score;

  alignment_t *alignment;
  array_list_t *alignment_list;

  char *p, *optional_fields;
  int optional_fields_length, AS, NH;

  array_list_t **mapping_lists;
  size_t num_targets;
  size_t *targets;

  // bs_id 0 works on the first set of mapping lists, bs_id 1 on the second
  for (int bs_id = 0; bs_id < 2; bs_id++) {

    if (bs_id == 0) {
      mapping_lists = mapping_batch->mapping_lists;
      num_targets = mapping_batch->num_targets;
      targets = mapping_batch->targets;
    } else {
      mapping_lists = mapping_batch->mapping_lists2;
      num_targets = mapping_batch->num_targets2;
      targets = mapping_batch->targets2;
    }

    for (size_t i = 0; i < num_targets; i++) {
      read_index = targets[i];
      read = (fastq_read_t *) array_list_get(read_index, fq_batch);
      
      cal_list = mapping_lists[read_index];
      num_cals = array_list_size(cal_list);
      
      if (num_cals <= 0) continue;
    
      read_len = read->length;
    
      alignment_list = array_list_new(num_cals, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

      // processing each CAL from this read
      for(size_t j = 0; j < num_cals; j++) {

        // get cal and read index
        cal = array_list_get(j, cal_list);
        if (cal->sr_list->size == 0) continue;
        
        // the cigar of the CAL is carried by its first seed region
        s = (seed_region_t *) linked_list_get_first(cal->sr_list);
        cigar_code = (cigar_code_t *) s->info;
        
        norm_score = cigar_code_get_score(read_len, cigar_code);
        LOG_DEBUG_F("score = %0.2f\n", norm_score);

        // filter by SW score
        if (norm_score > min_score) {

          // update cigar and sequence and quality strings
          cigar_code_update(cigar_code);
          LOG_DEBUG_F("\tcigar code = %s\n", new_cigar_code_string(cigar_code));
          match_len = cigar_code_nt_length(cigar_code); 
          first_op = cigar_code_get_first_op(cigar_code);
          // a leading 'H' operation shifts the start of the matched part
          match_start = (first_op && first_op->name == 'H' ? first_op->number : 0);
          
          match_seq = (char *) malloc((match_len + 1)* sizeof(char));
          memcpy(match_seq, &read->sequence[match_start], match_len);
          match_seq[match_len] = 0;
          
          match_qual = (char *) malloc((match_len + 1)* sizeof(char));
          memcpy(match_qual, &read->quality[match_start], match_len);
          match_qual[match_len] = 0;
          
          // set optional fields: three "<tag>i" entries, each followed by
          // the raw bytes of an int
          optional_fields_length = 100;
          optional_fields = (char *) calloc(optional_fields_length, sizeof(char));
          
          p = optional_fields;

          // scale first, truncate afterwards: the cast binds tighter than
          // the multiplication, so "(int) norm_score * 100" would only ever
          // yield 0 or 100
          AS = (int) (norm_score * 100);
        
          memcpy(p, "ASi", 3);
          p += 3;
          memcpy(p, &AS, sizeof(int));
          p += sizeof(int);
          
          // num_cals is a size_t; copy a real int instead of its first
          // sizeof(int) bytes so the value is right on any byte order
          NH = (int) num_cals;

          memcpy(p, "NHi", 3);
          p += 3;
          memcpy(p, &NH, sizeof(int));
          p += sizeof(int);
          
          memcpy(p, "NMi", 3);
          p += 3;
          memcpy(p, &cigar_code->distance, sizeof(int));
          p += sizeof(int);
          
          assert(read->length == cigar_code_nt_length(cigar_code));
          
          // create an alignment and insert it into the list
          alignment = alignment_new();

          size_t header_len = strlen(read->id);
          char *head_id = (char *) malloc(header_len + 1);
          
          get_to_first_blank(read->id, header_len, head_id);
        
          alignment_init_single_end(head_id, match_seq, match_qual, 
                                    cal->strand, cal->chromosome_id - 1, cal->start - 1,
                                    new_cigar_code_string(cigar_code), 
                                    cigar_code_get_num_ops(cigar_code), 
                                    norm_score * 254, 1, (num_cals > 1),
                                    optional_fields_length, optional_fields, alignment);
          
          array_list_insert(alignment, alignment_list);

          LOG_DEBUG_F("creating alignment (bs_id = %i)...\n", bs_id);

        }
      }
      
      // free the cal list, and update the mapping list with the alignment list
      array_list_free(cal_list, (void *) cal_free);
      mapping_lists[read_index] = alignment_list;
    }
  }

  // go to the next stage
  return BS_POST_PAIR_STAGE;
}
예제 #20
0
/**
 * Fill the gaps before, between and after the seed regions of every CAL of
 * every target read in the batch.
 *
 * Pre-process (per read, per CAL, per seed region):
 *   - every seed gets a single-op 'M' cigar covering its read span;
 *   - a gap at the start or at the end of the read that is longer than
 *     min_gap gets a single 'H' cigar;
 *   - two consecutive seeds with no read gap between them are merged into
 *     the previous seed with a 'D' operation for the genome gap;
 *   - a gap whose read and genome lengths match and that has fewer than
 *     min_distance mismatches gets an 'M' cigar with that distance;
 *   - any other gap is queued (with flanks) for Smith-Waterman.
 * Each gap is inserted in the CAL seed list as a new seed region.
 *
 * Then all queued pairs are aligned with smith_waterman_mqmr and the
 * resulting cigar is stored in the 'info' field of the gap seed region.
 *
 * @param mapping_batch batch to process; seed lists, cal->start, cal->end
 *                      and num_gaps are updated in place
 * @param sw_optarg     Smith-Waterman parameters
 * @param genome        genome used to read the reference sequences
 * @param min_gap       start/end gaps longer than this are not filled
 * @param min_distance  NOTE: the value passed in is ignored; it is
 *                      overwritten for every read with 20% of its length
 */
void fill_gaps(mapping_batch_t *mapping_batch, sw_optarg_t *sw_optarg, 
               genome_t *genome, int min_gap, int min_distance) {

  int sw_count = 0;

  fastq_read_t *read;
  array_list_t *fq_batch = mapping_batch->fq_batch;

  size_t read_index, read_len;

  cal_t *cal;
  array_list_t *cal_list = NULL;
  size_t num_cals, num_targets = mapping_batch->num_targets;

  // reverse-complement of the current read, built lazily for strand 1 CALs
  char *revcomp_seq = NULL;

  seed_region_t *s, *prev_s, *new_s;
  linked_list_iterator_t* itr;

  cigar_code_t *cigar_code;

  size_t start, end;
  size_t gap_read_start, gap_read_end, gap_read_len;
  size_t gap_genome_start, gap_genome_end, gap_genome_len;

  int left_flank, right_flank;
  sw_prepare_t *sw_prepare;
  array_list_t *sw_prepare_list = array_list_new(1000, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);

  char *query,  *ref;
  int distance, first = 0, last = 0;

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_targets; i++) {

    read_index = mapping_batch->targets[i];
    read = (fastq_read_t *) array_list_get(read_index, fq_batch);
    
    cal_list = mapping_batch->mapping_lists[read_index];
    num_cals = array_list_size(cal_list);
    
    if (num_cals <= 0) continue;

    read_len = read->length;

    // the caller-supplied threshold is replaced by 20% of the read length
    min_distance = read_len*0.2;

    LOG_DEBUG_F(">>>>> read %s\n", read->id);

    // processing each CAL from this read
    for(size_t j = 0; j < num_cals; j++) {

      // get cal and read index
      cal = array_list_get(j, cal_list);
      LOG_DEBUG_F("CAL #%i of %i (strand %i), sr_list size = %i, sr_duplicate_list size = %i\n", 
                  j, num_cals, cal->strand, cal->sr_list->size, cal->sr_duplicate_list->size);

      prev_s = NULL;
      itr = linked_list_iterator_new(cal->sr_list);
      s = (seed_region_t *) linked_list_iterator_curr(itr);
      while (s != NULL) {
        {
          // for debugging: read the reference covered by the seed and log it
          size_t seed_start = s->genome_start;
          size_t seed_end = s->genome_end;
          size_t seed_len = seed_end - seed_start + 1;
          char *seed_ref = (char *) malloc((seed_len + 1) * sizeof(char));
          genome_read_sequence_by_chr_index(seed_ref, 0, cal->chromosome_id - 1, 
                                            &seed_start, &seed_end, genome);
          seed_ref[seed_len] = '\0';
          LOG_DEBUG_F("\tseed: [%i|%i - %i|%i] %s (len = %i)\n", 
                      s->genome_start, s->read_start, s->read_end, s->genome_end, seed_ref, seed_len);
          free(seed_ref);
        }

        // set the cigar for the current region
        gap_read_len = s->read_end - s->read_start + 1;
        cigar_code = cigar_code_new();
        cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
        s->info = (void *) cigar_code;

        cigar_code = NULL;
        sw_prepare = NULL;

        // there is a gap to handle before this seed unless it is the first
        // seed and it starts at read position 0
        if ((prev_s == NULL && s->read_start != 0) || (prev_s != NULL)) {
          distance = 0;
          mapping_batch->num_gaps++;
          if (prev_s == NULL) {
            // gap at the first position
            gap_read_start = 0;
            gap_read_end = s->read_start - 1;

            gap_genome_start = s->genome_start - s->read_start;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            cal->start = gap_genome_start;

            assert(gap_read_len != 0);
            assert(gap_genome_len != 0);

            if (gap_read_len > min_gap) {
              // the gap is too big, may be there's another CAL to cover it
              cigar_code = cigar_code_new();
              cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
            } else {
              left_flank = 0;
              right_flank = DOUBLE_FLANK;
            }
          } else {
            assert(prev_s->read_end < s->read_start);

            // gap in a middle position
            gap_read_start = prev_s->read_end + 1;
            gap_read_end = s->read_start - 1;

            gap_genome_start = prev_s->genome_end + 1;
            gap_genome_end = s->genome_start - 1;

            gap_read_len = gap_read_end - gap_read_start + 1;
            gap_genome_len = gap_genome_end - gap_genome_start + 1;

            LOG_DEBUG_F("gap (read, genome) = (%i, %i)\n", gap_read_len, gap_genome_len);

            if (gap_genome_len == 0) { printf("#@#: %s\n", read->id); }
            assert(gap_genome_len != 0);

            if (gap_read_len == 0) {
              // there's a deletion just between two consecutives seeds:
              // extend the previous seed over the deletion and this seed
              cigar_code = (cigar_code_t *)prev_s->info;

              cigar_code_append_op(cigar_op_new(gap_genome_len, 'D'), cigar_code);
              cigar_code->distance += gap_genome_len;

              cigar_code_append_op(cigar_op_new(s->read_end - s->read_start + 1, 'M'), cigar_code);
              cigar_code->distance += ((cigar_code_t *)s->info)->distance;

              prev_s->read_end = s->read_end;
              prev_s->genome_end = s->genome_end;

              LOG_DEBUG_F("prev cigar = %s\n", new_cigar_code_string((cigar_code_t *)prev_s->info));

              // continue loop... (prev_s is intentionally left unchanged)
              linked_list_iterator_remove(itr);
              s = linked_list_iterator_curr(itr);
              continue;
            }
              
            left_flank = SINGLE_FLANK;
            right_flank = SINGLE_FLANK;
          }

          if (!cigar_code) {
            // we have to try to fill this gap and get a cigar
            if (gap_read_len == gap_genome_len) {
              //    1) first, count the mismatches of an ungapped comparison
              start = gap_genome_start;
              end = gap_genome_end;
              first = -1;
              last = -1;
              ref = (char *) malloc((gap_genome_len + 5) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
                                                &start, &end, genome);
              // handle strand -
              if (cal->strand) {
                if (revcomp_seq == NULL) {
                  revcomp_seq = strdup(read->sequence);
                  seq_reverse_complementary(revcomp_seq, read_len);
                }
                query = &revcomp_seq[gap_read_start];
              } else {
                query = &read->sequence[gap_read_start];
              }
              
              for (int k = 0; k < gap_read_len; k++) {
                if (query[k] != ref[k]) {
                  distance++;
                  if (first == -1) first = k;
                  last = k;
                }
              }

              // the reference copy was only needed for the comparison;
              // release it here, otherwise it leaks (or is overwritten by
              // the SW allocation below)
              free(ref);
              ref = NULL;

              if (distance < min_distance) {
                cigar_code = cigar_code_new();
                cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
                cigar_code_inc_distance(distance, cigar_code);
              }
            }
            if (!cigar_code) {
              //    2) second, prepare SW to run

              // get query sequence, revcomp if necessary
              size_t read_start = gap_read_start - left_flank;
              size_t read_end = gap_read_end + right_flank;
              int gap_read_len_ex = read_end - read_start + 1;
              query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
              // handle strand -
              if (cal->strand) {
                if (revcomp_seq == NULL) {
                  revcomp_seq = strdup(read->sequence);
                  seq_reverse_complementary(revcomp_seq, read_len);
                }
                memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
              } else {
                memcpy(query, &read->sequence[read_start], gap_read_len_ex);
              }
              query[gap_read_len_ex] = '\0';
              
              // get ref. sequence
              size_t genome_start = gap_genome_start - left_flank;
              size_t genome_end = gap_genome_end + right_flank;
              int gap_genome_len_ex = genome_end - genome_start + 1;
              ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
              genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
                                                &genome_start, &genome_end, genome);
              ref[gap_genome_len_ex] = '\0';

              // query and ref are handed over to the sw_prepare object
              if (prev_s == NULL) {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, FIRST_SW);
              } else {
                sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, MIDDLE_SW);
              }

              array_list_insert(sw_prepare, sw_prepare_list);
              
              // increase counter
              sw_count++;

              LOG_DEBUG_F("query: %s\n", query);
              LOG_DEBUG_F("ref  : %s\n", ref);
              LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", 
                          distance, min_distance, gap_read_len, first, last);
              LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
                          gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
                          gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, 
                          read->id);

            }
          }
          
          // insert gap in the list; its cigar stays NULL until the SW
          // post-process when the gap was queued for Smith-Waterman
          new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
          new_s->info = (void *) cigar_code;
          linked_list_iterator_insert(new_s, itr);

          if (sw_prepare) {
            sw_prepare->seed_region = new_s;
            sw_prepare->cal = cal;
            sw_prepare->read = read;
          }
        }

        // continue loop...
        prev_s = s;
        linked_list_iterator_next(itr);
        s = linked_list_iterator_curr(itr);
      }

      // check for a gap at the last position
      sw_prepare = NULL;
      if (prev_s != NULL && prev_s->read_end < read_len - 1) { 
        cigar_code = NULL;
        mapping_batch->num_gaps++;

        // gap at the last position
        gap_read_start = prev_s->read_end + 1;
        gap_read_end = read_len - 1;
        gap_read_len = gap_read_end - gap_read_start + 1;

        assert(gap_read_len != 0);

        // the genome span of a trailing gap is taken to be as long as the
        // read span
        gap_genome_len = gap_read_len;
        gap_genome_start = prev_s->genome_end + 1;
        gap_genome_end = gap_genome_start + gap_genome_len - 1;

        cal->end = gap_genome_end;

        assert(gap_genome_len != 0);

        if (gap_read_len > min_gap) {
          // the gap is too big, may be there's another CAL to cover it
          cigar_code = cigar_code_new();
          cigar_code_append_op(cigar_op_new(gap_read_len, 'H'), cigar_code);
        } else {
          // we have to try to fill this gap and get a cigar
          
          //    1) first, count the mismatches of an ungapped comparison
          start = gap_genome_start;
          end = gap_genome_end;
          first = -1;
          last = -1;
          ref = (char *) malloc((gap_genome_len + 1) * sizeof(char));
          genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
                                            &start, &end, genome);
          // handle strand -
          if (cal->strand) {
            if (revcomp_seq == NULL) {
              revcomp_seq = strdup(read->sequence);
              seq_reverse_complementary(revcomp_seq, read_len);
            }
            query = &revcomp_seq[gap_read_start];
          } else {
            query = &read->sequence[gap_read_start];
          }
          
          distance = 0;
          for (int k = 0; k < gap_read_len; k++) {
            if (query[k] != ref[k]) {
              distance++;
              if (first == -1) first = k;
              last = k;
            }
          }

          // comparison buffer no longer needed (was leaked before)
          free(ref);
          ref = NULL;

          if (distance < min_distance) {
            cigar_code = cigar_code_new();
            cigar_code_append_op(cigar_op_new(gap_read_len, 'M'), cigar_code);
            cigar_code_inc_distance(distance, cigar_code);
          } else {
            //    2) second, prepare SW to run

            left_flank = DOUBLE_FLANK;
            right_flank = 0;
            
            // get query sequence, revcomp if necessary
            size_t read_start = gap_read_start - left_flank;
            size_t read_end = gap_read_end + right_flank;
            int gap_read_len_ex = read_end - read_start + 1;
            query = (char *) malloc((gap_read_len_ex + 1) * sizeof(char));
            // handle strand -
            if (cal->strand) {
              if (revcomp_seq == NULL) {
                revcomp_seq = strdup(read->sequence);
                seq_reverse_complementary(revcomp_seq, read_len);
              }
              memcpy(query, &revcomp_seq[read_start], gap_read_len_ex);
            } else {
              memcpy(query, &read->sequence[read_start], gap_read_len_ex);
            }
            query[gap_read_len_ex] = '\0';
            
            // get ref. sequence
            size_t genome_start = gap_genome_start - left_flank;
            size_t genome_end = gap_genome_end + right_flank;
            int gap_genome_len_ex = genome_end - genome_start + 1;
            ref = (char *) malloc((gap_genome_len_ex + 1) * sizeof(char));
            genome_read_sequence_by_chr_index(ref, 0, cal->chromosome_id - 1, 
                                              &genome_start, &genome_end, genome);
            // terminate the reference (the query was being terminated a
            // second time here, leaving ref without a terminator)
            ref[gap_genome_len_ex] = '\0';

            sw_prepare = sw_prepare_new(query, ref, left_flank, right_flank, LAST_SW);
            array_list_insert(sw_prepare, sw_prepare_list);
            
            // increase counter
            sw_count++;

            LOG_DEBUG_F("query: %s\n", query);
            LOG_DEBUG_F("ref  : %s\n", ref);
            LOG_DEBUG_F("dist.: %i (min. %i) of %i (first = %i, last = %i)\n", 
                        distance, min_distance, gap_read_len, first, last);
            LOG_DEBUG_F("\tto SW (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
                        gap_read_start, gap_read_end, gap_genome_start, gap_genome_end,
                        gap_read_end - gap_read_start + 1, gap_genome_end - gap_genome_start + 1, 
                        read->id);
          }
        }
        
        // insert gap in the list
        new_s = seed_region_new(gap_read_start, gap_read_end, gap_genome_start, gap_genome_end, 0, 0, 0);
        new_s->info = (void *) cigar_code;
        linked_list_insert_last(new_s, cal->sr_list);

        if (sw_prepare) {
          sw_prepare->seed_region = new_s;
          sw_prepare->cal = cal;
          sw_prepare->read = read;
        }
      }
      linked_list_iterator_free(itr);
    }

    // free memory: the reverse-complement belongs to the current read only
    if (revcomp_seq) {
      free(revcomp_seq);
      revcomp_seq = NULL;
    }
  }

  LOG_DEBUG_F("\nR U N   S W (sw_count = %i, sw_prepare_list size = %i)\n", sw_count, array_list_size(sw_prepare_list));
  assert(sw_count == array_list_size(sw_prepare_list));

  // NOTE(review): these are variable-length arrays sized by sw_count; a
  // batch with no queued gaps makes them zero-length - confirm that case
  // is acceptable for the target compilers.
  char *q[sw_count], *r[sw_count];
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    q[i] = sw_prepare->query;
    r[i] = sw_prepare->ref;
  }
  sw_multi_output_t *output = sw_multi_output_new(sw_count);

  // run Smith-Waterman
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);
  
  LOG_DEBUG("P O S T   -   P R O C E S S\n");
  cigar_op_t* cigar_op;
  for (int i = 0; i < sw_count; i++) {
    sw_prepare = array_list_get(i, sw_prepare_list);
    s = sw_prepare->seed_region;

    int read_gap_len = s->read_end - s->read_start + 1;
    int genome_gap_len = s->genome_end - s->genome_start + 1;

    LOG_DEBUG_F("\tgap (read %lu-%lu, genome %lu-%lu) = (%i, %i): read %s\n", 
                s->read_start, s->read_end, s->genome_start, s->genome_end,
                read_gap_len, genome_gap_len, sw_prepare->read->id);
    LOG_DEBUG_F("\tflanks (left, right) = (%i, %i)\n", sw_prepare->left_flank, sw_prepare->right_flank);
    LOG_DEBUG_F("\tquery : %s\n", sw_prepare->query);
    LOG_DEBUG_F("\tref   : %s\n", sw_prepare->ref);
    LOG_DEBUG_F("\tmquery: %s (start %i)\n", output->query_map_p[i], output->query_start_p[i]);
    LOG_DEBUG_F("\tmref  : %s (start %i)\n", output->ref_map_p[i], output->ref_start_p[i]);

    cigar_code_t *cigar_c = generate_cigar_code(output->query_map_p[i], output->ref_map_p[i],
                                                strlen(output->query_map_p[i]), output->query_start_p[i],
                                                output->ref_start_p[i], read_gap_len, genome_gap_len,
                                                &distance, sw_prepare->ref_type);
    LOG_DEBUG_F("\tscore : %0.2f, cigar: %s (distance = %i)\n", 
                output->score_p[i], new_cigar_code_string(cigar_c), distance);

    // rewrite a leading 'H' (as 'I' when the mapped reference starts at 0,
    // otherwise as 'M') and a leading '=' as 'M'
    cigar_op = cigar_code_get_op(0, cigar_c);
    if (cigar_op) {
      if (cigar_op->name == 'H') {
        if (output->ref_start_p[i] == 0) { 
          cigar_op->name = 'I';
        } else {
          cigar_op->name = 'M';
        }
      } else if (cigar_op->name == '=') cigar_op->name = 'M';
    }

    // a trailing 'H' becomes an insertion
    cigar_op = cigar_code_get_last_op(cigar_c);
    if (cigar_op && cigar_op->name == 'H') cigar_op->name = 'I';

    LOG_DEBUG_F("gap_read_len = %i, cigar_code_length (%s) = %i\n", 
                read_gap_len, new_cigar_code_string(cigar_c), cigar_code_nt_length(cigar_c));
    assert(read_gap_len == cigar_code_nt_length(cigar_c));

    // and now set the cigar for this gap
    s->info = (void *) cigar_c;

    // free
    sw_prepare_free(sw_prepare);
  }

  display_sr_lists("END of fill_gaps", mapping_batch);
    
  // free memory (the elements were already released one by one above)
  sw_multi_output_free(output);
  array_list_free(sw_prepare_list, (void *) NULL);
}
예제 #21
0
/**
 * Replace the BWT anchors stored in 'list' by CALs built from them.
 *
 * The anchors are first split by strand (strand == 1 versus anything else)
 * and by type (FORWARD_ANCHOR versus the rest). Then, in this order:
 *   1. if any anchor leaves fewer than 16 read positions uncovered, every
 *      such anchor becomes one CAL and the list is flagged SINGLE_ANCHORS;
 *   2. otherwise forward and backward anchors of the same strand group are
 *      paired when they lie on the same chromosome, are both at least
 *      MIN_ANCHOR long and pass the distance/order tests below; each pair
 *      adds two CALs and the list is flagged DOUBLE_ANCHORS. Pairing stops
 *      as soon as the list holds more than 5 CALs;
 *   3. if no pair was found but some anchor was longer than
 *      MIN_SINGLE_ANCHOR, every anchor sharing the strand group and type of
 *      the longest one becomes a CAL and the list is flagged SINGLE_ANCHORS.
 *
 * The anchors themselves are released before returning.
 *
 * @param list        in: bwt_anchor_t items; out: the CALs created
 * @param read_length length of the read the anchors belong to
 * @return number of items left in 'list'
 */
size_t bwt_search_pair_anchors(array_list_t *list, unsigned int read_length) {
  const int MIN_ANCHOR = 25;
  const int MIN_SINGLE_ANCHOR = 40;
  const int MAX_BWT_REGIONS = 50;
  const int MAX_BWT_ANCHOR_DISTANCE = 500000;
  const int BASE_FLANK = 5;

  // per strand group: slot 1 holds anchors with strand == 1, slot 0 the rest
  array_list_t *forward_lists[2];
  array_list_t *backward_lists[2];
  for (int g = 0; g < 2; g++) {
    backward_lists[g] = array_list_new(MAX_BWT_REGIONS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
    forward_lists[g] = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);
  }

  // anchors that cover (almost) the whole read
  array_list_t *near_full_list = array_list_new(MAX_BWT_REGIONS, 1.25f , COLLECTION_MODE_ASYNCHRONIZED);

  // longest anchor above MIN_SINGLE_ANCHOR seen so far
  int best_length = 0, best_strand = 0, best_type = 0;
  int have_single = 0, have_pair = 0;

  // --- classify every incoming anchor ---
  for (int i = 0; i < array_list_size(list); i++) {
    bwt_anchor_t *anchor = array_list_get(i, list);
    int group = (anchor->strand == 1) ? 1 : 0;

    if (anchor->type == FORWARD_ANCHOR) {
      array_list_insert(anchor, forward_lists[group]);
    } else {
      array_list_insert(anchor, backward_lists[group]);
    }

    int length = anchor->end - anchor->start + 1;
    if (length > MIN_SINGLE_ANCHOR && length > best_length) {
      best_length = length;
      best_strand = anchor->strand;
      best_type = anchor->type;
      have_single = 1;
    }

    if (read_length - length < 16) {
      array_list_insert(anchor, near_full_list);
    }
  }

  // the anchors now live in the per-strand lists; empty the output list
  array_list_clear(list, NULL);

  // --- case 1: anchors covering nearly the whole read ---
  if (array_list_size(near_full_list) > 0) {
    for (int i = array_list_size(near_full_list) - 1; i >= 0; i--) {
      bwt_anchor_t *anchor = array_list_remove_at(i, near_full_list);
      size_t span = anchor->end - anchor->start;
      cal_t *new_cal;

      // forward anchors are placed at the read start, the others at its end
      if (anchor->type == FORWARD_ANCHOR) {
        new_cal = convert_bwt_anchor_to_CAL(anchor, 0, span);
      } else {
        new_cal = convert_bwt_anchor_to_CAL(anchor, read_length - span - 1, read_length - 1);
      }

      array_list_insert(new_cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);

    goto exit;
  }

  // --- case 2: pair forward and backward anchors, group 0 first ---
  for (int group = 0; group < 2; group++) {
    array_list_t *forwards = forward_lists[group];
    array_list_t *backwards = backward_lists[group];

    // marks for anchors already used in a pair
    int *forward_used  = (int *)calloc(array_list_size(forwards),  sizeof(int));
    int *backward_used = (int *)calloc(array_list_size(backwards), sizeof(int));

    for (int i = 0; i < array_list_size(forwards); i++) {
      if (forward_used[i]) { continue; }
      bwt_anchor_t *forw = array_list_get(i, forwards);

      for (int j = 0; j < array_list_size(backwards); j++) {
        if (backward_used[j]) { continue; }
        bwt_anchor_t *back = array_list_get(j, backwards);

        int forw_length = (forw->end - forw->start + 1);
        int back_length = (back->end - back->start + 1);

        // candidate filter: same chromosome, close enough, both long enough
        if (forw->chromosome != back->chromosome) { continue; }
        if (abs(back->start - forw->end) > MAX_BWT_ANCHOR_DISTANCE) { continue; }
        if (forw_length < MIN_ANCHOR || back_length < MIN_ANCHOR) { continue; }

        // the backward anchor must not start before the forward one ends
        if (back->start < forw->end) { continue; }

        int gap_read = read_length - (forw_length + back_length);
        int gap_genome = back->start - forw->end;

        if (gap_read < 2 || gap_genome < 2) {
          // anchors touch or overlap: trim both of them by a flank
          int over_read = (gap_read < 0) ? abs(gap_read) : 0;
          int over_genome = (gap_genome < 0) ? abs(gap_genome) : 0;
          int overlap;
          if (over_read == 0 && over_genome == 0) {
            overlap = 2;
          } else {
            overlap = (over_read > over_genome) ? over_read : over_genome;
          }

          int flank = BASE_FLANK;
          if (overlap >= BASE_FLANK * 2) {
            // solve read overlap
            flank = overlap / 2 + BASE_FLANK / 2;
          }

          // never trim more than half of an anchor shorter than the flank
          forw->end -= (flank >= forw_length) ? forw_length / 2 : flank;
          back->start += (flank >= back_length) ? back_length / 2 : flank;
        }

        cal_t *new_cal = convert_bwt_anchor_to_CAL(forw, 0, forw->end - forw->start);
        array_list_insert(new_cal, list);

        int back_seed = back->end - back->start + 1;
        new_cal = convert_bwt_anchor_to_CAL(back, read_length - back_seed, read_length - 1);
        array_list_insert(new_cal, list);

        // enough CALs collected: stop pairing altogether
        if (array_list_size(list) > 5) {
          free(backward_used);
          free(forward_used);
          goto exit;
        }

        array_list_set_flag(DOUBLE_ANCHORS, list);
        have_pair = 1;
        forward_used[i]  = 1;
        backward_used[j] = 1;
        break;
      }
    }

    free(backward_used);
    free(forward_used);
  }

  // --- case 3: no pair, fall back to the group of the longest anchor ---
  if (!have_pair && have_single) {
    int group = (best_strand == 1) ? 1 : 0;
    array_list_t *chosen = (best_type == FORWARD_ANCHOR) ? forward_lists[group]
                                                         : backward_lists[group];

    for (int i = 0; i < array_list_size(chosen); i++) {
      bwt_anchor_t *anchor = array_list_get(i, chosen);
      size_t span = anchor->end - anchor->start;
      cal_t *new_cal;

      if (anchor->type == FORWARD_ANCHOR) {
        new_cal = convert_bwt_anchor_to_CAL(anchor, 0, span);
      } else {
        new_cal = convert_bwt_anchor_to_CAL(anchor, read_length - span - 1, read_length - 1);
      }
      array_list_insert(new_cal, list);
    }
    array_list_set_flag(SINGLE_ANCHORS, list);
  }

 exit:
  // release the anchors together with the helper lists
  for (int g = 1; g >= 0; g--) {
    array_list_free(forward_lists[g], (void *)bwt_anchor_free);
    array_list_free(backward_lists[g], (void *)bwt_anchor_free);
  }
  array_list_free(near_full_list,  (void *)bwt_anchor_free);

  return array_list_size(list);

}
예제 #22
0
파일: main.c 프로젝트: CharoL/bioinfo-libs
int main (int argc, char *argv[]) {

	if(!strcmp("count-lines", argv[1])) {
		fastq_file_t *file = fastq_fopen(argv[2]);
		array_list_t *reads = array_list_new(2000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_fread_se(reads, 100000, file)) != 0) {
			count += nread;
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//			printf("Size: %i, Capacity: %i\n", reads->size, reads->capacity);
			array_list_clear(reads, fastq_read_free);
		}
		//		printf("Total num reads: %i\n", reads->size);
		//		fastq_read_print(array_list_get(0, reads));
		//		fastq_read_print(array_list_get(reads->size-1, reads));
		array_list_free(reads, fastq_read_free);
		fastq_fclose(file);
	}

	if(!strcmp("count-lines-gz", argv[1])) {
		fastq_gzfile_t *file = fastq_gzopen(argv[2]);
		//		printf("=>%i\n", file->ret);
		array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_gzread_se(reads, 100000, file)) != 0) {
			//			nread = fastq_gzread_se(reads, 1000000, file);
			count += nread;
			//			printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread);
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//			fastq_read_print((fastq_read_t*)array_list_get(reads->size-1, reads));
			array_list_clear(reads, fastq_read_free);
		}
		//		printf("Total num reads: %i\n", count);
		//		fastq_read_print(array_list_get(0, reads));
		array_list_free(reads, fastq_read_free);
		fastq_gzclose(file);
	}

	if(!strcmp("count-bytes-gz", argv[1])) {
		fastq_gzfile_t *file = fastq_gzopen(argv[2]);
		//			printf("=>%i\n", file->ret);
		array_list_t *reads = array_list_new(1000000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_gzread_bytes_se(reads, 10000000, file)) != 0) {
			//				nread = fastq_gzread_bytes_se(reads, 100000, file);
			count += reads->size;
			//				printf("Size: %i, Capacity: %i, count = %i, nread: %i\n", reads->size, reads->capacity, count, nread);
			for(int i=0; i<reads->size; i++) {
				fastq_read_print(array_list_get(i, reads));
			}
			//				fastq_read_print(array_list_get(reads->size-1, reads));
			array_list_clear(reads, fastq_read_free);
		}
		//			printf("Total num reads: %i\n", count);
		//		fastq_read_print(array_list_get(0, reads));
		array_list_free(reads, fastq_read_free);
		fastq_gzclose(file);
	}

	if(!strcmp("filter", argv[1])) {
		fastq_file_t *file = fastq_fopen(argv[2]);
		fastq_filter_options_t *fastq_filter_options = fastq_filter_options_new(50,150, 30, 80, 2, 100);
		array_list_t *reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		array_list_t *passed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		array_list_t *failed_reads ;//= array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
		size_t nread = 1;
		int count = 0;
		while((nread = fastq_fread_se(reads, 1000000, file)) != 0) {
			count += reads->size;
			passed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
			failed_reads = array_list_new(200000, 1.8, COLLECTION_MODE_SYNCHRONIZED);
			//			for(int i=0; i<reads->size; i++) {
			//				fastq_read_print(array_list_get(i, reads));
			//			}
			fastq_filter(reads, passed_reads, failed_reads, fastq_filter_options);
			fastq_read_print(array_list_get(0, passed_reads));
			fastq_read_print(array_list_get(0, failed_reads));
			printf("Total Reads: %lu, Passed Reads: %lu, Reads failed: %lu\n", reads->size, passed_reads->size, failed_reads->size);
			array_list_clear(reads, fastq_read_free);
			array_list_free(passed_reads, NULL);
			array_list_free(failed_reads, NULL);
			//			fastq_read_print(array_list_get(0, passed_reads));
			//			fastq_read_print(array_list_get(0, failed_reads));
			//			printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size);
		}
		//		fastq_read_print(array_list_get(0, passed_reads));
		//		fastq_read_print(array_list_get(0, failed_reads));
//		printf("Total Reads: %lu, Passed Reads: %lu, Reads filter: %lu\n", reads->size, passed_reads->size, failed_reads->size);

		fastq_filter_options_free(fastq_filter_options);
		array_list_free(reads, NULL);
		//		array_list_free(passed_reads, fastq_read_free);
		//		array_list_free(failed_reads, fastq_read_free);
		fastq_fclose(file);
	}

	return 0;
}
예제 #23
0
/**
 * Run the configured filter chain over a VCF file and write the results.
 *
 * Two OpenMP sections cooperate:
 *   - the first parses the input file into batches (by bytes or by lines,
 *     depending on which option is positive) and signals the end of parsing;
 *   - the second fetches the batches, applies the filter chain (if any) and
 *     writes the passing records - and, when options_data->save_rejected is
 *     set, the failing ones - to the output files obtained from
 *     get_filtering_output_files().
 *
 * @param shared_options_data  input file, output directory, batching and filter chain options
 * @param options_data         filter-tool options (save_rejected is read here)
 * @return always 0; fatal conditions are reported through LOG_FATAL/LOG_FATAL_F
 */
int run_filter(shared_options_data_t *shared_options_data, filter_options_data_t *options_data) {
    int ret_code;
    double start, stop, total;
    
    vcf_file_t *file = vcf_open(shared_options_data->vcf_filename, shared_options_data->max_batches);
    if (!file) {
        LOG_FATAL("VCF file does not exist!\n");
    }
    
    ret_code = create_directory(shared_options_data->output_directory);
    if (ret_code != 0 && errno != EEXIST) {
        LOG_FATAL_F("Can't create output directory: %s\n", shared_options_data->output_directory);
    }
    // An already existing directory leaves a non-zero code behind; clear it so it
    // cannot be mistaken for a parsing error if neither batching option is set.
    ret_code = 0;
    
#pragma omp parallel sections private(start, stop, total)
    {
#pragma omp section
        {
            LOG_DEBUG_F("Thread %d reads the VCF file\n", omp_get_thread_num());
            // Reading
            start = omp_get_wtime();

            if (shared_options_data->batch_bytes > 0) {
                ret_code = vcf_parse_batches_in_bytes(shared_options_data->batch_bytes, file);
            } else if (shared_options_data->batch_lines > 0) {
                ret_code = vcf_parse_batches(shared_options_data->batch_lines, file);
            }

            stop = omp_get_wtime();
            total = stop - start;

            if (ret_code) { LOG_FATAL_F("[%dR] Error code = %d\n", omp_get_thread_num(), ret_code); }

            LOG_INFO_F("[%dR] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%dR] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            notify_end_parsing(file);
        }
        
#pragma omp section
        {
            filter_t **filters = NULL;
            int num_filters = 0;
            if (shared_options_data->chain != NULL) {
                filters = sort_filter_chain(shared_options_data->chain, &num_filters);
            }
    
            FILE *passed_file = NULL, *failed_file = NULL;
            get_filtering_output_files(shared_options_data, &passed_file, &failed_file);
            // The rejected-records stream is not needed unless requested. Guard
            // against a NULL stream and forget the handle once it is closed.
            if (!options_data->save_rejected && failed_file) {
                fclose(failed_file);
                failed_file = NULL;
            }
            LOG_DEBUG("File streams created\n");
            
            start = omp_get_wtime();

            int i = 0;
            vcf_batch_t *batch = NULL;
            while ((batch = fetch_vcf_batch(file)) != NULL) {
                if (i == 0) {
                    // Add headers associated to the defined filters
                    // NOTE(review): the returned array itself is never released here -
                    // confirm who owns it.
                    vcf_header_entry_t **filter_headers = get_filters_as_vcf_headers(filters, num_filters);
                    for (int j = 0; j < num_filters; j++) {
                        add_vcf_header_entry(filter_headers[j], file);
                    }
                    
                    // Write file format, header entries and delimiter
                    write_vcf_header(file, passed_file);
                    if (options_data->save_rejected) {
                        write_vcf_header(file, failed_file);
                    }

                    LOG_DEBUG("VCF header written created\n");
                }
                
                array_list_t *input_records = batch->records;
                // failed_records must start as NULL: without a filter chain it is
                // never assigned, yet it is tested and conditionally freed below.
                array_list_t *passed_records = NULL, *failed_records = NULL;

                if (i % 100 == 0) {
                    LOG_INFO_F("Batch %d reached by thread %d - %zu/%zu records \n", 
                                i, omp_get_thread_num(),
                                batch->records->size, batch->records->capacity);
                }

                if (filters == NULL) {
                    // No filtering: every input record passes.
                    passed_records = input_records;
                } else {
                    failed_records = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
                    passed_records = run_filter_chain(input_records, failed_records, filters, num_filters);
                }

                // Write records that passed and failed to 2 new separated files
                if (passed_records != NULL && passed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu passed records\n", i, passed_records->size);
                #pragma omp critical 
                    {
                        for (size_t r = 0; r < passed_records->size; r++) {
                            write_vcf_record(passed_records->items[r], passed_file);
                        }
                    }
                }
                
                if (options_data->save_rejected && failed_records != NULL && failed_records->size > 0) {
                    LOG_DEBUG_F("[batch %d] %zu failed records\n", i, failed_records->size);
                #pragma omp critical 
                    {
                        for (size_t r = 0; r < failed_records->size; r++) {
                            write_vcf_record(failed_records->items[r], failed_file);
                        }
                    }
                }
                
                // Decide ownership while input_records is still a live pointer: the
                // passed list is only ours to free when the filter chain created it.
                int free_passed = (passed_records != NULL && passed_records != input_records);

                // Free batch and its contents
                vcf_batch_free(batch);
                
                // Free items in both lists (not their internal data)
                if (free_passed) {
                    array_list_free(passed_records, NULL);
                }
                if (failed_records) {
                    array_list_free(failed_records, NULL);
                }
                
                i++;
            }

            stop = omp_get_wtime();

            total = stop - start;

            LOG_INFO_F("[%d] Time elapsed = %f s\n", omp_get_thread_num(), total);
            LOG_INFO_F("[%d] Time elapsed = %e ms\n", omp_get_thread_num(), total*1000);

            // Free resources
            if (passed_file) {
                fclose(passed_file);
            }
            if (options_data->save_rejected && failed_file) {
                fclose(failed_file);
            }

            free_filters(filters, num_filters);
        }
    }
    
    vcf_close(file);
    
    return 0;
}
예제 #24
0
int sa_bam_writer(void *data) {
  sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data;
  
  sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch;
  if (mapping_batch == NULL) {
    printf("bam_writer1: error, NULL mapping batch\n");
    return 0;
  }

  //  for (int i = 0; i < NUM_COUNTERS; i++) {
  //    counters[i] += mapping_batch->counters[i];
  //  }

  #ifdef _TIMING
  for (int i = 0; i < NUM_TIMING; i++) {
    func_times[i] += mapping_batch->func_times[i];
  }
  #endif

  int flag, len;
  char *sequence, *quality;

  fastq_read_t *read;
  array_list_t *read_list = mapping_batch->fq_reads;

  bam1_t *bam1;
  alignment_t *alig;
  array_list_t *mapping_list;
  bam_file_t *out_file = wf_batch->writer_input->bam_file;

  sa_genome3_t *genome = wf_batch->sa_index->genome;

  size_t num_reads, num_mappings, num_mate_mappings;
  num_reads = mapping_batch->num_reads;
  for (size_t i = 0; i < num_reads; i++) {
    read = (fastq_read_t *) array_list_get(i, read_list);
    mapping_list = mapping_batch->mapping_lists[i];
    num_mappings = array_list_size(mapping_list);
    num_total_mappings += num_mappings;

    #ifdef _VERBOSE
    if (num_mappings > 1) {
      num_dup_reads++;
      num_total_dup_reads += num_mappings;
    }
    #endif

    if (num_mappings > 0) {
      num_mapped_reads++;
      if (num_mappings > 1) {
	num_multihit_reads++;
      }
      for (size_t j = 0; j < num_mappings; j++) {
	alig = (alignment_t *) array_list_get(j, mapping_list);

	// update alignment
	if (num_mappings > 1) {
	  alig->map_quality = 0;
	} else {
	  alig->map_quality = alig->mapq;
	}

	bam1 = convert_to_bam(alig, 33);
	bam_fwrite(bam1, out_file);
	bam_destroy1(bam1);
	alignment_free(alig);
      }
    } else {
      num_unmapped_reads++;

      if (read->adapter) {
	// sequences and cigar
	len = read->length + abs(read->adapter_length);
	sequence = (char *) malloc(len + 1);
	quality = (char *) malloc(len + 1);

	if (read->adapter_length < 0) {
	  strcpy(quality, read->adapter_quality);
	  strcat(quality, read->quality);
	} else {
	  strcpy(quality, read->quality);
	  strcat(quality, read->adapter_quality);
	}
	
	if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	    (read->adapter_strand == 1 && read->adapter_length > 0)) {
	  strcpy(sequence, read->adapter);
	  strcat(sequence, read->sequence);
	} else {
	  strcpy(sequence, read->sequence);
	  strcat(sequence, read->adapter);
	}
	sequence[len] = 0; 
	quality[len] = 0; 
      } else {
	// sequences
	sequence = read->sequence;
	quality = read->quality;
      }
      
      alig = alignment_new();       
      alignment_init_single_end(strdup(read->id), sequence, quality,
				0, -1, -1, /*strdup(aux)*/"", 0, 0, 0, 0, 0, NULL, alig);
      
      bam1 = convert_to_bam(alig, 33);
      bam_fwrite(bam1, out_file);
        
      // free memory
      bam_destroy1(bam1);
      alig->sequence = NULL;
      alig->quality = NULL;
      alig->cigar = NULL;
      alignment_free(alig);
      if (read->adapter) {
	free(sequence);
	free(quality);
      }
    }
    array_list_free(mapping_list, (void *) NULL);
  }

  // free memory
  sa_mapping_batch_free(mapping_batch);

  if (wf_batch) sa_wf_batch_free(wf_batch);

  return 0;
}
예제 #25
0
//====================================================================================
// apply_caling
//====================================================================================
/**
 * Turn the seed regions / anchors of every target read into a list of CALs.
 *
 * For each target read:
 *   1. CALs are generated either with bwt_generate_cal_list_linked_list() (when
 *      the region list flag is 0 or 2) or by pairing consecutive anchors of the
 *      list into one two-seed CAL each (any other flag).
 *   2. CALs with an empty seed list, or whose seeds overlap in read coordinates,
 *      are discarded.
 *   3. At most MAX_CALS CALs are kept.
 *   4. If any CAL survives, it replaces the read's mapping list (flag set to 2)
 *      and the read stays a target; otherwise the mapping list is emptied
 *      (flag set to 0) and the read is dropped from the targets.
 *
 * @param input  CAL seeker parameters (genome and CAL options are read)
 * @param batch  batch whose mapping_batch is updated in place
 * @return the next pipeline stage: RNA_STAGE, PRE_PAIR_STAGE, SW_STAGE or
 *         DNA_POST_PAIR_STAGE
 */
int apply_caling(cal_seeker_input_t* input, batch_t *batch) {
  mapping_batch_t *mapping_batch = batch->mapping_batch;
  array_list_t *list = NULL;
  size_t read_index, num_cals;
  // Initialised so the debug log below never reads indeterminate values when
  // the anchor-pair branch runs on an empty list.
  int min_seeds = 0, max_seeds = 0;

  cal_t *cal;
  array_list_t *cal_list;

  fastq_read_t *read;

  size_t num_chromosomes = input->genome->num_chromosomes + 1;
  size_t num_targets = mapping_batch->num_targets;
  size_t *targets = mapping_batch->targets;
  size_t new_num_targets = 0;
  array_list_t *region_list;
  bwt_anchor_t *bwt_anchor_back, *bwt_anchor_forw;
  linked_list_t *linked_list;
  int anchor_nt, gap_nt;
  seed_region_t *seed_region_start, *seed_region_end;

  // set to zero
  mapping_batch->num_to_do = 0;

  for (size_t i = 0; i < num_targets; i++) {

    read_index = targets[i];
    read = array_list_get(read_index, mapping_batch->fq_batch); 
    region_list = mapping_batch->mapping_lists[read_index];

    if (!list) {
      list = array_list_new(1000, 
                            1.25f, 
                            COLLECTION_MODE_ASYNCHRONIZED);
    }

    if (array_list_get_flag(region_list) == 0 || 
        array_list_get_flag(region_list) == 2) {
      // We have normal and extended seeds (anchors)
      max_seeds = (read->length / 15)*2 + 10;
      num_cals = bwt_generate_cal_list_linked_list(region_list,
                                                   input->cal_optarg,
                                                   &min_seeds, &max_seeds,
                                                   num_chromosomes,
                                                   list, read->length,
                                                   input->cal_optarg->min_cal_size, 0);
    } else {
      // We have double anchors: the list holds (forward, backward) pairs and
      // each pair becomes one CAL with two seeds.
      num_cals = 0;
      // Stop at a >= 1 so that an odd-sized list never asks for index -1.
      for (int a = array_list_size(region_list) - 1; a >= 1; a -= 2) {
        max_seeds = 2;
        min_seeds = 2;
        // NOTE(review): both anchors are detached from the list and not
        // released afterwards in this function - confirm whether they leak.
        bwt_anchor_back = array_list_remove_at(a, region_list);
        bwt_anchor_forw = array_list_remove_at(a - 1, region_list);

        linked_list = linked_list_new(COLLECTION_MODE_ASYNCHRONIZED);

        // Seed for the first (forward) anchor: starts at read position 0
        anchor_nt = bwt_anchor_forw->end - bwt_anchor_forw->start;
        seed_region_start = seed_region_new(0, anchor_nt - 1,
                                            bwt_anchor_forw->start, bwt_anchor_forw->end, 0, 0, 0);

        // Seed for the second (backward) anchor: ends at the last read position
        gap_nt = read->length - (anchor_nt + (bwt_anchor_back->end - bwt_anchor_back->start));
        seed_region_end = seed_region_new(anchor_nt + gap_nt, read->length - 1,
                                          bwt_anchor_back->start + 1, bwt_anchor_back->end, 1, 0, 0);

        // When the two seeds are closer than 5 positions (in the genome or in
        // the read), pull both back by 5 to open a gap between them.
        if (seed_region_end->genome_start - seed_region_start->genome_end < 5 || 
            seed_region_end->read_start - seed_region_start->read_end < 5) {
          seed_region_start->genome_end -= 5;
          seed_region_start->read_end -= 5;
          seed_region_end->genome_start += 5;
          seed_region_end->read_start += 5;
        }

        linked_list_insert(seed_region_start, linked_list);
        linked_list_insert_last(seed_region_end, linked_list);

        cal = cal_new(bwt_anchor_forw->chromosome + 1,
                      bwt_anchor_forw->strand,
                      bwt_anchor_forw->start,
                      bwt_anchor_back->end + 1,
                      2,
                      linked_list,
                      linked_list_new(COLLECTION_MODE_ASYNCHRONIZED));
        array_list_insert(cal, list);
        num_cals++;
      }
    }

    // for debugging (num_cals is a size_t, hence %zu)
    LOG_DEBUG_F("read %s : num. cals = %zu, min. seeds = %i, max. seeds = %i\n", 
                read->id, num_cals, min_seeds, max_seeds);

    // filter incoherent CALs
    // A variable-length array must have a strictly positive size, so reserve
    // one slot even when there are no CALs.
    int founds[num_cals > 0 ? num_cals : 1];
    int found = 0;
    for (size_t j = 0; j < num_cals; j++) {
      founds[j] = 0;
      cal = array_list_get(j, list);
      LOG_DEBUG_F("\tcal %zu of %zu: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", 
                  j, num_cals, cal->sr_list->size, cal->num_seeds,
                  cal->chromosome_id, cal->start, cal->end);
      if (cal->sr_list->size > 0) {
        // 'start' is the first read position not covered by the previous seed.
        int start = 0;
        for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) {
          seed_region_t *s = list_item->item;
          
          LOG_DEBUG_F("\t\t:: star %i > %lu s->read_start\n", start, s->read_start);
          if (start > s->read_start) {
            // This seed begins before the previous one ended.
            LOG_DEBUG("\t\t\t:: remove\n");
            found++;
            founds[j] = 1;
          }
          start = s->read_end + 1;
        }
      } else {
        found++;
        founds[j] = 1;
      }
    }
    if (found) {
      // Move the coherent CALs to a new list; the slots left behind are set to
      // NULL so that freeing the old list only destroys the rejected CALs.
      min_seeds = 100000;
      max_seeds = 0;
      cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
        if (!founds[j]) {
          cal = array_list_get(j, list);
          cal->num_seeds = cal->sr_list->size;
          if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds;
          if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds;
          array_list_insert(cal, cal_list);
          array_list_set(j, NULL, list);
        }
      }
      array_list_free(list, (void *) cal_free);
      num_cals = array_list_size(cal_list);
      list = cal_list;
    }
  
    // Hand the working list over to cal_list; 'list' is recreated on the next
    // iteration.
    cal_list = list;
    list = NULL;

    // Keep at most MAX_CALS CALs, dropping from the end of the list.
    if (num_cals > MAX_CALS) {
      for (size_t j = num_cals - 1; j >= MAX_CALS; j--) {
        cal = (cal_t *) array_list_remove_at(j, cal_list);
        cal_free(cal);
      }
      num_cals = array_list_size(cal_list);
    }
    
    if (num_cals > 0 && num_cals <= MAX_CALS) {
      array_list_set_flag(2, cal_list);
      targets[new_num_targets++] = read_index;

      // we have to free the region list
      array_list_free(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      mapping_batch->mapping_lists[read_index] = cal_list;
    } else {
      array_list_set_flag(0, mapping_batch->mapping_lists[read_index]);
      // we have to free the region list
      array_list_clear(mapping_batch->mapping_lists[read_index], (void *) region_bwt_free);
      if (cal_list) array_list_free(cal_list, (void *) cal_free);
    }
  } // end for 0 ... num_targets

  // update batch
  mapping_batch->num_targets = new_num_targets;

  // free memory
  if (list) array_list_free(list, NULL);

  if (batch->mapping_mode == RNA_MODE) {
    return RNA_STAGE;
  }

  if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) {
    return PRE_PAIR_STAGE;
  } else if (batch->mapping_batch->num_targets > 0) {
    return SW_STAGE;
  }
  
  return DNA_POST_PAIR_STAGE;
}
예제 #26
0
/**
 * Generate the CAL list of every target read (RNA variant).
 *
 * For each target read:
 *   1. CALs are generated with bwt_generate_cals() using the configured seed size.
 *   2. If none is produced, the read is re-seeded with
 *      bwt_map_inexact_seeds_seq() using 24-long seeds and the CALs are built
 *      from those regions with bwt_generate_cal_list_linked_list().
 *   3. CALs are discarded when they have no seeds, when a seed overlaps the
 *      previous one or has read_start >= read_end, or when consecutive seeds go
 *      backwards in the genome or are more than 2 * read length apart.
 *   4. At most 100 CALs are kept.
 * Every read remains a target; mapping_batch->extra_stage_do is set to 1.
 *
 * @param input  CAL seeker parameters (BWT index/options, CAL options, genome)
 * @param batch  batch whose mapping_batch is updated in place
 * @return the next pipeline stage: RNA_STAGE, PRE_PAIR_STAGE, SW_STAGE or
 *         DNA_POST_PAIR_STAGE
 */
int apply_caling_rna(cal_seeker_input_t* input, batch_t *batch) {

  LOG_DEBUG("========= APPLY CALING RNA =========\n");

  bwt_optarg_t *bwt_optarg = input->bwt_optarg;
  bwt_index_t *bwt_index = input->index;
  cal_optarg_t *cal_optarg = input->cal_optarg;
  mapping_batch_t *mapping_batch = batch->mapping_batch;

  size_t num_cals;
  size_t num_targets = mapping_batch->num_targets;
  size_t target_pos = 0;
  fastq_read_t *read;
  genome_t *genome = input->genome;
  unsigned int num_chromosomes = genome->num_chromosomes;

  int min_seeds = 0, max_seeds = 0;
  int seed_size = input->cal_optarg->seed_size;
  array_list_t *cal_list, *list;
  cal_t *cal;

  // Upper bound on the CALs kept per read.
  const size_t max_cals = 100;

  mapping_batch->extra_stage_do = 1;

  // Scratch list for the regions of the second seeding attempt; it is emptied
  // after every read and released at the end.
  array_list_t *region_list = array_list_new(1000, 
                                             1.25f, 
                                             COLLECTION_MODE_ASYNCHRONIZED);
  
  for (size_t i = 0; i < num_targets; i++) {
    const size_t target = mapping_batch->targets[i];

    read = array_list_get(target, mapping_batch->fq_batch); 
    list = mapping_batch->mapping_lists[target];
    
    num_cals = bwt_generate_cals(read->sequence, 
                                 seed_size, 
                                 bwt_optarg,
                                 cal_optarg,
                                 bwt_index, 
                                 list, 
                                 num_chromosomes);

    // Second attempt: re-seed with 24-length seeds when nothing was found
    if (num_cals == 0) {
      // Distinct name: the configured seed_size above must stay untouched.
      const int retry_seed_size = 24;

      // First, delete old regions
      array_list_clear(region_list, (void *)region_bwt_free);
      
      // Second, create new regions with the retry seed size
      bwt_map_inexact_seeds_seq(read->sequence, retry_seed_size, retry_seed_size/2,
                                bwt_optarg, bwt_index, 
                                region_list);
      
      max_seeds = (read->length / 15)*2 + 10;

      num_cals = bwt_generate_cal_list_linked_list(region_list,
                                                   input->cal_optarg,
                                                   &min_seeds, &max_seeds,
                                                   genome->num_chromosomes + 1,
                                                   list, read->length,
                                                   cal_optarg->min_cal_size,
                                                   0);
    } 

    array_list_clear(region_list, (void *)region_bwt_free);

    // filter incoherent CALs
    // A variable-length array must have a strictly positive size, so reserve
    // one slot even when there are no CALs.
    int founds[num_cals > 0 ? num_cals : 1];
    int found = 0;
    for (size_t j = 0; j < num_cals; j++) {
      founds[j] = 0;
      cal = array_list_get(j, list);
      LOG_DEBUG_F("\tcal %zu of %zu: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n", 
                  j, num_cals, cal->sr_list->size, cal->num_seeds,
                  cal->chromosome_id, cal->start, cal->end);
      if (cal->sr_list->size > 0) {
        // First read / genome position not covered by the previous seed.
        int start = 0;
        size_t genome_start = 0;
        int first = 1;
        for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) {
          seed_region_t *s = list_item->item;
          
          LOG_DEBUG_F("\t\t:: star %i > %lu s->read_start\n", start, s->read_start);
          LOG_DEBUG_F("\t\t:: read_star %lu > read_end %lu \n", s->read_start, s->read_end);
          if (start > s->read_start || s->read_start >= s->read_end) {
            // Overlaps the previous seed, or is empty / inverted in the read.
            LOG_DEBUG("\t\t\t:: remove\n");
            found++;
            founds[j] = 1;
          }

          if (!first && 
              ((s->genome_start < genome_start) || 
              (s->genome_start - genome_start) > 2*read->length)) {
            // Goes backwards in the genome, or sits too far from the previous seed.
            found++;
            founds[j] = 1;
          }

          first = 0;
          start = s->read_end + 1;
          genome_start = s->genome_end + 1;
        }
      } else {
        found++;
        founds[j] = 1;
      }
    }

    if (found) {
      // Move the coherent CALs to a new list; the slots left behind are set to
      // NULL so that freeing the old list only destroys the rejected CALs.
      min_seeds = 100000;
      max_seeds = 0;
      cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
      for (size_t j = 0; j < num_cals; j++) {
        if (!founds[j]) {
          cal = array_list_get(j, list);
          cal->num_seeds = cal->sr_list->size;
          if (cal->num_seeds > max_seeds) max_seeds = cal->num_seeds;
          if (cal->num_seeds < min_seeds) min_seeds = cal->num_seeds;
          array_list_insert(cal, cal_list);
          array_list_set(j, NULL, list);
        }
      }
      array_list_free(list, (void *) cal_free);
      list = cal_list;
    }

    mapping_batch->mapping_lists[target] = list;
    num_cals = array_list_size(list);

    // Drop the CALs beyond the limit, starting from the end of the list.
    for (size_t j = num_cals; j > max_cals; j--) {
      cal_free(array_list_remove_at(j - 1, list));
    }

    // Every read is kept as a target.
    mapping_batch->targets[target_pos++] = target;
  }

  mapping_batch->num_targets = target_pos;

  array_list_free(region_list, NULL);

  LOG_DEBUG("========= APPLY CALING RNA END =========\n");

  if (batch->mapping_mode == RNA_MODE) {
    return RNA_STAGE;
  }

  if (batch->pair_input->pair_mng->pair_mode != SINGLE_END_MODE) {
    return PRE_PAIR_STAGE;
  } else if (batch->mapping_batch->num_targets > 0) {
    return SW_STAGE;
  }
  
  return DNA_POST_PAIR_STAGE;

}
예제 #27
0
파일: filesystem.c 프로젝트: jucs/musicfs
/**
 * FUSE readdir handler: reports the entries of the directory at `path`
 * through the `fill_dir` callback.
 *
 * For the root ("/") the entries are the names returned by
 * searcher_get_searches(); for any other path the entries are the keys of
 * the result dictionary of the search named by the first path component.
 *
 * @param path       directory being listed
 * @param buffer     opaque buffer forwarded to fill_dir
 * @param fill_dir   callback invoked once per entry; a non-zero return
 *                   aborts the listing
 * @param offset     only logged
 * @param file_info  only logged
 * @return 0 on success, -ENOMEM when fill_dir refuses an entry, -1 when the
 *         path has no components or the search has no results.
 */
static int filesystem_readdir(const char *path, void *buffer, fuse_fill_dir_t fill_dir,
                              off_t offset, struct fuse_file_info *file_info) {
    int retstat = 0;

    /* Pointers go through %p and off_t is widened explicitly, so every
     * variadic argument matches its conversion specifier (assumes
     * logging_log forwards to a printf-style formatter). */
    logging_log("Filesystem", LOGGING_LEVEL_INFO,
                "filesystem_readdir(path=\"%s\", buf=%p, filler=%p, offset=%ld, fi=%p)...",
                path, buffer, (void *)fill_dir, (long)offset, (void *)file_info);

    if(!strcmp(path, "/")) {
        /* Root directory: one entry per registered search. */
        pthread_mutex_lock(filesystem_io_mutex);
        ArrayList *search_dirs = searcher_get_searches();
        pthread_mutex_unlock(filesystem_io_mutex);

        /* NOTE(review): the list is walked after the mutex is released;
         * presumably searcher_get_searches() hands back a private copy
         * (it is freed below) -- confirm. */
        if(search_dirs != NULL) {
            for(size_t i = 0; i < array_list_get_length(search_dirs); i++) {
                char *name;
                array_list_get(search_dirs, (const void **)&name, i);

                if(fill_dir(buffer, name, NULL, 0) != 0) {
                    array_list_free(search_dirs);
                    return -ENOMEM;
                }
            }
            array_list_free(search_dirs);
        }
        return retstat;
    }

    /* Any other path: list the results of the search named by parts[0]. */
    path_t *path_parsed = path_parse(path);

    logging_log("Filesystem", LOGGING_LEVEL_INFO, "path_parsed->parts_length: %lu...",
                (unsigned long)path_parsed->parts_length);

    if(!path_parsed->parts_length) {
        path_free(path_parsed);
        return -1;
    }

    /* The results dictionary is only touched while the I/O mutex is held. */
    pthread_mutex_lock(filesystem_io_mutex);
    ALDictionary *results = searcher_get_search_results(path_parsed->parts[0]);
    if(results != NULL) {
        logging_log("Filesystem", LOGGING_LEVEL_INFO, "results != NULL...");
        ALDictionaryEnumerator *e = al_dictionary_get_enumerator(results);
        while(al_dictionary_enumerator_move_next(e)) {
            ALDictionaryKeyValuePair *pair;
            al_dictionary_enumerator_get_current(e, &pair);

            logging_log("Filesystem", LOGGING_LEVEL_INFO, "pair->key: %s...",
                        (char*)pair->key);
            if(fill_dir(buffer, (char*)pair->key, NULL, 0) != 0) {
                /* Release everything acquired so far before bailing out. */
                al_dictionary_enumerator_free(e);
                path_free(path_parsed);
                pthread_mutex_unlock(filesystem_io_mutex);
                return -ENOMEM;
            }
        }
        al_dictionary_enumerator_free(e);
    } else {
        retstat = -1;
    }
    pthread_mutex_unlock(filesystem_io_mutex);

    path_free(path_parsed);
    return retstat;
}
예제 #28
0
/**
 * Smith-Waterman stage: aligns every target read of the batch against the
 * genome region of each of its CALs and keeps the alignments whose score is
 * at least input->min_score.
 *
 * On entry batch->targets[0 .. num_targets) holds the indices of the reads
 * to process and batch->mapping_lists[read] holds each read's CAL list.
 * Every CAL list is released here with cal_free. On exit batch->targets and
 * batch->num_targets describe the reads that obtained at least one valid
 * alignment, batch->mapping_lists[read] holds their sw_output items and
 * batch->num_to_do counts those items.
 *
 * @param input  genome, flank length, minimum score and Smith-Waterman options
 * @param batch  aligner batch, updated in place
 */
void apply_sw(sw_server_input_t* input, aligner_batch_t *batch) {
  int tid = omp_get_thread_num();

  cal_t *cal = NULL;
  array_list_t *cal_list = NULL, *mapping_list = NULL;
  fastq_batch_t *fq_batch = batch->fq_batch;

  size_t start, end;
  genome_t *genome = input->genome_p;
  size_t flank_length = input->flank_length;

  float min_score = input->min_score;
  // NOTE(review): the normalized-score formula is disabled in this function,
  // so a defined placeholder is handed to sw_output_new instead of an
  // uninitialized value.
  float norm_score = 0.0f;
  sw_output_t *sw_output;
  sw_optarg_t *sw_optarg = &input->sw_optarg;

  size_t index, num_cals;
  size_t num_seqs = batch->num_targets;

  // set to zero
  batch->num_done = batch->num_to_do;
  batch->num_to_do = 0;

  // Size the work arrays from the CAL lists themselves so that the fill loop
  // below can neither overrun them nor leave slots uninitialized.
  size_t sw_total = 0;
  for (size_t i = 0; i < num_seqs; i++) {
    sw_total += array_list_size(batch->mapping_lists[batch->targets[i]]);
  }
  // A variable-length array must have at least one element.
  size_t capacity = (sw_total > 0 ? sw_total : 1);

  sw_multi_output_t *output = sw_multi_output_new(sw_total);
  char *q[capacity], *r[capacity];
  uint8_t strands[capacity], chromosomes[capacity];
  size_t starts[capacity];
  size_t sw_count = 0, read_indices[capacity];
  int read_len;

  // initialize query and reference sequences to Smith-Waterman
  for (size_t i = 0; i < num_seqs; i++) {
    index = batch->targets[i];

    cal_list = batch->mapping_lists[index];
    num_cals = array_list_size(cal_list);

    // processing each CAL from this read
    for (size_t j = 0; j < num_cals; j++) {
      // get cal and read index
      cal = array_list_get(j, cal_list);
      read_indices[sw_count] = index;

      // query sequence: private copy of the read, revcomp if necessary
      read_len = fq_batch->data_indices[index + 1] - fq_batch->data_indices[index];
      q[sw_count] = (char *) calloc((read_len + 1), sizeof(char));
      memcpy(q[sw_count], &(fq_batch->seq[fq_batch->data_indices[index]]), read_len);
      if (cal->strand == 1) {
        seq_reverse_complementary(q[sw_count], read_len);
      }

      // reference sequence: the CAL region extended by the flank on both sides
      start = cal->start - flank_length;
      end = cal->end + flank_length;
      r[sw_count] = calloc(1, end - start + 2);
      genome_read_sequence_by_chr_index(r[sw_count], cal->strand,
                                        cal->chromosome_id - 1, &start, &end, genome);

      // save some stuff, we'll use them after...
      strands[sw_count] = cal->strand;
      chromosomes[sw_count] = cal->chromosome_id;
      starts[sw_count] = start;

      // increase counter
      sw_count++;
    }

    // free cal_list; the slot is reused below for the alignment list
    array_list_free(cal_list, (void *)cal_free);
    batch->mapping_lists[index] = NULL;
  }

  // run Smith-Waterman on the pairs that were actually prepared
  smith_waterman_mqmr(q, r, sw_count, sw_optarg, 1, output);

  size_t num_targets = 0;
  // filter alignments by min_score
  for (size_t i = 0; i < sw_count; i++) {
    if (output->score_p[i] >= min_score) {
      // valid mapping: insert in the list of its own read for further
      // processing, creating that list on the first hit
      index = read_indices[i];
      mapping_list = batch->mapping_lists[index];
      if (mapping_list == NULL) {
        mapping_list = array_list_new(1000,
                                      1.25f,
                                      COLLECTION_MODE_ASYNCHRONIZED);
        array_list_set_flag(0, mapping_list);

        batch->mapping_lists[index] = mapping_list;
        batch->targets[num_targets++] = index;
      }

      sw_output = sw_output_new(strands[i],
                                chromosomes[i],
                                starts[i],
                                strlen(r[i]),
                                strlen(output->query_map_p[i]),
                                output->query_start_p[i],
                                output->ref_start_p[i],
                                output->score_p[i],
                                norm_score,
                                output->query_map_p[i],
                                output->ref_map_p[i]);
      array_list_insert(sw_output, mapping_list);

      batch->num_to_do++;
    }

    // free query and reference
    free(q[i]);
    free(r[i]);
  }
  batch->num_targets = num_targets;

  // update counter
  thr_sw_items[tid] += sw_count;

  // free
  sw_multi_output_free(output);
}
예제 #29
0
int sa_sam_writer(void *data) {
  sa_wf_batch_t *wf_batch = (sa_wf_batch_t *) data;
  
  sa_mapping_batch_t *mapping_batch = (sa_mapping_batch_t *) wf_batch->mapping_batch;
  if (mapping_batch == NULL) {
    printf("bam_writer1: error, NULL mapping batch\n");
    return 0;
  }
  /*
  for (int i = 0; i < NUM_COUNTERS; i++) {
    counters[i] += mapping_batch->counters[i];
  }
  */
  #ifdef _TIMING
  for (int i = 0; i < NUM_TIMING; i++) {
    func_times[i] += mapping_batch->func_times[i];
  }
  #endif

  int num_mismatches, num_cigar_ops;
  size_t flag, pnext = 0, tlen = 0;
  char *cigar_string, *cigar_M_string, *rnext = "*";

  fastq_read_t *read;
  array_list_t *read_list = mapping_batch->fq_reads;

  array_list_t *mapping_list, *mate_list;
  FILE *out_file = (FILE *) wf_batch->writer_input->bam_file;

  sa_genome3_t *genome = wf_batch->sa_index->genome;

  size_t num_reads, num_mappings, num_mate_mappings;
  num_reads = mapping_batch->num_reads;

  if (mapping_batch->options->pair_mode != SINGLE_END_MODE) {
    // PAIR MODE
    int len;
    char *sequence, *quality;

    char *seq, *opt_fields;
    alignment_t *alig;
  
    for (size_t i = 0; i < num_reads; i++) {
      read = (fastq_read_t *) array_list_get(i, read_list);
      //      seq = read->sequence;
      /*
      if (i % 2 == 0)  {
	mate_list = mapping_batch->mapping_lists[i+1];
	num_mate_mappings = array_list_size(mate_list);
      } else {
	mate_list = mapping_list;
	num_mate_mappings = num_mappings;
      }
      */
      mapping_list = mapping_batch->mapping_lists[i];
      num_mappings = array_list_size(mapping_list);
      num_total_mappings += num_mappings;

      #ifdef _VERBOSE
      if (num_mappings > 1) {
	num_dup_reads++;
	num_total_dup_reads += num_mappings;
      }
      #endif
      
      if (num_mappings > 0) {
	num_mapped_reads++;
	if (num_mappings > 1) {
	  num_multihit_reads++;
	}
	for (size_t j = 0; j < num_mappings; j++) {
	  alig = (alignment_t *) array_list_get(j, mapping_list);
	  /*
	  // update alignment
	  alig->secondary_alignment = 0;
	  if (num_mate_mappings != 1) {
	    alig->is_mate_mapped = 0;
	    alig->is_paired_end_mapped = 0;
	    alig->mate_strand = 0;
	  }
	  */
	  if (alig->optional_fields) {
	    opt_fields = (char *) calloc(strlen(alig->optional_fields) + 100, sizeof(char));
	    sprintf(opt_fields, "NH:i:%i\t%s", num_mappings, alig->optional_fields);
	    //	    sprintf(opt_fields, "NH:i:%i\t%s\tXU:i:%i", num_mappings, alig->optional_fields, mapping_batch->status[i]);
	  } else {
	    opt_fields = (char *) calloc(100, sizeof(char));
	    sprintf(opt_fields, "NH:i:%i", num_mappings);
	    //	    sprintf(opt_fields, "NH:i:%i\tXU:i:%i", num_mappings, mapping_batch->status[i]);
	  }
	  /*
	  // update alignment
	  alig->secondary_alignment = 0;
	  if (num_mate_mappings != 1) {
	    alig->is_mate_mapped = 0;
	    alig->is_paired_end_mapped = 0;
	    alig->mate_strand = 0;
	  }
	  */
	  flag = 0;
	  if (alig->is_paired_end)                              flag += BAM_FPAIRED;
	  if (alig->is_paired_end_mapped)                       flag += BAM_FPROPER_PAIR;
	  if (!alig->is_seq_mapped)                             flag += BAM_FUNMAP;   
	  if ((!alig->is_mate_mapped) && (alig->is_paired_end)) flag += BAM_FMUNMAP;
	  if (alig->mate_strand)                                flag += BAM_FMREVERSE;
	  if (alig->pair_num == 1)	                        flag += BAM_FREAD1;
	  if (alig->pair_num == 2)                              flag += BAM_FREAD2;
	  if (alig->secondary_alignment)                        flag += BAM_FSECONDARY;
	  if (alig->fails_quality_check)                        flag += BAM_FQCFAIL;
	  if (alig->pc_optical_duplicate)                       flag += BAM_FDUP;
	  if (alig->seq_strand)                                 flag += BAM_FREVERSE;

	  fprintf(out_file, "%s\t%lu\t%s\t%i\t%i\t%s\t%s\t%i\t%i\t%s\t%s\t%s\n", 
		  read->id,
		  flag,
		  genome->chrom_names[alig->chromosome],
		  alig->position + 1,
		  (num_mappings > 1 ? 0 : alig->mapq), //60, //(alig->map_quality > 3 ? 0 : alig->map_quality),
		  alig->cigar,
		  (alig->chromosome == alig->mate_chromosome ? "=" : genome->chrom_names[alig->mate_chromosome]),
		  alig->mate_position + 1,
		  alig->template_length,
		  alig->sequence,
		  alig->quality,
		  opt_fields
		  );

	  // free memory
	  free(opt_fields);
	  alignment_free(alig);	 
	} // end for num_mappings
      } else {
	num_unmapped_reads++;

	opt_fields = (char *) calloc(100, sizeof(char));
	sprintf(opt_fields, "XM:i:%i XU:i:%i", num_mappings, mapping_batch->status[i]);

	if (read->adapter) {
	  len = read->length + abs(read->adapter_length);
	  sequence = (char *) malloc(len + 1);
	  quality = (char *) malloc(len + 1);

	  if (read->adapter_length < 0) {
	    strcpy(quality, read->adapter_quality);
	    strcat(quality, read->quality);
	  } else {
	    strcpy(quality, read->quality);
	    strcat(quality, read->adapter_quality);
	  }
	  
	  if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	      (read->adapter_strand == 1 && read->adapter_length > 0)) {
	    strcpy(sequence, read->adapter);
	    strcat(sequence, read->sequence);
	  } else {
	    strcpy(sequence, read->sequence);
	    strcat(sequence, read->adapter);
	  }

	  sequence[len] = 0; 
	  quality[len] = 0; 
	} else {
	  sequence = read->sequence;
	  quality = read->quality;
	}

	fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\t%s\n", 
		read->id,
		sequence,
		quality,
		opt_fields
		);

	free(opt_fields);

	if (read->adapter) {
	  free(sequence);
	  free(quality);
	}
      }
      array_list_free(mapping_list, (void *) NULL);
    }
  } else {
    // SINGLE MODE
    int len, mapq;
    char *seq;
    seed_cal_t *cal;

    cigar_t *cigar;
    char *sequence, *revcomp, *quality;

    for (size_t i = 0; i < num_reads; i++) {
      read = (fastq_read_t *) array_list_get(i, read_list);
      mapping_list = mapping_batch->mapping_lists[i];
      num_mappings = array_list_size(mapping_list);
      num_total_mappings += num_mappings;

      #ifdef _VERBOSE
      if (num_mappings > 1) {
	num_dup_reads++;
	num_total_dup_reads += num_mappings;
      }
      #endif
      
      if (num_mappings > 0) {
	num_mapped_reads++;
	if (num_mappings > 1) {
	  num_multihit_reads++;
	}

	for (size_t j = 0; j < num_mappings; j++) {
	  cal = (seed_cal_t *) array_list_get(j, mapping_list);
	  
	  if (read->adapter) {
	    // sequences and cigar
	    len = read->length + abs(read->adapter_length);
	    sequence = (char *) malloc(len + 1);
	    revcomp = (char *) malloc(len + 1);
	    quality = (char *) malloc(len + 1);
	    cigar = cigar_new_empty();

	    if (read->adapter_length < 0) {
	      strcpy(quality, read->adapter_quality);
	      strcat(quality, read->quality);
	    } else {
	      strcpy(quality, read->quality);
	      strcat(quality, read->adapter_quality);
	    }
	    
	    if ( (cal->strand == 1 && 
		  ((read->adapter_strand == 0 && read->adapter_length > 0) || 
		   (read->adapter_strand == 1 && read->adapter_length < 0)))
		 ||
		 (cal->strand == 0 && 
		  ((read->adapter_strand == 0 && read->adapter_length < 0) ||
		   (read->adapter_strand == 1 && read->adapter_length > 0))) ) {
	      strcpy(sequence, read->adapter);
	      strcat(sequence, read->sequence);
	      strcpy(revcomp, read->adapter_revcomp);
	      strcat(revcomp, read->revcomp);
	      
	      cigar_append_op(abs(read->adapter_length), 'S', cigar);
	      cigar_concat(&cal->cigar, cigar);
	    } else {
	      strcpy(sequence, read->sequence);
	      strcat(sequence, read->adapter);
	      strcpy(revcomp, read->revcomp);
	      strcat(revcomp, read->adapter_revcomp);
	      
	      cigar_concat(&cal->cigar, cigar);
	      cigar_append_op(read->adapter_length, 'S', cigar);
	    }
	    sequence[len] = 0; 
	    revcomp[len] = 0; 
	    quality[len] = 0; 
	  } else {
	    // sequences and cigar
	    sequence = read->sequence;
	    revcomp = read->revcomp;
	    quality = read->quality;
	    cigar = &cal->cigar;
	  }

	  if (cal->strand) {
	    flag = 16;
	    seq = revcomp;
	  } else {
	    flag = 0;
	    seq = sequence;
	  }

	  /*
	  if (i == 0) {
	    flag += BAM_FSECONDARY;
	  }
	  */

	  cigar_string = cigar_to_string(cigar);
	  cigar_M_string = cigar_to_M_string(&num_mismatches, &num_cigar_ops, cigar);
	  if (num_mappings > 1) {
	    cal->mapq = 0;
	  }
	  fprintf(out_file, "%s\t%i\t%s\t%i\t%i\t%s\t%s\t%lu\t%i\t%s\t%s\tNH:i:%i\tNM:i:%i\n", 
		  read->id,
		  flag,
		  genome->chrom_names[cal->chromosome_id],
		  cal->start + 1,
		  (num_mappings == 1 ? cal->mapq : 0),
		  cigar_M_string,
		  rnext,
		  pnext,
		  tlen,
		  seq,
		  quality,
		  num_mappings,
		  num_mismatches
		  );

	  // free memory
	  free(cigar_M_string);
	  free(cigar_string);
	  seed_cal_free(cal);	 
	  if (read->adapter) {
	    free(sequence);
	    free(revcomp);
	    free(quality);
	    cigar_free(cigar);
	  }
	}
      } else {
	num_unmapped_reads++;

	if (read->adapter) {
	  // sequences and cigar
	  len = read->length + abs(read->adapter_length);
	  sequence = (char *) malloc(len + 1);
	  quality = (char *) malloc(len + 1);

	  if (read->adapter_length < 0) {
	    strcpy(quality, read->adapter_quality);
	    strcat(quality, read->quality);
	  } else {
	    strcpy(quality, read->quality);
	    strcat(quality, read->adapter_quality);
	  }
	  
	  if ((read->adapter_strand == 0 && read->adapter_length < 0) || 
	      (read->adapter_strand == 1 && read->adapter_length > 0)) {
	    strcpy(sequence, read->adapter);
	    strcat(sequence, read->sequence);
	  } else {
	    strcpy(sequence, read->sequence);
	    strcat(sequence, read->adapter);
	  }

	  sequence[len] = 0; 
	  quality[len] = 0; 
	} else {
	  // sequences
	  sequence = read->sequence;
	  quality = read->quality;
	}
	
	fprintf(out_file, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t%s\t%s\n", 
		read->id,
		sequence,
		quality
		);

	if (read->adapter) {
	  free(sequence);
	  free(quality);
	}
      }
      
      array_list_free(mapping_list, (void *) NULL);
    } // end for num_reads
  }

  // free memory
  sa_mapping_batch_free(mapping_batch);

  if (wf_batch) sa_wf_batch_free(wf_batch);

  return 0;
}
예제 #30
0
/**
 * Drops incoherent CALs from `list` and caps the result at 100 CALs.
 *
 * A CAL is discarded when it has no seed regions, when a seed region starts
 * before the end of the previous one in the read or has read_start >=
 * read_end, or when consecutive seed regions go backwards in the genome or
 * lie more than 2 * read_length apart.
 *
 * When at least one CAL is discarded the survivors are moved to a new list,
 * their num_seeds is refreshed from the seed-region list size, and the input
 * list is freed together with the discarded CALs. Otherwise the input list
 * is kept. CALs beyond the first 100 are freed in either case.
 *
 * @param num_cals     number of CALs in `list`
 * @param read_length  length of the read the CALs belong to
 * @param list         CAL list; ownership passes to this function
 * @return the filtered list (either `list` itself or its replacement)
 */
array_list_t *filter_cals(size_t num_cals, size_t read_length, array_list_t *list) {
  const size_t max_kept_cals = 100;
  cal_t *cal;
  array_list_t *cal_list;

  // filter-incoherent CALs
  // (at least one slot: a zero-length variable-length array is undefined)
  int founds[num_cals > 0 ? num_cals : 1];
  int found = 0;
  for (size_t j = 0; j < num_cals; j++) {
    founds[j] = 0;
    cal = array_list_get(j, list);
    LOG_DEBUG_F("\tcal %zu of %zu: sr_list size = %i (cal->num_seeds = %i) %i:%lu-%lu\n",
                j, num_cals, cal->sr_list->size, cal->num_seeds,
                cal->chromosome_id, cal->start, cal->end);

    if (cal->sr_list->size <= 0) {
      // no seed regions at all
      found = 1;
      founds[j] = 1;
      continue;
    }

    // walk the seed regions in order, remembering where the previous one ended
    int start = 0;
    size_t genome_start = 0;
    int first = 1;
    for (linked_list_item_t *list_item = cal->sr_list->first; list_item != NULL; list_item = list_item->next) {
      seed_region_t *s = list_item->item;

      LOG_DEBUG_F("\t\t:: star %i > %lu s->read_start\n", start, s->read_start);
      LOG_DEBUG_F("\t\t:: read_star %lu > read_end %lu \n", s->read_start, s->read_end);
      if (start > s->read_start || s->read_start >= s->read_end) {
        LOG_DEBUG("\t\t\t:: remove\n");
        found = 1;
        founds[j] = 1;
      }

      if (!first &&
          ((s->genome_start < genome_start) ||
           (s->genome_start - genome_start) > 2 * read_length)) {
        found = 1;
        founds[j] = 1;
      }

      first = 0;
      start = s->read_end + 1;
      genome_start = s->genome_end + 1;
    }
  }

  if (found) {
    cal_list = array_list_new(MAX_CALS, 1.25f, COLLECTION_MODE_ASYNCHRONIZED);
    for (size_t j = 0; j < num_cals; j++) {
      if (!founds[j]) {
        cal = array_list_get(j, list);
        cal->num_seeds = cal->sr_list->size;
        array_list_insert(cal, cal_list);
        // detach the survivor so that freeing the old list leaves it alive
        array_list_set(j, NULL, list);
      }
    }
    array_list_free(list, (void *) cal_free);
    list = cal_list;
  }

  // cap the list, releasing the trailing CALs from the end backwards
  num_cals = array_list_size(list);
  for (size_t j = num_cals; j > max_kept_cals; j--) {
    cal_free(array_list_remove_at(j - 1, list));
  }

  return list;
}