/**
 * Keeps either the indels or the non-indels of a batch of records.
 *
 * A record is kept when its indel status (read from the statistics entry stored at
 * the same index) agrees with the include_indels flag of the filter arguments:
 * indels are kept when the flag is set, non-indels when it is not. Every other
 * record is annotated with the filter name and inserted into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   per-record statistics, indexed like input_records
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to an indel_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t *indel_filter(array_list_t *input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void *f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    int include_indels = ((indel_filter_args*)f_args)->include_indels;
    LOG_DEBUG_F("indel_filter (preserve indels = %d) over %zu records\n", include_indels, input_records->size);

    // Normalised once: only the truthiness of the flag matters
    int indels_wanted = include_indels ? 1 : 0;

    for (int idx = 0; idx < input_records->size; idx++) {
        vcf_record_t *current = input_records->items[idx];
        variant_stats_t *current_stats = input_stats[idx];

        int record_is_indel = current_stats->is_indel ? 1 : 0;

        // Mismatch between what the record is and what was requested: reject it
        if (record_is_indel != indels_wanted) {
            annotate_failed_record(filter_name, filter_name_len, current);
            array_list_insert(current, failed);
            continue;
        }

        array_list_insert(current, passed);
    }

    return passed;
}
/**
 * Keeps either the records that carry an identifier or the ones that do not.
 *
 * A record whose ID is exactly "." (the VCF marker for a missing value) counts as
 * having no identifier. When the include_snps flag of the filter arguments is set,
 * records with an identifier are kept; when it is not set, records without one are
 * kept. Every other record is annotated with the filter name and inserted into the
 * failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   not used by this filter
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to a snp_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t *snp_filter(array_list_t *input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void *f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    int include_snps = ((snp_filter_args*)f_args)->include_snps;
    LOG_DEBUG_F("snp_filter (preserve SNPs = %d) over %zu records\n", include_snps, input_records->size);

    int ids_wanted = include_snps ? 1 : 0;

    for (int idx = 0; idx < input_records->size; idx++) {
        vcf_record_t *current = input_records->items[idx];

        // The ID text is only inspected when its length is exactly one character
        int has_identifier = 1;
        if (current->id_len == 1 && current->id[0] == '.') {
            has_identifier = 0;
        }

        if (has_identifier == ids_wanted) {
            array_list_insert(current, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, current);
            array_list_insert(current, failed);
        }
    }

    return passed;
}
/**
 * Keeps the records whose number of mendelian errors does not exceed a maximum.
 *
 * The statistics entry at index i describes the record at index i of input_records,
 * so a statistics entry that satisfies the condition means the record at the same
 * position passes. Records above the limit are annotated with the filter name and
 * inserted into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   per-record statistics, indexed like input_records
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param args          pointer to a mendelian_errors_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t* mendelian_errors_filter(array_list_t* input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void* args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    int max_errors = ((mendelian_errors_filter_args*) args)->max_mendelian_errors;

    // Unsigned index: the list size is unsigned, so a signed counter could overflow
    // on very large batches
    for (size_t i = 0; i < input_records->size; i++) {
        vcf_record_t *record = input_records->items[i];
        variant_stats_t *variant_stats = input_stats[i];

        if (variant_stats->mendelian_errors <= max_errors) {
            array_list_insert(record, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, record);
            array_list_insert(record, failed);
        }
    }

    return passed;
}
/**
 * Keeps the records whose minor allele frequency reaches a minimum.
 *
 * For each record, the smallest value among the allele frequencies of its statistics
 * entry (capped at 1.0, which is also the value used when there are no alleles) is
 * compared against min_maf from the filter arguments. Records below the threshold are
 * annotated with the filter name and inserted into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   per-record statistics, indexed like input_records
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param args          pointer to a maf_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t* maf_filter(array_list_t* input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void* args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    float min_maf = ((maf_filter_args*) args)->min_maf;

    for (int rec_idx = 0; rec_idx < input_records->size; rec_idx++) {
        vcf_record_t *current = input_records->items[rec_idx];
        variant_stats_t *current_stats = input_stats[rec_idx];

        // Smallest allele frequency seen for this record, starting from the cap of 1.0
        float lowest_freq = 1.0;
        for (int allele = 0; allele < current_stats->num_alleles; allele++) {
            lowest_freq = fmin(lowest_freq, current_stats->alleles_freq[allele]);
        }

        int frequent_enough = (lowest_freq >= min_maf);
        if (frequent_enough) {
            array_list_insert(current, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, current);
            array_list_insert(current, failed);
        }
    }

    return passed;
}
/**
 * Keeps the records in which enough samples follow an inheritance pattern.
 *
 * For the DOMINANT pattern a record is kept when both cases_percent_dominant and
 * controls_percent_dominant of its statistics entry reach min_following_pattern;
 * for the RECESSIVE pattern the corresponding *_recessive fields are used. Records
 * that do not reach the threshold are annotated with the filter name and inserted
 * into the failed list.
 *
 * NOTE: when the pattern is neither DOMINANT nor RECESSIVE, records are inserted in
 * neither list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   per-record statistics, indexed like input_records
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to an inheritance_pattern_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t *inheritance_pattern_filter(array_list_t *input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void *f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    inheritance_pattern_filter_args *pattern_args = (inheritance_pattern_filter_args*) f_args;
    enum inheritance_pattern pattern = pattern_args->pattern;
    float min_following_pattern = pattern_args->min_following_pattern;

    // The literal percent sign must be written as "%%": a bare '%' followed by " o"
    // is read as a conversion specification and consumes the size argument
    if (pattern == DOMINANT) {
        LOG_DEBUG_F("inheritance_pattern_filter (dominant in %.2f%% of samples) over %zu records\n",
                    min_following_pattern * 100, input_records->size);
    } else {
        LOG_DEBUG_F("inheritance_pattern_filter (recessive in %.2f%% of samples) over %zu records\n",
                    min_following_pattern * 100, input_records->size);
    }

    for (size_t i = 0; i < input_records->size; i++) {
        vcf_record_t *record = input_records->items[i];
        variant_stats_t *stats = input_stats[i];

        int follows_pattern;
        if (pattern == DOMINANT) {
            follows_pattern = stats->cases_percent_dominant >= min_following_pattern &&
                              stats->controls_percent_dominant >= min_following_pattern;
        } else if (pattern == RECESSIVE) {
            follows_pattern = stats->cases_percent_recessive >= min_following_pattern &&
                              stats->controls_percent_recessive >= min_following_pattern;
        } else {
            // Unknown pattern: the record goes to neither list
            continue;
        }

        if (follows_pattern) {
            array_list_insert(record, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, record);
            array_list_insert(record, failed);
        }
    }

    return passed;
}
/**
 * Keeps the records whose coverage, read from the "DP" key of the INFO field,
 * reaches a minimum.
 *
 * The INFO text of each record is copied into a scratch buffer, which is handed to
 * get_field_value_in_info to look up "DP". A record is kept when a value is found,
 * is_numeric accepts it and its integer value is at least min_coverage. Records
 * with a lower, missing or non-numeric coverage are annotated with the filter name
 * and inserted into the failed list.
 *
 * The scratch buffer is sized by capacity and NUL-terminated after every copy, so
 * a short INFO field never picks up trailing characters left by a longer one
 * processed earlier. If the buffer cannot be grown, the coverage of that record is
 * treated as unavailable and the record fails the filter.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   not used by this filter
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to a coverage_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t* coverage_filter(array_list_t* input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void* f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    int min_coverage = ((coverage_filter_args*)f_args)->min_coverage;
    LOG_DEBUG_F("coverage_filter (min coverage = %d) over %zu records\n", min_coverage, input_records->size);

    // Scratch copy of the INFO field; its capacity is tracked explicitly because the
    // length of its current contents says nothing about how much room it has
    size_t aux_capacity = 128;
    char *aux_buffer = calloc(aux_capacity, sizeof(char));
    if (aux_buffer == NULL) {
        // Allocation is retried on the first record (realloc(NULL, n) allocates)
        aux_capacity = 0;
    }

    for (size_t i = 0; i < input_records->size; i++) {
        vcf_record_t *record = input_records->items[i];
        char *record_coverage = NULL;

        size_t needed = (size_t) record->info_len + 1;
        if (needed > aux_capacity) {
            // Grow through a temporary so the old buffer survives a failed realloc
            char *grown = realloc(aux_buffer, needed);
            if (grown != NULL) {
                aux_buffer = grown;
                aux_capacity = needed;
            } else {
                LOG_DEBUG_F("coverage_filter: unable to allocate %zu bytes for the INFO field\n", needed);
            }
        }

        if (needed <= aux_capacity) {
            strncpy(aux_buffer, record->info, record->info_len);
            // strncpy does not terminate when the source fills the whole length
            aux_buffer[record->info_len] = '\0';

            // NOTE(review): ownership of the returned pointer is not visible here;
            // confirm whether it points into aux_buffer or must be freed
            record_coverage = get_field_value_in_info("DP", aux_buffer);
        }

        if (record_coverage != NULL && is_numeric(record_coverage) && atoi(record_coverage) >= min_coverage) {
            array_list_insert(record, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, record);
            array_list_insert(record, failed);
        }
    }

    free(aux_buffer);

    return passed;
}
/**
 * Keeps the records whose position falls inside one of a set of regions.
 *
 * For every record a single-position probe region (start and end both equal to the
 * record position, chromosome copied from the record) is looked up in the region
 * table of the filter arguments. When the arguments carry a type, the probe is
 * given that type and find_region_by_type is used; otherwise find_region is used.
 * Records for which the lookup reports nothing are annotated with the filter name
 * and inserted into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   not used by this filter
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to a region_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t *region_filter(array_list_t *input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void *f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    region_filter_args *args = (region_filter_args*) f_args;
    region_table_t *regions = args->regions;

    LOG_DEBUG_F("region_filter over %zu records\n", input_records->size);

    // One probe object is reused for the whole batch; only its chromosome copy is
    // allocated and released per record
    region_t *probe = (region_t*) malloc (sizeof(region_t));

    for (int idx = 0; idx < input_records->size; idx++) {
        vcf_record_t *current = input_records->items[idx];

        probe->chromosome = strndup(current->chromosome, current->chromosome_len);
        probe->start_position = current->position;
        probe->end_position = current->position;

        int matched;
        if (!args->type) {
            matched = find_region(probe, regions);
        } else {
            probe->type = args->type;
            matched = find_region_by_type(probe, regions);
        }

        if (!matched) {
            // The record lies outside every region
            annotate_failed_record(filter_name, filter_name_len, current);
            array_list_insert(current, failed);
        } else {
            // The record lies inside at least one region
            array_list_insert(current, passed);
        }

        free(probe->chromosome);
    }

    free(probe);

    return passed;
}
/**
 * Keeps the records whose quality reaches a minimum.
 *
 * A record is kept when its quality field is greater than or equal to min_quality
 * from the filter arguments. Every other record is annotated with the filter name
 * and inserted into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   not used by this filter
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to a quality_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t* quality_filter(array_list_t* input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void* f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    int min_quality = ((quality_filter_args*)f_args)->min_quality;
    LOG_DEBUG_F("quality_filter (min quality = %d) over %zu records\n", min_quality, input_records->size);

    for (int idx = 0; idx < input_records->size; idx++) {
        vcf_record_t *current = input_records->items[idx];

        int good_enough = (current->quality >= min_quality);
        if (good_enough) {
            array_list_insert(current, passed);
            continue;
        }

        annotate_failed_record(filter_name, filter_name_len, current);
        array_list_insert(current, failed);
    }

    return passed;
}
/**
 * Keeps the records whose proportion of missing alleles does not exceed a maximum.
 *
 * The statistics entry at index i describes the record at index i of input_records.
 * For each record the proportion is missing_alleles divided by the sum of all allele
 * counts plus missing_alleles. Records above max_missing are annotated with the
 * filter name and inserted into the failed list.
 *
 * A record with no counted and no missing alleles has no defined proportion; it is
 * rejected. This is checked explicitly instead of relying on how a 0/0 result
 * compares.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   per-record statistics, indexed like input_records
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param args          pointer to a missing_values_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t* missing_values_filter(array_list_t* input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void* args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    float max_missing = ((missing_values_filter_args*) args)->max_missing;

    for (size_t i = 0; i < input_records->size; i++) {
        vcf_record_t *record = input_records->items[i];
        variant_stats_t *variant_stats = input_stats[i];

        float allele_count = 0;
        for (int j = 0; j < variant_stats->num_alleles; j++) {
            allele_count += variant_stats->alleles_count[j];
        }

        float total_alleles = allele_count + variant_stats->missing_alleles;

        // The division is only evaluated when there is something to divide by
        if (total_alleles > 0 && variant_stats->missing_alleles / total_alleles <= max_missing) {
            array_list_insert(record, passed);
        } else {
            annotate_failed_record(filter_name, filter_name_len, record);
            array_list_insert(record, failed);
        }
    }

    return passed;
}
/**
 * Keeps the records of one variant type.
 *
 * A record is kept when its type field equals the type stored in the filter
 * arguments. Every other record is annotated with the filter name and inserted
 * into the failed list.
 *
 * @param input_records records to examine; must not be NULL
 * @param failed        list that receives the rejected records; must not be NULL
 * @param input_stats   not used by this filter
 * @param filter_name   name handed to annotate_failed_record for rejected records
 * @param f_args        pointer to a variant_type_filter_args structure
 * @return newly created list containing the records that were kept
 */
array_list_t *variant_type_filter(array_list_t *input_records, array_list_t *failed, variant_stats_t **input_stats, char *filter_name, void *f_args) {
    assert(input_records);
    assert(failed);

    array_list_t *passed = array_list_new(input_records->size + 1, 1, COLLECTION_MODE_ASYNCHRONIZED);
    size_t filter_name_len = strlen(filter_name);

    enum variant_type type = ((variant_type_filter_args*)f_args)->type;
    LOG_DEBUG_F("variant_type_filter (variant_type %d) over %zu records\n", type, input_records->size);

    for (int idx = 0; idx < input_records->size; idx++) {
        vcf_record_t *current = input_records->items[idx];

        if (current->type != type) {
            annotate_failed_record(filter_name, filter_name_len, current);
            array_list_insert(current, failed);
            continue;
        }

        array_list_insert(current, passed);
    }

    return passed;
}