static FileInfo *load_data(char *filename) { FILE *fp; char line[LINE_SIZE]; char d[TIME_STR_SIZE], a[TIME_STR_SIZE]; int m; bool read_error = false, data_error = false; FileInfo *fi = NULL; /* NULL so first realloc is just like malloc() */ size_t l; if (!is_file_valid(filename)) exit(EXIT_FAILURE); if ((fp = fopen(filename, "r")) == NULL) { fprintf(stderr, "failed to open %s\n", filename); exit(EXIT_FAILURE); } for (l = 0;fgets(line, sizeof(line), fp) != NULL; l++) { if (!(fi = realloc(fi, sizeof(FileInfo) + (sizeof(int (*)[2])) * (l + 1)))) { fprintf(stderr, "realloc: %s:%d\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } sscanf(line, "%s %s", d, a); if ((m = to_minutes(d)) == -1) { DATA_ERROR(filename, l + 1, d); break; } fi->data[l][0] = m; if ((m = to_minutes(a)) == -1) { DATA_ERROR(filename, l + 1, a); break; } fi->data[l][1] = m; } fi->size = l; if (!feof(fp) || ferror(fp)) { if (!data_error) { perror(FILE_PATH); read_error = true; errno = 0; } } if (fclose(fp) == EOF || read_error || data_error) { if (!data_error && !read_error) perror(filename); free(fi); exit(EXIT_FAILURE); } return fi; }
/** * returns true if checks are passed */ void runChecks ( const char * accession, const CheckCorruptConfig * config, const VCursor * pa_cursor, const VCursor * sa_cursor, const VCursor * seq_cursor ) { rc_t rc; uint32_t pa_has_ref_offset_idx; uint32_t sa_has_ref_offset_idx; uint32_t sa_seq_spot_id_idx; uint32_t sa_seq_read_id_idx; uint32_t sa_pa_id_idx; uint32_t sa_tmp_mismatch_idx; uint32_t seq_pa_id_idx; uint32_t seq_read_len_idx; uint32_t seq_cmp_read_idx; bool has_tmp_mismatch; /* add columns to cursor */ #define add_column(tbl_name, cursor, idx, col_spec) \ rc = VCursorAddColumn( cursor, &idx, col_spec ); \ if ( rc != 0 ) \ throw VDB_ERROR("VCursorAddColumn() failed for " tbl_name " table, " col_spec " column", rc); add_column( "PRIMARY_ALIGNMENT", pa_cursor, pa_has_ref_offset_idx, "(bool)HAS_REF_OFFSET" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_has_ref_offset_idx, "(bool)HAS_REF_OFFSET" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_spot_id_idx, "SEQ_SPOT_ID" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_read_id_idx, "SEQ_READ_ID" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_pa_id_idx, "PRIMARY_ALIGNMENT_ID" ); add_column( "SEQUENCE", seq_cursor, seq_pa_id_idx, "PRIMARY_ALIGNMENT_ID" ); add_column( "SEQUENCE", seq_cursor, seq_read_len_idx, "READ_LEN" ); add_column( "SEQUENCE", seq_cursor, seq_cmp_read_idx, "CMP_READ" ); // optional columns rc = VCursorAddColumn( sa_cursor, &sa_tmp_mismatch_idx, "TMP_MISMATCH" ); if ( rc == 0 ) has_tmp_mismatch = true; else { has_tmp_mismatch = false; rc = 0; } #undef add_column rc = VCursorOpen( pa_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for PRIMARY_ALIGNMENT table", rc); rc = VCursorOpen( sa_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for SECONDARY_ALIGNMENT table", rc); rc = VCursorOpen( seq_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for SEQUENCE table", rc); int64_t sa_id_first; uint64_t sa_row_count; rc = VCursorIdRange( sa_cursor, sa_pa_id_idx, &sa_id_first, &sa_row_count ); if (rc != 0) throw VDB_ERROR("VCursorIdRange() failed for SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", rc); bool reported_about_no_pa = false; uint64_t pa_longer_sa_rows = 0; uint64_t pa_longer_sa_limit; if (config->pa_len_threshold_percent > 0) pa_longer_sa_limit = ceil( config->pa_len_threshold_percent * sa_row_count ); else if (config->pa_len_threshold_number == 0 || config->pa_len_threshold_number > sa_row_count) pa_longer_sa_limit = sa_row_count; else pa_longer_sa_limit = config->pa_len_threshold_number; uint64_t sa_row_limit; if (config->sa_cutoff_percent > 0) sa_row_limit = ceil( config->sa_cutoff_percent * sa_row_count ); else if (config->sa_cutoff_number == 0 || config->sa_cutoff_number > sa_row_count) sa_row_limit = sa_row_count; else sa_row_limit = config->sa_cutoff_number; for ( uint64_t i = 0; i < sa_row_count && i < sa_row_limit; ++i ) { int64_t sa_row_id = i + sa_id_first; const void * data_ptr = NULL; uint32_t data_len; uint32_t pa_row_len; uint32_t sa_row_len; uint32_t seq_read_len_len; // SA:HAS_REF_OFFSET rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_has_ref_offset_idx, NULL, (const void**)&data_ptr, NULL, &sa_row_len ); if ( rc != 0 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, HAS_REF_OFFSET column", sa_row_id, rc); const int64_t * p_seq_spot_id; uint32_t seq_spot_id_len; // SA:SEQ_SPOT_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_seq_spot_id_idx, NULL, (const void**)&p_seq_spot_id, NULL, &seq_spot_id_len ); if ( rc != 0 || p_seq_spot_id == NULL || seq_spot_id_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_SPOT_ID column", sa_row_id, rc); int64_t seq_spot_id = *p_seq_spot_id; if (seq_spot_id == 0) { std::stringstream ss; ss << "SECONDARY_ALIGNMENT:" << sa_row_id << " has SEQ_SPOT_ID = " << seq_spot_id; throw DATA_ERROR(ss.str()); } if ( has_tmp_mismatch ) { const char * p_sa_tmp_mismatch; // SA:TMP_MISMATCH rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_tmp_mismatch_idx, NULL, (const void**)&p_sa_tmp_mismatch, NULL, &data_len ); if ( rc != 0 || p_sa_tmp_mismatch == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, TMP_MISMATCH column", sa_row_id, rc); for ( uint32_t j = 0; j < data_len; ++j ) { if ( p_sa_tmp_mismatch[j] == '=' ) { std::stringstream ss; ss << "SECONDARY_ALIGNMENT:" << sa_row_id << " TMP_MISMATCH contains '='"; throw DATA_ERROR(ss.str()); } } } const int64_t * p_pa_row_id; // SA:PRIMARY_ALIGNMENT_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_pa_id_idx, NULL, (const void**)&p_pa_row_id, NULL, &data_len ); if ( rc != 0 || p_pa_row_id == NULL || data_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", sa_row_id, rc); int64_t pa_row_id = *p_pa_row_id; if (pa_row_id == 0) { if (!reported_about_no_pa) { PLOGMSG (klogInfo, (klogInfo, "$(ACC) has secondary alignments without primary", "ACC=%s", accession)); reported_about_no_pa = true; } continue; } // PA:HAS_REF_OFFSET rc = VCursorCellDataDirect ( pa_cursor, pa_row_id, pa_has_ref_offset_idx, NULL, &data_ptr, NULL, &pa_row_len ); if ( rc != 0 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on PRIMARY_ALIGNMENT table, HAS_REF_OFFSET column", pa_row_id, rc); // move on when PA.len equal to SA.len if (pa_row_len == sa_row_len) continue; if (pa_row_len < sa_row_len) { std::stringstream ss; ss << "PRIMARY_ALIGNMENT:" << pa_row_id << " HAS_REF_OFFSET length (" << pa_row_len << ") less than SECONDARY_ALIGNMENT:" << sa_row_id << " HAS_REF_OFFSET length (" << sa_row_len << ")"; throw DATA_ERROR(ss.str()); } // we already know that pa_row_len > sa_row_len ++pa_longer_sa_rows; const int32_t * p_seq_read_id; // SA:SEQ_READ_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_seq_read_id_idx, NULL, (const void**)&p_seq_read_id, NULL, &data_len ); if ( rc != 0 || p_seq_read_id == NULL || data_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_READ_ID column", sa_row_id, rc); // one-based read index int32_t seq_read_id = *p_seq_read_id; const uint32_t * p_seq_read_len; // SEQ:READ_LEN rc = VCursorCellDataDirect ( seq_cursor, seq_spot_id, seq_read_len_idx, NULL, (const void**)&p_seq_read_len, NULL, &seq_read_len_len ); if ( rc != 0 || p_seq_read_len == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", seq_spot_id, rc); if ( seq_read_id < 1 || (uint32_t)seq_read_id > seq_read_len_len ) { std::stringstream ss; ss << "SECONDARY:" << sa_row_id << " SEQ_READ_ID value (" << seq_read_id << ") - 1 based, is out of SEQUENCE:" << seq_spot_id << " READ_LEN range (" << seq_read_len_len << ")"; throw DATA_ERROR(ss.str()); } if (pa_row_len != p_seq_read_len[seq_read_id - 1]) { std::stringstream ss; ss << "PRIMARY_ALIGNMENT:" << pa_row_id << " HAS_REF_OFFSET length (" << pa_row_len << ") does not match its SEQUENCE:" << seq_spot_id << " READ_LEN[" << seq_read_id - 1 << "] value (" << p_seq_read_len[seq_read_id - 1] << ")"; throw DATA_ERROR(ss.str()); } if (pa_longer_sa_rows >= pa_longer_sa_limit) { std::stringstream ss; ss << "Limit violation (pa_longer_sa): there are at least " << pa_longer_sa_rows << " alignments where HAS_REF_OFFSET column is longer in PRIMARY_ALIGNMENT than in SECONDARY_ALIGNMENT"; throw DATA_ERROR(ss.str()); } } int64_t seq_id_first; uint64_t seq_row_count; rc = VCursorIdRange( seq_cursor, seq_pa_id_idx, &seq_id_first, &seq_row_count ); if (rc != 0) throw VDB_ERROR("VCursorIdRange() failed for SEQUENCE table, PRIMARY_ALIGNMENT_ID column", rc); uint64_t seq_row_limit; if (config->seq_cutoff_percent > 0) seq_row_limit = ceil( config->seq_cutoff_percent * seq_row_count ); else if (config->seq_cutoff_number == 0 || config->seq_cutoff_number > seq_row_count) seq_row_limit = seq_row_count; else seq_row_limit = config->seq_cutoff_number; for ( uint64_t i = 0; i < seq_row_count && i < seq_row_limit; ++i ) { int64_t seq_row_id = i + seq_id_first; const void * data_ptr = NULL; uint32_t data_len; const int64_t * p_seq_pa_id; uint32_t seq_pa_id_len; // SEQ:PRIMARY_ALIGNMENT_ID rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_pa_id_idx, NULL, (const void**)&p_seq_pa_id, NULL, &seq_pa_id_len ); if ( rc != 0 || p_seq_pa_id == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, PRIMARY_ALIGNMENT_ID column", seq_row_id, rc); const uint32_t * p_seq_read_len; // SEQ:READ_LEN rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_read_len_idx, NULL, (const void**)&p_seq_read_len, NULL, &data_len ); if ( rc != 0 || p_seq_read_len == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", seq_row_id, rc); if ( seq_pa_id_len != data_len ) { std::stringstream ss; ss << "SEQUENCE:" << seq_row_id << " PRIMARY_ALIGNMENT_ID length (" << seq_pa_id_len << ") does not match SEQUENCE:" << seq_row_id << " READ_LEN length (" << data_len << ")"; throw DATA_ERROR(ss.str()); } uint64_t sum_unaligned_read_len = 0; for ( uint32_t j = 0; j < seq_pa_id_len; ++j ) { if ( p_seq_pa_id[j] == 0 ) { sum_unaligned_read_len += p_seq_read_len[j]; } } // SEQ:CMP_READ rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_cmp_read_idx, NULL, (const void**)&data_ptr, NULL, &data_len ); if ( rc != 0 || data_ptr == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, SEQ:CMP_READ column", seq_row_id, rc); if ( sum_unaligned_read_len != data_len ) { std::stringstream ss; ss << "SEQUENCE:" << seq_row_id << " CMP_READ length (" << data_len << ") does not match sum of unaligned READ_LEN values (" << sum_unaligned_read_len << ")"; throw DATA_ERROR(ss.str()); } } if (sa_row_limit < sa_row_count || seq_row_limit < seq_row_count) PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good (based on first $(SA_CUTOFF) of SECONDARY_ALIGNMENT and $(SEQ_CUTOFF) SEQUENCE rows)", "ACC=%s,SA_CUTOFF=%lu,SEQ_CUTOFF=%lu", accession, sa_row_limit, seq_row_limit)); else PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good", "ACC=%s", accession)); }