Пример #1
0
/*
 * Reads `filename` line by line and builds a FileInfo holding one pair of
 * minute values per line: data[i][0] from the first whitespace-separated
 * token, data[i][1] from the second, each converted with to_minutes().
 *
 * Returns a heap-allocated FileInfo (release with free()) whose `size` field
 * is the number of lines stored.  Never returns NULL: every failure path
 * (invalid file, open/read/close error, allocation failure, bad data)
 * terminates the process with EXIT_FAILURE.
 *
 * NOTE(review): DATA_ERROR is defined elsewhere; the !data_error tests below
 * only make sense if it sets the local `data_error` flag - confirm.
 */
static FileInfo *load_data(char *filename)
{
  FILE *fp;
  char line[LINE_SIZE];
  /*
   * Token buffers are as large as the line buffer: a token scanned out of
   * `line` can never be longer than `line` itself, so the unbounded %s
   * conversions below cannot overflow them.
   */
  char d[LINE_SIZE], a[LINE_SIZE];
  int m;
  bool read_error = false, data_error = false;
  FileInfo *fi = NULL;  /* NULL so first realloc is just like malloc() */
  size_t l;

  if (!is_file_valid(filename))
    exit(EXIT_FAILURE);

  if ((fp = fopen(filename, "r")) == NULL) {
    fprintf(stderr, "failed to open %s\n", filename);
    exit(EXIT_FAILURE);
  }
  for (l = 0; fgets(line, sizeof(line), fp) != NULL; l++)
  {
    /*
     * Grow by one data row.  sizeof(fi->data[0]) is the size of one row
     * (the operand is not evaluated, so fi == NULL is fine here); the
     * previous sizeof(int (*)[2]) was the size of a pointer and only
     * matched the row size by coincidence on some platforms.
     */
    if (!(fi = realloc(fi, sizeof(FileInfo) + sizeof(fi->data[0]) * (l + 1)))) {
      fprintf(stderr, "realloc: %s:%d\n", __FILE__, __LINE__);
      exit(EXIT_FAILURE);
    }

    /*
     * Clear both tokens first so that a blank or one-field line is seen as
     * an empty token instead of silently reusing the previous line's values.
     * A successful %s conversion never yields an empty string.
     */
    d[0] = a[0] = '\0';
    (void)sscanf(line, "%s %s", d, a);

    if (d[0] == '\0' || (m = to_minutes(d)) == -1) {
      DATA_ERROR(filename, l + 1, d);
      break;
    }
    fi->data[l][0] = m;
    if (a[0] == '\0' || (m = to_minutes(a)) == -1) {
      DATA_ERROR(filename, l + 1, a);
      break;
    }
    fi->data[l][1] = m;
  }

  /* Empty file (or failure on the very first read): no row was ever
     allocated, so create a bare header before storing the size. */
  if (fi == NULL && (fi = malloc(sizeof(FileInfo))) == NULL) {
    fprintf(stderr, "malloc: %s:%d\n", __FILE__, __LINE__);
    exit(EXIT_FAILURE);
  }
  fi->size = l;

  /* Leaving the loop without reaching EOF means either a data error (already
     reported) or a genuine read failure. */
  if (!feof(fp) || ferror(fp)) {
    if (!data_error)  {
      perror(filename);  /* report against the file that was actually read */
      read_error = true;
      errno = 0;
    }
  }

  if (fclose(fp) == EOF || read_error || data_error) {
    /* Only a close failure still needs a message; the others were reported. */
    if (!data_error && !read_error)
      perror(filename);
    free(fi);
    exit(EXIT_FAILURE);
  }
  return fi;
}
Пример #2
0
/**
 * Cross-checks the SECONDARY_ALIGNMENT, PRIMARY_ALIGNMENT and SEQUENCE tables
 * of one accession through the three supplied cursors.
 *
 * Nothing is returned.  The first failing VDB call or inconsistent row is
 * reported by throwing (VDB_ERROR, VDB_ROW_ERROR or DATA_ERROR); when all
 * examined rows pass, a "looks good" message is logged for the accession.
 *
 * accession   name used in log messages
 * config      row cutoffs for both scans and the threshold for rows whose
 *             HAS_REF_OFFSET is longer in PRIMARY than in SECONDARY
 * pa_cursor   cursor on PRIMARY_ALIGNMENT (columns are added and it is opened here)
 * sa_cursor   cursor on SECONDARY_ALIGNMENT (same)
 * seq_cursor  cursor on SEQUENCE (same)
 */
void runChecks ( const char * accession, const CheckCorruptConfig * config, const VCursor * pa_cursor, const VCursor * sa_cursor, const VCursor * seq_cursor )
{
    rc_t rc;

    // column handles, named after <table>_<column>
    uint32_t pa_has_ref_offset_col;
    uint32_t sa_has_ref_offset_col;
    uint32_t sa_seq_spot_id_col;
    uint32_t sa_seq_read_id_col;
    uint32_t sa_pa_id_col;
    uint32_t sa_tmp_mismatch_col;
    uint32_t seq_pa_id_col;
    uint32_t seq_read_len_col;
    uint32_t seq_cmp_read_col;

    // Mandatory columns: any failure to add one aborts the whole check.
    // tbl and spec must be string literals, they are pasted into the message.
#define add_column(tbl, cur, col, spec) \
    do \
    { \
        rc = VCursorAddColumn( cur, &col, spec ); \
        if ( rc != 0 ) \
            throw VDB_ERROR("VCursorAddColumn() failed for " tbl " table, " spec " column", rc); \
    } while ( 0 )

    add_column( "PRIMARY_ALIGNMENT", pa_cursor, pa_has_ref_offset_col, "(bool)HAS_REF_OFFSET" );
    add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_has_ref_offset_col, "(bool)HAS_REF_OFFSET" );
    add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_spot_id_col, "SEQ_SPOT_ID" );
    add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_read_id_col, "SEQ_READ_ID" );
    add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_pa_id_col, "PRIMARY_ALIGNMENT_ID" );
    add_column( "SEQUENCE", seq_cursor, seq_pa_id_col, "PRIMARY_ALIGNMENT_ID" );
    add_column( "SEQUENCE", seq_cursor, seq_read_len_col, "READ_LEN" );
    add_column( "SEQUENCE", seq_cursor, seq_cmp_read_col, "CMP_READ" );

#undef add_column

    // Optional column: a failure here only disables the TMP_MISMATCH check.
    rc = VCursorAddColumn( sa_cursor, &sa_tmp_mismatch_col, "TMP_MISMATCH" );
    const bool check_tmp_mismatch = ( rc == 0 );

    // Open the cursors in the order PA, SA, SEQ.
    rc = VCursorOpen( pa_cursor );
    if ( rc != 0 )
    {
        throw VDB_ERROR("VCursorOpen() failed for PRIMARY_ALIGNMENT table", rc);
    }
    rc = VCursorOpen( sa_cursor );
    if ( rc != 0 )
    {
        throw VDB_ERROR("VCursorOpen() failed for SECONDARY_ALIGNMENT table", rc);
    }
    rc = VCursorOpen( seq_cursor );
    if ( rc != 0 )
    {
        throw VDB_ERROR("VCursorOpen() failed for SEQUENCE table", rc);
    }

    // ---------------------------------------------------------------
    // Pass 1: SECONDARY_ALIGNMENT rows
    // ---------------------------------------------------------------
    int64_t sa_first_id;
    uint64_t sa_total_rows;

    rc = VCursorIdRange( sa_cursor, sa_pa_id_col, &sa_first_id, &sa_total_rows );
    if ( rc != 0 )
    {
        throw VDB_ERROR("VCursorIdRange() failed for SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", rc);
    }

    // Number of "PA longer than SA" rows at which the scan is aborted:
    // a positive percentage wins, otherwise a usable absolute number,
    // otherwise the table size.
    uint64_t max_pa_longer_rows = sa_total_rows;
    if ( config->pa_len_threshold_percent > 0 )
    {
        max_pa_longer_rows = ceil( config->pa_len_threshold_percent * sa_total_rows );
    }
    else if ( config->pa_len_threshold_number != 0 && !( config->pa_len_threshold_number > sa_total_rows ) )
    {
        max_pa_longer_rows = config->pa_len_threshold_number;
    }

    // Number of SA rows to examine, chosen by the same scheme.
    uint64_t sa_scan_limit = sa_total_rows;
    if ( config->sa_cutoff_percent > 0 )
    {
        sa_scan_limit = ceil( config->sa_cutoff_percent * sa_total_rows );
    }
    else if ( config->sa_cutoff_number != 0 && !( config->sa_cutoff_number > sa_total_rows ) )
    {
        sa_scan_limit = config->sa_cutoff_number;
    }

    bool no_pa_already_logged = false;
    uint64_t pa_longer_rows_seen = 0;

    const uint64_t sa_rows_to_scan = ( sa_scan_limit < sa_total_rows ) ? sa_scan_limit : sa_total_rows;
    for ( uint64_t n = 0; n < sa_rows_to_scan; ++n )
    {
        const int64_t sa_row = n + sa_first_id;
        const void * unused_base = NULL; // only the cell lengths of HAS_REF_OFFSET are of interest

        // SA:HAS_REF_OFFSET
        uint32_t sa_ref_offset_len;
        rc = VCursorCellDataDirect ( sa_cursor, sa_row, sa_has_ref_offset_col, NULL, &unused_base, NULL, &sa_ref_offset_len );
        if ( rc != 0 )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, HAS_REF_OFFSET column", sa_row, rc);
        }

        // SA:SEQ_SPOT_ID - must be a single non-zero value
        const int64_t * spot_id_cell;
        uint32_t spot_id_count;
        rc = VCursorCellDataDirect ( sa_cursor, sa_row, sa_seq_spot_id_col, NULL, reinterpret_cast<const void**>( &spot_id_cell ), NULL, &spot_id_count );
        if ( rc != 0 || spot_id_cell == NULL || spot_id_count != 1 )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_SPOT_ID column", sa_row, rc);
        }

        const int64_t spot_id = *spot_id_cell;
        if ( spot_id == 0 )
        {
            std::stringstream msg;
            msg << "SECONDARY_ALIGNMENT:" << sa_row << " has SEQ_SPOT_ID = " << spot_id;

            throw DATA_ERROR(msg.str());
        }

        // SA:TMP_MISMATCH - when present, must not contain '='
        if ( check_tmp_mismatch )
        {
            const char * mismatch_cell;
            uint32_t mismatch_len;
            rc = VCursorCellDataDirect ( sa_cursor, sa_row, sa_tmp_mismatch_col, NULL, reinterpret_cast<const void**>( &mismatch_cell ), NULL, &mismatch_len );
            if ( rc != 0 || mismatch_cell == NULL )
            {
                throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, TMP_MISMATCH column", sa_row, rc);
            }

            for ( uint32_t k = 0; k < mismatch_len; ++k )
            {
                if ( mismatch_cell[k] != '=' )
                    continue;

                std::stringstream msg;
                msg << "SECONDARY_ALIGNMENT:" << sa_row << " TMP_MISMATCH contains '='";

                throw DATA_ERROR(msg.str());
            }
        }

        // SA:PRIMARY_ALIGNMENT_ID - must be a single value
        const int64_t * pa_id_cell;
        uint32_t pa_id_count;
        rc = VCursorCellDataDirect ( sa_cursor, sa_row, sa_pa_id_col, NULL, reinterpret_cast<const void**>( &pa_id_cell ), NULL, &pa_id_count );
        if ( rc != 0 || pa_id_cell == NULL || pa_id_count != 1 )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", sa_row, rc);
        }

        const int64_t pa_row = *pa_id_cell;
        if ( pa_row == 0 )
        {
            // no primary alignment to compare against: mention it once, skip the row
            if ( !no_pa_already_logged )
            {
                PLOGMSG (klogInfo, (klogInfo, "$(ACC) has secondary alignments without primary", "ACC=%s", accession));
                no_pa_already_logged = true;
            }
            continue;
        }

        // PA:HAS_REF_OFFSET
        uint32_t pa_ref_offset_len;
        rc = VCursorCellDataDirect ( pa_cursor, pa_row, pa_has_ref_offset_col, NULL, &unused_base, NULL, &pa_ref_offset_len );
        if ( rc != 0 )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on PRIMARY_ALIGNMENT table, HAS_REF_OFFSET column", pa_row, rc);
        }

        // equal lengths need no further checks
        if ( pa_ref_offset_len == sa_ref_offset_len )
        {
            continue;
        }

        // a primary shorter than its secondary is always an error
        if ( pa_ref_offset_len < sa_ref_offset_len )
        {
            std::stringstream msg;
            msg << "PRIMARY_ALIGNMENT:" << pa_row << " HAS_REF_OFFSET length (" << pa_ref_offset_len << ") less than SECONDARY_ALIGNMENT:" << sa_row << " HAS_REF_OFFSET length (" << sa_ref_offset_len << ")";

            throw DATA_ERROR(msg.str());
        }

        // from here on the primary is strictly longer than the secondary
        ++pa_longer_rows_seen;

        // SA:SEQ_READ_ID - one-based index of the read within the spot
        const int32_t * read_id_cell;
        uint32_t read_id_count;
        rc = VCursorCellDataDirect ( sa_cursor, sa_row, sa_seq_read_id_col, NULL, reinterpret_cast<const void**>( &read_id_cell ), NULL, &read_id_count );
        if ( rc != 0 || read_id_cell == NULL || read_id_count != 1 )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_READ_ID column", sa_row, rc);
        }
        const int32_t read_id = *read_id_cell;

        // SEQ:READ_LEN of the spot this alignment belongs to
        const uint32_t * read_len_cell;
        uint32_t read_len_count;
        rc = VCursorCellDataDirect ( seq_cursor, spot_id, seq_read_len_col, NULL, reinterpret_cast<const void**>( &read_len_cell ), NULL, &read_len_count );
        if ( rc != 0 || read_len_cell == NULL )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", spot_id, rc);
        }

        if ( read_id < 1 || (uint32_t)read_id > read_len_count )
        {
            std::stringstream msg;
            msg << "SECONDARY:" << sa_row << " SEQ_READ_ID value (" << read_id << ") - 1 based, is out of SEQUENCE:" << spot_id << " READ_LEN range (" << read_len_count << ")";

            throw DATA_ERROR(msg.str());
        }

        // the longer primary must at least agree with the read length
        const int32_t read_index = read_id - 1;
        const uint32_t expected_len = read_len_cell[read_index];
        if ( pa_ref_offset_len != expected_len )
        {
            std::stringstream msg;
            msg << "PRIMARY_ALIGNMENT:" << pa_row << " HAS_REF_OFFSET length (" << pa_ref_offset_len << ") does not match its SEQUENCE:" << spot_id << " READ_LEN[" << read_index << "] value (" << expected_len << ")";

            throw DATA_ERROR(msg.str());
        }

        if ( pa_longer_rows_seen >= max_pa_longer_rows )
        {
            std::stringstream msg;
            msg << "Limit violation (pa_longer_sa): there are at least " << pa_longer_rows_seen << " alignments where HAS_REF_OFFSET column is longer in PRIMARY_ALIGNMENT than in SECONDARY_ALIGNMENT";

            throw DATA_ERROR(msg.str());
        }
    }

    // ---------------------------------------------------------------
    // Pass 2: SEQUENCE rows
    // ---------------------------------------------------------------
    int64_t seq_first_id;
    uint64_t seq_total_rows;

    rc = VCursorIdRange( seq_cursor, seq_pa_id_col, &seq_first_id, &seq_total_rows );
    if ( rc != 0 )
    {
        throw VDB_ERROR("VCursorIdRange() failed for SEQUENCE table, PRIMARY_ALIGNMENT_ID column", rc);
    }

    uint64_t seq_scan_limit = seq_total_rows;
    if ( config->seq_cutoff_percent > 0 )
    {
        seq_scan_limit = ceil( config->seq_cutoff_percent * seq_total_rows );
    }
    else if ( config->seq_cutoff_number != 0 && !( config->seq_cutoff_number > seq_total_rows ) )
    {
        seq_scan_limit = config->seq_cutoff_number;
    }

    const uint64_t seq_rows_to_scan = ( seq_scan_limit < seq_total_rows ) ? seq_scan_limit : seq_total_rows;
    for ( uint64_t n = 0; n < seq_rows_to_scan; ++n )
    {
        const int64_t seq_row = n + seq_first_id;

        // SEQ:PRIMARY_ALIGNMENT_ID - one entry per read
        const int64_t * pa_ids;
        uint32_t pa_ids_count;
        rc = VCursorCellDataDirect ( seq_cursor, seq_row, seq_pa_id_col, NULL, reinterpret_cast<const void**>( &pa_ids ), NULL, &pa_ids_count );
        if ( rc != 0 || pa_ids == NULL )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, PRIMARY_ALIGNMENT_ID column", seq_row, rc);
        }

        // SEQ:READ_LEN - must have as many entries as PRIMARY_ALIGNMENT_ID
        const uint32_t * read_lens;
        uint32_t read_lens_count;
        rc = VCursorCellDataDirect ( seq_cursor, seq_row, seq_read_len_col, NULL, reinterpret_cast<const void**>( &read_lens ), NULL, &read_lens_count );
        if ( rc != 0 || read_lens == NULL )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", seq_row, rc);
        }
        if ( pa_ids_count != read_lens_count )
        {
            std::stringstream msg;
            msg << "SEQUENCE:" << seq_row << " PRIMARY_ALIGNMENT_ID length (" << pa_ids_count << ") does not match SEQUENCE:" << seq_row << " READ_LEN length (" << read_lens_count << ")";

            throw DATA_ERROR(msg.str());
        }

        // total length of the reads that have no primary alignment
        uint64_t unaligned_total = 0;
        for ( uint32_t k = 0; k < pa_ids_count; ++k )
        {
            if ( pa_ids[k] == 0 )
                unaligned_total += read_lens[k];
        }

        // SEQ:CMP_READ - its length must equal that total
        const void * cmp_read_cell = NULL;
        uint32_t cmp_read_len;
        rc = VCursorCellDataDirect ( seq_cursor, seq_row, seq_cmp_read_col, NULL, &cmp_read_cell, NULL, &cmp_read_len );
        if ( rc != 0 || cmp_read_cell == NULL )
        {
            throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, SEQ:CMP_READ column", seq_row, rc);
        }

        if ( unaligned_total != cmp_read_len )
        {
            std::stringstream msg;
            msg << "SEQUENCE:" << seq_row << " CMP_READ length (" << cmp_read_len << ") does not match sum of unaligned READ_LEN values (" << unaligned_total << ")";

            throw DATA_ERROR(msg.str());
        }
    }

    // Everything examined passed; say so, mentioning the cutoffs if any applied.
    const bool scanned_all_rows = !( sa_scan_limit < sa_total_rows ) && !( seq_scan_limit < seq_total_rows );
    if ( scanned_all_rows )
    {
        PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good", "ACC=%s", accession));
    }
    else
    {
        PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good (based on first $(SA_CUTOFF) of SECONDARY_ALIGNMENT and $(SEQ_CUTOFF) SEQUENCE rows)", "ACC=%s,SA_CUTOFF=%lu,SEQ_CUTOFF=%lu", accession, sa_scan_limit, seq_scan_limit));
    }
}