static rc_t vdb_fasta_loop_with_name( const p_dump_context ctx, const fastq_ctx * fctx ) { rc_t rc = 0; int64_t row_id; vdn_start( ctx->row_generator ); while ( vdn_next( ctx->row_generator, (uint64_t*)&row_id ) && rc == 0 ) { rc = Quitting(); if ( rc == 0 ) { uint32_t elem_bits, boff, row_len, name_len; const char * data; const char * name; rc = VCursorCellDataDirect( fctx->cursor, row_id, fctx->idx_name, &elem_bits, (const void**)&name, &boff, &name_len ); if ( rc != 0 ) vdb_fastq_row_error( "VCursorCellDataDirect( row#$(row_nr), NAME ) failed", rc, row_id ); else { rc = VCursorCellDataDirect( fctx->cursor, row_id, fctx->idx_read, &elem_bits, (const void**)&data, &boff, &row_len ); if ( rc != 0 ) vdb_fastq_row_error( "VCursorCellDataDirect( row#$(row_nr), READ ) failed", rc, row_id ); else { uint32_t idx = 0; int32_t to_print = row_len; rc = KOutMsg( ">%s.%li %.*s length=%u\n", fctx->run_name, row_id, name_len, name, row_len ); if ( to_print > ctx->max_line_len ) to_print = ctx->max_line_len; while ( rc == 0 && to_print > 0 ) { rc = KOutMsg( "%.*s\n", to_print, &data[ idx ] ); if ( rc == 0 ) { idx += ctx->max_line_len; to_print = ( row_len - idx ); if ( to_print > ctx->max_line_len ) to_print = ctx->max_line_len; } } } } } } return rc; }
/*
 * Verifies the name -> row-range lookup for one test case.
 *
 * For every row in the id range of the name column the name is read, passed
 * to the cursor as parameter "QUERY_NAME", and the row range read back from
 * the name-range column is compared with the range that
 * test_case.key_ranges holds for that name.
 *
 * Returns 0 if every row matches, the rc of the first failing VDB call, or 1
 * if a name is unknown to the test case or its range differs.
 */
rc_t runChecks(const TestCase& test_case, const VCursor * cursor, uint32_t name_idx, uint32_t name_range_idx)
{
    int64_t first_id;
    uint64_t count_id;

    rc_t rc = VCursorIdRange( cursor, name_idx, &first_id, &count_id );
    if (rc != 0)
    {
        LOGERR( klogInt, rc, "VCursorIdRange() failed" );
        return rc;
    }

    uint64_t row = first_id;
    while (row < first_id + count_id)
    {
        const char * key = NULL;
        uint32_t key_len;
        RowRange *actual;

        rc = VCursorCellDataDirect( cursor, row, name_idx, NULL, (void const **)&key, NULL, &key_len );
        if ( rc != 0 )
            return rc;

        /* the name just read becomes the query for the range column */
        rc = VCursorParamsSet( ( struct VCursorParams const * )cursor, "QUERY_NAME", "%.*s", key_len, key );
        if ( rc != 0 )
            return rc;

        rc = VCursorCellDataDirect( cursor, row, name_range_idx, NULL, (void const **)&actual, NULL, NULL );
        if ( rc != 0 )
            return rc;

        std::string name_str(key, key_len);
        if (test_case.key_ranges.find(name_str) == test_case.key_ranges.end())
        {
            PLOGMSG( klogInt, (klogErr, "Unexpected name '$(NAME)' in test case '$(TC_NAME)'", "TC_NAME=%s,NAME=%s", test_case_name, name_str.c_str()) );
            return 1;
        }

        RowRange expected = test_case.key_ranges.find(name_str)->second;
        bool const same_range = row_range_equal:
            false;
        ++row;
    }
    return rc;
}
rc_t read_INSDC_dna_text_ptr( int64_t row_id, const VCursor * cursor, uint32_t idx, const INSDC_dna_text **res, uint32_t *len, const char * hint ) { rc_t rc; if ( idx == INVALID_COLUMN ) { rc = RC( rcExe, rcNoTarg, rcReading, rcItem, rcInvalid ); (void)PLOGERR( klogInt, ( klogInt, rc, "column idx invalid at row#$(tr) . $(hi) ) INSDC_dna_text (ptr)", "tr=%li,hi=%s", row_id, hint ) ); } else { const INSDC_dna_text * value; uint32_t elem_bits, boff, row_len; rc = VCursorCellDataDirect( cursor, row_id, idx, &elem_bits, (const void**)&value, &boff, &row_len ); if ( rc != 0 ) { (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorCellDataDirect( row#$(tr) . idx#$(ti) . $(hi) ) INSDC_dna_text (ptr) failed", "tr=%li,ti=%u,hi=%s", row_id, idx, hint ) ); } else { if ( row_len > 0 ) *res = value; if ( len != NULL ) *len = row_len; } } return rc; }
rc_t read_uint8( int64_t row_id, const VCursor * cursor, uint32_t idx, uint8_t *res, uint8_t dflt, const char * hint ) { rc_t rc; if ( idx == INVALID_COLUMN ) { rc = RC( rcExe, rcNoTarg, rcReading, rcItem, rcInvalid ); (void)PLOGERR( klogInt, ( klogInt, rc, "column idx invalid at row#$(tr) . $(hi) ) uint8", "tr=%li,hi=%s", row_id, hint ) ); } else { const uint8_t * value; uint32_t elem_bits, boff, row_len; rc = VCursorCellDataDirect( cursor, row_id, idx, &elem_bits, (const void**)&value, &boff, &row_len ); if ( rc != 0 ) { (void)PLOGERR( klogInt, ( klogInt, rc, "VCursorCellDataDirect( row#$(tr) . idx#$(ti) . $(hi) ) char_ptr failed", "tr=%li,ti=%u,hi=%s", row_id, idx, hint ) ); } else { *res = ( row_len > 0 ) ? *value : dflt; } } return rc; }
/*
 * get_platform
 *  Returns the textual platform name of a table by reading the first element
 *  of column "PLATFORM" in row 1 through a temporary read cursor and
 *  translating it with vdcd_get_platform_txt().
 *
 *  Returns PT_NONE if the cursor cannot be created or opened, the column
 *  cannot be added, the cell cannot be read, or the cell is empty.
 */
static const char * get_platform( const VTable * tab )
{
    const char * res = PT_NONE;
    const VCursor * cur;
    rc_t rc = VTableCreateCursorRead( tab, &cur );
    if ( rc == 0 )
    {
        uint32_t idx;
        rc = VCursorAddColumn( cur, &idx, "PLATFORM" );
        if ( rc == 0 )
        {
            rc = VCursorOpen( cur );
            if ( rc == 0 )
            {
                const uint8_t * pf = NULL;
                uint32_t row_len = 0;
                rc = VCursorCellDataDirect( cur, 1, idx, NULL, (const void**)&pf, NULL, &row_len );
                /* only dereference the cell if it really holds an element */
                if ( rc == 0 && pf != NULL && row_len > 0 )
                {
                    res = vdcd_get_platform_txt( *pf );
                }
            }
        }
        VCursorRelease( cur );
    }
    return res;
}
/*
 * get_idx_and_read
 *  Looks up the index of column 'name' in the cursor and fetches the cell of
 *  row 'row_id' from it.
 *
 *  base [ OUT, NULL OKAY ] - receives the pointer to the cell data
 *  len  [ OUT, NULL OKAY ] - receives the number of elements in the cell
 *
 *  Returns 0 on success, otherwise the ( logged ) rc of
 *  VCursorGetColumnIdx() or VCursorCellDataDirect().
 */
static rc_t get_idx_and_read( struct VCursor const *curs, const char * name, int64_t row_id, const void ** base, uint32_t * len )
{
    uint32_t col;
    uint32_t bits, bit_ofs, cell_len;
    const void * cell;
    rc_t rc;

    rc = VCursorGetColumnIdx ( curs, &col, name );
    if ( rc != 0 )
    {
        LOGERR( klogInt, rc, "VCursorGetColumnIdx() failed" );
        return rc;
    }

    rc = VCursorCellDataDirect ( curs, row_id, col, &bits, &cell, &bit_ofs, &cell_len );
    if ( rc != 0 )
    {
        LOGERR( klogInt, rc, "VCursorCellDataDirect() failed" );
        return rc;
    }

    assert( bit_ofs == 0 );
    if ( len != NULL )
        *len = cell_len;
    if ( base != NULL )
        *base = cell;
    return rc;
}
/*
 * get_string_cell
 *  Formats the text cell of 'column' in row 'row' into 'buffer' via
 *  string_printf(), using a temporary read cursor on 'tab'.
 *
 *  Nothing is written to 'buffer' if the table has no such column or if any
 *  of the cursor calls fails; no error is reported to the caller.
 */
static void get_string_cell( char * buffer, size_t buffer_size, const VTable * tab, int64_t row, const char * column )
{
    const VCursor * cur;
    uint32_t col_idx;
    rc_t rc;

    if ( !has_col( tab, column ) )
        return;

    rc = VTableCreateCursorRead( tab, &cur );
    if ( rc != 0 )
        return;

    rc = VCursorAddColumn( cur, &col_idx, column );
    if ( rc == 0 )
        rc = VCursorOpen( cur );
    if ( rc == 0 )
    {
        const char * text;
        uint32_t text_len;
        rc = VCursorCellDataDirect( cur, row, col_idx, NULL, (const void**)&text, NULL, &text_len );
        if ( rc == 0 )
        {
            size_t written;
            string_printf( buffer, buffer_size, &written, "%.*s", text_len, text );
        }
    }
    VCursorRelease( cur );
}
static rc_t cigar_loop( const VCursor *cur, uint32_t cigar_idx, int64_t first, uint64_t count, uint32_t min_len ) { rc_t rc = 0; int64_t row_id, last_row = ( first + count ); rna_splice_candidates candidates; for ( row_id = first; ( row_id < last_row ) && ( rc == 0 ) && ( Quitting() == 0 ); row_id++ ) { const char * cigar; uint32_t row_len; rc = VCursorCellDataDirect ( cur, row_id, cigar_idx, NULL, ( const void ** )&cigar, NULL, &row_len ); if ( rc == 0 ) { candidates.count = 0; candidates.fwd_matched = 0; candidates.rev_matched = 0; rc = discover_rna_splicing_candidates( row_len, cigar, min_len, &candidates ); if ( rc == 0 && candidates.count > 0 ) { rc = KOutMsg( "%d rna-splice-candidates at row #%ld : %.*s\n", candidates.count, row_id, row_len, cigar ); } } } return rc; }
static rc_t vdb_fastq_loop_with_name( const p_dump_context ctx, const fastq_ctx * fctx ) { rc_t rc = 0; int64_t row_id; vdn_start( ctx->row_generator ); while ( vdn_next( ctx->row_generator, (uint64_t*)&row_id ) && rc == 0 ) { rc = Quitting(); if ( rc == 0 ) { uint32_t elem_bits, boff, row_len, name_len; const char * data; const char * name; rc = VCursorCellDataDirect( fctx->cursor, row_id, fctx->idx_name, &elem_bits, (const void**)&name, &boff, &name_len ); if ( rc != 0 ) vdb_fastq_row_error( "VCursorCellDataDirect( row#$(row_nr), NAME ) failed", rc, row_id ); else { rc = VCursorCellDataDirect( fctx->cursor, row_id, fctx->idx_read, &elem_bits, (const void**)&data, &boff, &row_len ); if ( rc != 0 ) vdb_fastq_row_error( "VCursorCellDataDirect( row#$(row_nr), READ ) failed", rc, row_id ); else { rc = KOutMsg( "@%s.%li %.*s length=%u\n%.*s\n", fctx->run_name, row_id, name_len, name, row_len, row_len, data ); if ( rc == 0 ) { rc = VCursorCellDataDirect( fctx->cursor, row_id, fctx->idx_qual, &elem_bits, (const void**)&data, &boff, &row_len ); if ( rc != 0 ) vdb_fastq_row_error( "VCursorCellDataDirect( row#$(row_nr), QUALITY ) failed", rc, row_id ); else rc = KOutMsg( "+%s.%li %.*s length=%u\n%.*s\n", fctx->run_name, row_id, name_len, name, row_len, row_len, data ); } } } } } return rc; }
/*
 * RefPosMake
 *  Allocates a RefPos object and prepares its cursor on the reference table.
 *
 *  The cursor comes from AlignRefTableCursor(). MAX_SEQ_LEN is read from
 *  row 1 into obj->max_seq_len. If that fails with "column not found" the
 *  cursor is released and the object is handed out without a cursor and with
 *  max_seq_len == 0; otherwise the NAME and NAME_RANGE columns are added.
 *
 *  objp [ OUT ] - receives the new object on success; untouched on failure
 *
 *  Returns 0 on success, otherwise the rc of the failing step. On failure the
 *  cursor is released and the object is freed.
 */
static rc_t RefPosMake ( RefPos **objp, const VTable *tbl, const VCursor *native_curs )
{
    rc_t rc;
    RefPos *self = malloc ( sizeof * self );

    if ( self == NULL )
        return RC ( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted );

    self->curs = NULL;
    BSTreeInit( &self->tr_range );

    /* open the reference table cursor */
    rc = AlignRefTableCursor( tbl, native_curs, &self->curs, NULL );
    if ( rc == 0 )
    {
        uint32_t max_len_idx;

        rc = VCursorAddColumn( self->curs, &max_len_idx, "(U32)MAX_SEQ_LEN" );
        if ( rc == 0 || GetRCState( rc ) == rcExists )
        {
            const void *cell;
            uint32_t cell_len;

            rc = VCursorCellDataDirect( self->curs, 1, max_len_idx, NULL, &cell, NULL, &cell_len );
            if ( rc == 0 )
            {
                assert( cell_len == 1 );
                memcpy( &self->max_seq_len, cell, 4 );
            }
        }

        if ( GetRCObject( rc ) == rcColumn && GetRCState( rc ) == rcNotFound )
        {
            /*** no MAX_SEQ_LEN means that REF_POS==REF_START **/
            VCursorRelease( self->curs );
            self->curs = NULL;
            self->max_seq_len = 0;
            self->name_range_idx = 0;
            self->name_idx = 0;
            rc = 0;
        }
        else if ( rc == 0 )
        {
            /* add the lookup columns; "already exists" is not an error */
            rc = VCursorAddColumn( self->curs, &self->name_idx, "(utf8)NAME" );
            if ( rc == 0 || GetRCState( rc ) == rcExists )
                rc = VCursorAddColumn( self->curs, &self->name_range_idx, "NAME_RANGE" );
            if ( GetRCState( rc ) == rcExists )
                rc = 0;
        }
    }

    if ( rc != 0 )
    {
        VCursorRelease( self->curs );
        free( self );
        return rc;
    }

    *objp = self;
    return rc;
}
/*
 * read_cell
 *  Fetches the cell of column->idx in row 'row_id' and stores element size,
 *  data pointer, bit offset and element count in the members elem_bits, base,
 *  bit_offset and row_len of 'column'.
 *
 *  name [ IN ] - column name, only used in the log message
 *
 *  Returns the rc of VCursorCellDataDirect(); a failure is logged.
 */
rc_t read_cell( const VCursor *my_cursor, int64_t row_id, col *column, const char * name )
{
    rc_t rc;

    rc = VCursorCellDataDirect ( my_cursor,
                                 row_id,
                                 column->idx,
                                 &column->elem_bits,
                                 &column->base,
                                 &column->bit_offset,
                                 &column->row_len );
    if ( rc == 0 )
        return rc;

    PLOGERR( klogInt, ( klogInt, rc, "VCursorCellDataDirect($(name),$(rowid)) failed", "name=%s,rowid=%lu", name, row_id ) );
    return rc;
}
/*
 * report_ref_row
 *  Reads the element counts of the two alignment-id columns ( row_ctx->prim_idx
 *  and row_ctx->sec_idx ) of row row_ctx->row_id and prints one line with both
 *  counts if at least one of them is not zero.
 *
 *  Returns 0 on success, otherwise the ( logged ) rc of the failing
 *  VCursorCellDataDirect() call or the rc of KOutMsg().
 */
static rc_t report_ref_row( const VCursor *cur, report_row_ctx * row_ctx )
{
    uint32_t bits, bit_ofs, n_prim, n_sec;
    const void *cell;
    rc_t rc;

    rc = VCursorCellDataDirect ( cur, row_ctx->row_id, row_ctx->prim_idx, &bits, &cell, &bit_ofs, &n_prim );
    if ( rc != 0 )
    {
        (void)LOGERR( klogErr, rc, "cannot read colum >PRIMARY_ALIGNMENT_IDS<" );
        return rc;
    }

    rc = VCursorCellDataDirect ( cur, row_ctx->row_id, row_ctx->sec_idx, &bits, &cell, &bit_ofs, &n_sec );
    if ( rc != 0 )
    {
        (void)LOGERR( klogErr, rc, "cannot read colum >SECONDARY_ALIGNMENT_IDS<" );
        return rc;
    }

    /* rows without any alignment are not reported */
    if ( n_prim > 0 || n_sec > 0 )
        rc = KOutMsg( "ROW[ %,lu ]: PRIM:%,u SEC:%,u\n", row_ctx->row_id, n_prim, n_sec );

    return rc;
}
/*
 * LocalRefIDMake
 *  Allocates a LocalRefID object and fills its max_seq_len member from
 *  row 1 of column MAX_SEQ_LEN of the reference table. The cursor obtained
 *  from AlignRefTableCursor() is only needed for this read and is released
 *  before returning. A missing MAX_SEQ_LEN column is not an error; it
 *  results in max_seq_len == 0.
 *
 *  objp [ OUT ] - receives the new object on success; untouched on failure
 *
 *  Returns 0 on success, otherwise the rc of the failing step ( the object
 *  is freed in that case ).
 */
static rc_t LocalRefIDMake ( LocalRefID **objp, const VTable *tbl, const VCursor *native_curs)
{
    rc_t rc;
    const VCursor *ref_curs = NULL;
    uint32_t max_len_idx;
    LocalRefID *self = malloc ( sizeof * self );

    if ( self == NULL )
        return RC( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted );

    /* open the reference table cursor */
    rc = AlignRefTableCursor( tbl, native_curs, &ref_curs, NULL );
    if ( rc != 0 )
    {
        free ( self );
        return rc;
    }

    rc = VCursorAddColumn( ref_curs, &max_len_idx, "(U32)MAX_SEQ_LEN" );
    if ( rc == 0 || GetRCState( rc ) == rcExists )
    {
        const void *cell;
        uint32_t cell_len;

        rc = VCursorCellDataDirect( ref_curs, 1, max_len_idx, NULL, &cell, NULL, &cell_len );
        if ( rc == 0 )
        {
            assert( cell_len == 1 );
            memmove( &self->max_seq_len, cell, 4 );
        }
    }

    /* a table without MAX_SEQ_LEN is accepted */
    if ( ( GetRCObject( rc ) == ( enum RCObject )rcColumn ) && ( GetRCState( rc ) == rcNotFound ) )
    {
        self->max_seq_len = 0;
        rc = 0;
    }

    VCursorRelease( ref_curs );

    if ( rc != 0 )
    {
        free ( self );
        return rc;
    }

    *objp = self;
    return 0;
}
/*
 * function ascii NCBI:align:ref_seq_id ( I64 ref_id );
 *
 *  Row function: takes the first element of argv[ REF_ID ] as a row id in the
 *  cursor held by 'data' ( a RefSeqID ) and copies the cell of column
 *  seqID_idx of that row into the result buffer.
 *
 *  An empty result is produced if the object has no cursor or the input cell
 *  is empty; a "row not found" from the cursor yields an empty string.
 *  Any other failure is returned as rc.
 */
static rc_t CC align_ref_seq_id ( void *data, const VXformInfo *info, int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv[] )
{
    RefSeqID const *self = ( void const * )data;
    int64_t const *id_cell = argv[ REF_ID ].u.data.base;
    char const *name = NULL;
    uint32_t name_len;
    rc_t rc;

    assert( argv[ REF_ID ].u.data.elem_bits == sizeof( *id_cell ) * 8 );

    if ( self->curs == NULL || argv[ REF_ID ].u.data.elem_count == 0 )
    {
        rslt->elem_count = 0;
        return 0;
    }

    id_cell += argv[ REF_ID ].u.data.first_elem;

    SUB_DEBUG( ( "SUB.Rd in 'align_ref_seq_id.c' at #%lu\n", id_cell[ 0 ] ) );

    rc = VCursorCellDataDirect( self->curs, id_cell[ 0 ], self->seqID_idx, NULL, (void const **)&name, NULL, &name_len );
    if ( GetRCState( rc ) == rcNotFound && GetRCObject( rc ) == rcRow )
    {
        /* unknown reference row: answer with an empty string */
        name = "";
        name_len = 0;
    }
    else if ( rc != 0 )
    {
        return rc;
    }

    rc = KDataBufferCast( rslt->data, rslt->data, sizeof( name[ 0 ] ) * 8, true );
    if ( rc != 0 )
        return rc;

    rc = KDataBufferResize( rslt->data, name_len );
    if ( rc != 0 )
        return rc;

    memmove( rslt->data->base, name, sizeof( name[ 0 ] ) * name_len );
    rslt->elem_count = name_len;
    rslt->elem_bits = sizeof( name[ 0 ] ) * 8;
    return rc;
}
/* Read - PRIVATE
 *  column message sent via table
 *
 *  Opens the table's cursor on first use, then fetches the cell of column
 *  'idx' in spot 'id'.
 *
 *  base   [ OUT ] - receives the pointer to the cell data
 *  offset [ OUT ] - receives the bit offset of the cell ( elem_off * elem_bits )
 *  size   [ OUT ] - receives the size of the cell in bits ( elem_cnt * elem_bits )
 *
 *  Returns 0 on success. On any failure the non-NULL output parameters are
 *  reset to NULL / 0 and the rc is returned: rcNull if a parameter or 'self'
 *  is NULL, otherwise the rc of VCursorOpen() or VCursorCellDataDirect().
 */
rc_t SRATableRead ( const SRATable *self, spotid_t id, uint32_t idx, const void **base, bitsz_t *offset, bitsz_t *size )
{
    rc_t rc;

    if ( base == NULL || offset == NULL || size == NULL )
        rc = RC ( rcSRA, rcColumn, rcReading, rcParam, rcNull );
    else if ( self == NULL )
        rc = RC ( rcSRA, rcTable, rcAccessing, rcSelf, rcNull );
    else
    {
        rc = 0;

        /* open cursor */
        if ( ! self -> curs_open )
        {
            rc = VCursorOpen(self->curs);
            if ( rc == 0 )
                ((SRATable *)self)->curs_open = true;
        }

        if ( rc == 0 )
        {
            uint32_t elem_bits, elem_off, elem_cnt;
            rc = VCursorCellDataDirect ( self -> curs, id, idx, & elem_bits, base, & elem_off, & elem_cnt );
            if ( rc == 0 )
            {
                /* widen before multiplying: the product of two 32 bit values
                   can exceed 32 bits for large cells */
                * offset = ( bitsz_t ) elem_off * elem_bits;
                * size   = ( bitsz_t ) elem_cnt * elem_bits;
                return 0;
            }
            else if( UIError(rc, NULL, self->vtbl) )
            {
                UITableLOGError(rc, self->vtbl, true);
            }
        }
    }

    if ( base != NULL )
        * base = NULL;
    if ( offset != NULL )
        * offset = 0;
    if ( size != NULL )
        * size = 0;

    return rc;
}
/*
 * fetch_all_rows
 *  Touches every cell of the given columns: determines the union of the row
 *  id ranges of all 'ncol' columns and calls VCursorCellDataDirect() for each
 *  column on each row of that range. The cell contents are discarded.
 *
 *  cid [ IN ] - array of 'ncol' column indexes
 *
 *  Returns 0 on success ( also when ncol is 0 ), otherwise the rc of the
 *  first failing VCursorIdRange() or VCursorCellDataDirect() call.
 */
static rc_t fetch_all_rows(const VCursor *curs, unsigned ncol, const uint32_t cid[/* ncol */])
{
    /* an empty column list must yield an empty row range, not garbage */
    int64_t start = 0;
    int64_t stop = 0;
    int64_t row;
    unsigned i;
    rc_t rc;

    for (i = 0; i != ncol; ++i) {
        int64_t cstart;
        uint64_t ccount;
        int64_t cstop;

        rc = VCursorIdRange(curs, cid[i], &cstart, &ccount);
        if (rc) return rc;

        /* keep the end of the range signed so that the comparison below is
           correct for negative row ids as well */
        cstop = (int64_t)(cstart + ccount);

        if (i == 0) {
            start = cstart;
            stop = cstop;
        }
        else {
            if (start > cstart)
                start = cstart;
            if (stop < cstop)
                stop = cstop;
        }
    }

    for (row = start; row != stop; ++row) {
        for (i = 0; i != ncol; ++i) {
            uint32_t elem_bits;
            const void *base;
            uint32_t offset;
            uint32_t length;

            rc = VCursorCellDataDirect(curs, row, cid[i], &elem_bits, &base, &offset, &length);
            if (rc) return rc;
        }
    }
    return 0;
}
/*
 * rr_store_alignment
 *  Reads the cell of column 'read_idx' in row 'align_id', wraps it into a new
 *  rr_entry and stores that entry in the vector rr->v under the key align_id.
 *
 *  Returns true if the entry was stored. Returns false if the cell could not
 *  be read, the entry could not be made, or the vector rejected it ( in which
 *  case the entry is released again ).
 */
static bool rr_store_alignment( rr_store * rr, int64_t align_id, const VCursor * curs, uint32_t read_idx )
{
    const INSDC_4na_bin * bases = NULL;
    uint32_t n_bases;
    rr_entry * entry;
    uint64_t key;

    if ( VCursorCellDataDirect( curs, align_id, read_idx, NULL, ( const void** ) &bases, NULL, &n_bases ) != 0 )
        return false;

    if ( !rr_entry_make ( &entry, bases, n_bases ) )
        return false;

    key = ( uint64_t ) align_id;
    if ( KVectorSetPtr ( rr -> v, key, entry ) != 0 )
    {
        /* the store does not own the entry, so it has to be released here */
        rr_entry_release( key, entry, NULL );
        return false;
    }
    return true;
}
/*
 * read_base_and_len
 *  Fetches the cell of column 'column_idx' in row 'row_id'.
 *
 *  base [ OUT, NULL OKAY ] - receives the pointer to the cell data
 *  len  [ OUT, NULL OKAY ] - receives the number of elements in the cell
 *
 *  Returns the rc of VCursorCellDataDirect(); a failure is logged and leaves
 *  the output parameters untouched.
 */
static rc_t read_base_and_len( struct VCursor const *curs, uint32_t column_idx, int64_t row_id, const void ** base, uint32_t * len )
{
    uint32_t bits, bit_ofs, cell_len;
    const void * cell;
    rc_t rc;

    rc = VCursorCellDataDirect ( curs, row_id, column_idx, &bits, &cell, &bit_ofs, &cell_len );
    if ( rc != 0 )
    {
        LOGERR( klogInt, rc, "VCursorCellDataDirect() failed" );
        return rc;
    }

    if ( len != NULL )
        *len = cell_len;
    if ( base != NULL )
        *base = cell;
    return rc;
}
/*
 * cg_dump_row
 *  Handles one row of the sequence cursor: reads its spot-group, looks up the
 *  lane registered for that spot-group in cg_ctx->lanes ( creating and
 *  inserting a new lane if there is none yet ) and passes the row to
 *  cg_dump_write_spot() together with that lane.
 *
 *  Returns 0 on success, otherwise the ( logged ) rc of reading the
 *  spot-group, or the rc of make_lane() / BSTreeInsert(). A lane that cannot
 *  be inserted is destroyed again.
 */
static rc_t cg_dump_row( cg_dump_opts * opts, cg_dump_ctx * cg_ctx, uint64_t row_id )
{
    uint32_t bits, bit_ofs, sg_len;
    const char * sg;
    String spot_group;
    lane * sg_lane;
    rc_t rc;

    rc = VCursorCellDataDirect( cg_ctx->seq_cur, row_id, cg_ctx->seq_sg_idx, &bits, (const void**)&sg, &bit_ofs, &sg_len );
    if ( rc != 0 )
    {
        (void)PLOGERR( klogErr, ( klogErr, rc, "cannot read spot-group in row #$(row_id)", "row_id=%lu", row_id ) );
        return rc;
    }

    StringInit( &spot_group, sg, sg_len, sg_len );
    sg_lane = ( lane * )BSTreeFind ( &cg_ctx->lanes, &spot_group, String_lane_cmp );
    if ( sg_lane == NULL )
    {
        /* first row of this spot-group: create its lane and register it */
        rc = make_lane( opts, cg_ctx->lookup, cg_ctx->out_dir, &spot_group, &sg_lane );
        if ( rc != 0 )
            return rc;

        rc = BSTreeInsert ( &cg_ctx->lanes, ( BSTNode * )sg_lane, lane_lane_cmp );
        if ( rc != 0 )
        {
            (void)LOGERR( klogErr, rc, "cannot insert new lane" );
            whack_lane( sg_lane );
            return rc;
        }
    }

    cg_dump_write_spot( opts, cg_ctx, row_id, sg_lane );
    return rc;
}
/*
 * function ascii NCBI:align:ref_name ( I64 ref_id );
 *
 *  Row function: takes the first element of argv[ REF_ID ] as a row id in the
 *  cursor held by 'data' ( a RefName ) and copies the cell of column name_idx
 *  of that row into the result buffer.
 *
 *  An empty input cell is treated like a "row not found" from the cursor;
 *  both yield an empty string. Any other failure is returned as rc.
 */
static rc_t CC align_ref_name ( void *data, const VXformInfo *info, int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv[] )
{
    RefName const *self = (void const *)data;
    int64_t const *id_cell = argv[REF_ID].u.data.base;
    char const *name = NULL;
    uint32_t name_len;
    rc_t rc;

    if (argv[REF_ID].u.data.elem_count != 0) {
        assert(argv[REF_ID].u.data.elem_bits == sizeof(*id_cell) * 8);
        id_cell += argv[REF_ID].u.data.first_elem;
        rc = VCursorCellDataDirect(self->curs, id_cell[0], self->name_idx, NULL, (void const **)&name, NULL, &name_len);
    }
    else {
        /* no id given: handled below exactly like a missing row */
        rc = RC(rcAlign, rcFunction, rcExecuting, rcRow, rcNotFound);
    }

    if (GetRCState(rc) == rcNotFound && GetRCObject(rc) == rcRow) {
        name = "";
        name_len = 0;
    }
    else if (rc) {
        return rc;
    }

    rc = KDataBufferCast(rslt->data, rslt->data, sizeof(name[0]) * 8, true);
    if (rc)
        return rc;

    rc = KDataBufferResize(rslt->data, name_len);
    if (rc)
        return rc;

    memcpy(rslt->data->base, name, sizeof(name[0]) * name_len);
    rslt->elem_count = name_len;
    rslt->elem_bits = sizeof(name[0]) * 8;
    return rc;
}
/*
 * read_from_ref_node
 *  Copies 'ref_len' bytes, starting at position 'ref_offset', from the hits
 *  column of node->cur into 'exclude_vector'. The data is spread over rows of
 *  node->read_len positions each ( row ids start at 1 ), so the copy walks
 *  from row to row until enough bytes are collected. node->bytes_requested is
 *  advanced by the number of bytes copied.
 *
 *  active [ OUT ] - number of non-zero bytes in exclude_vector[ 0 .. ref_len ),
 *                   0 if an error occurred
 *
 *  Returns 0 on success, otherwise the ( logged ) rc of
 *  VCursorCellDataDirect() or an rcInvalid rc if the start offset lies behind
 *  the end of a row.
 */
static rc_t read_from_ref_node( ref_node * node, int32_t ref_offset, uint32_t ref_len, uint8_t *exclude_vector, uint32_t *active )
{
    rc_t rc = 0;
    uint64_t row_id = ( ref_offset / node->read_len ) + 1;
    uint32_t ofs_in_row = ref_offset % node->read_len;
    uint8_t *out = exclude_vector;
    uint32_t todo = ref_len;
    uint32_t i;

    while ( todo > 0 && rc == 0 )
    {
        uint32_t bits, bit_ofs, cell_len, chunk;
        const uint8_t *cell;

        rc = VCursorCellDataDirect ( node->cur, row_id, node->hits_idx, &bits, (const void**)&cell, &bit_ofs, &cell_len );
        if ( rc != 0 )
        {
            PLOGERR( klogInt, ( klogInt, rc, "error to read $(col_name) from 1st row in table $(db_name).$(tab_name)", "col_name=%s,db_name=%S,tab_name=%s", HITS_COLUMN, node->name, HITMAP_TAB ) );
            continue;   /* rc != 0 ends the loop */
        }

        if ( ofs_in_row >= cell_len )
        {
            rc = RC( rcApp, rcNoTarg, rcReading, rcParam, rcInvalid );
            PLOGERR( klogInt, ( klogInt, rc, "error: try to read more data than are in var-loc $(tab_name)", "tab_name=%S", node->name ) );
            continue;   /* rc != 0 ends the loop */
        }

        /* take the rest of this row, but not more than is still missing */
        chunk = ( cell_len - ofs_in_row );
        if ( chunk > todo )
            chunk = todo;

        memmove( out, cell + ofs_in_row, chunk );
        out += chunk;
        todo -= chunk;
        node->bytes_requested += chunk;

        /* every following row is read from its beginning */
        ofs_in_row = 0;
        row_id ++;
    }

    *active = 0;
    if ( rc == 0 )
    {
        for ( i = 0; i < ref_len; ++i )
        {
            if ( exclude_vector[ i ] > 0 )
                ( *active )++;
        }
    }
    return rc;
}
bool nextPileup ( PileupIteratorState* pileup_state, VCursor const* cursor_ref, VCursor const* cursor_pa, char const* const* column_names_ref, uint32_t* column_index_ref, size_t column_count_ref, char const* const* column_names_pa, uint32_t* column_index_pa, size_t column_count_pa, char* error_buf, size_t error_buf_size ) { int64_t ref_row_id; /* current row_id */ int64_t prev_ref_row_id; uint64_t ref_pos = pileup_state->ref_pos; rc_t rc; /* TODO: check the case when slice_end is beyond the reference end*/ if ( pileup_state->slice_length && pileup_state->ref_pos == pileup_state->slice_start + pileup_state->slice_length ) { error_buf[0] = '\0'; /* indicating that no error has occured */ return false; } /* drop cached alignments that we will not need anymore */ remove_unneeded_alignments ( pileup_state, ref_pos, error_buf, error_buf_size ); /* it's not an issue but this action is not rolled backed in the case of error below */ /* Check if we moved to the next reference row_id, if yes - read it and add appropriate alignments to cache */ prev_ref_row_id = pileup_state->reference_start_id + ref_pos / pileup_state->max_seq_len; ++ ref_pos; ref_row_id = pileup_state->reference_start_id + ref_pos / pileup_state->max_seq_len; if ( ref_row_id != prev_ref_row_id ) /* moved to the next row_id */ { uint32_t dummy; uint32_t row_len; uint32_t seq_start; #if USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS != 1 int64_t const* alignment_ids; #endif char ref_name[ countof (pileup_state->ref_name) ]; /* TODO: consider storing this in pileup_state (don't need to calculate every time)*/ /*slice_start_id = pileup_state->reference_start_id + pileup_state->slice_start/pileup_state->max_seq_len; slice_end_id = pileup_state->slice_length != 0 ? 
pileup_state->reference_start_id + (pileup_state->slice_start + (int64_t)pileup_state->slice_length)/pileup_state->max_seq_len : (int64_t)pileup_state->total_row_count;*/ if ( ref_row_id < pileup_state->slice_start_id || ref_row_id > pileup_state->slice_end_id ) { error_buf[0] = '\0'; /* indicating that no error has occured */ return false; } rc = VCursorReadDirect ( cursor_ref, ref_row_id, column_index_ref [COL_NAME], sizeof (ref_name[0]) * 8, ref_name, countof(ref_name), & row_len ); if ( rc != 0 ) { rc_t res = string_printf ( error_buf, error_buf_size, NULL, "ERROR: VCursorReadDirect(ref) failed with error: 0x%08x (%u) [%R]", rc, rc, rc); if (res == rcBuffer || res == rcInsufficient) error_buf [ error_buf_size - 1 ] = '\0'; return false; } ref_name[ min ( countof(ref_name) - 1, row_len) ] = '\0'; if ( strcmp (ref_name, pileup_state->ref_name) ) { /*Alignment_Init ( & pileup_state->cache_alignment); strncpy ( pileup_state->ref_name, ref_name, countof (pileup_state->ref_name) - 1 ); pileup_state->reference_start_id = ref_row_id;*/ error_buf[0] = '\0'; /* indicating that no error has occured */ return false; } #if USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS == 1 rc = open_blob_for_current_id ( ref_row_id, cursor_ref, & pileup_state->blob_alignment_ids, column_index_ref [COL_PRIMARY_ALIGNMENT_IDS], error_buf, error_buf_size ); if (rc != 0) return false; #endif /* Read new SEQ_START */ rc = VCursorReadDirect ( cursor_ref, ref_row_id, column_index_ref [COL_SEQ_START], sizeof (seq_start) * 8, & seq_start, 1, & row_len ); if ( rc != 0 ) { rc_t res = string_printf ( error_buf, error_buf_size, NULL, "ERROR: VCursorReadDirect(ref-seq_start) failed with error: 0x%08x (%u) [%R]", rc, rc, rc); if (res == rcBuffer || res == rcInsufficient) error_buf [ error_buf_size - 1 ] = '\0'; return false; } pileup_state->current_seq_start = seq_start; /* Read REFERENCE row's PRIMARY_ALIGNMENT_IDS column to iterate through them */ /* elem_bits = sizeof (*pileup_state->alignment_ids) * 8; */ #if 
USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS == 1 rc = VBlobCellData ( pileup_state->blob_alignment_ids, ref_row_id, & dummy, & pileup_state->alignment_ids, NULL, & row_len ); if ( rc != 0 ) { rc_t res = string_printf ( error_buf, error_buf_size, NULL, "ERROR: VBlobCellData(ref-pa_ids) failed with error: 0x%08x (%u) [%R], row_len=%u", rc, rc, rc, row_len); if (res == rcBuffer || res == rcInsufficient) error_buf [ error_buf_size - 1 ] = '\0'; return false; } pileup_state -> size_alignment_ids = row_len; #else rc = VCursorCellDataDirect ( cursor_ref, ref_row_id, column_index_ref [COL_PRIMARY_ALIGNMENT_IDS], NULL, (void const**)(& alignment_ids), 0, & row_len ); /*rc = VCursorReadDirect ( cursor_ref, ref_row_id, column_index_ref [COL_PRIMARY_ALIGNMENT_IDS], sizeof (*pileup_state->alignment_ids) * 8, pileup_state->alignment_ids, countof (pileup_state->alignment_ids), & row_len );*/ if ( rc != 0 ) { rc_t res = string_printf ( error_buf, error_buf_size, NULL, "ERROR: VCursorCellDataDirect(ref-pa_ids) failed with error: 0x%08x (%u) [%R], row_len=%u", rc, rc, rc, row_len); if (res == rcBuffer || res == rcInsufficient) error_buf [ error_buf_size - 1 ] = '\0'; return false; } rc = PileupIteratorState_SetAlignmentIds ( pileup_state, alignment_ids, row_len ); if ( rc != 0 ) { rc_t res = string_printf ( error_buf, error_buf_size, NULL, "ERROR: PileupIteratorState_SetAlignmentIds failed with error: 0x%08x (%u), row_len=%u", rc, rc, row_len); if (res == rcBuffer || res == rcInsufficient) error_buf [ error_buf_size - 1 ] = '\0'; return rc; } #endif pileup_state->next_alignment_idx = 0; /*pileup_state->size_alignment_ids = row_len;*/ printf ("Read %lu PRIMARY_ALIGNMENT_IDS for REFERENCE row_id=%lld\n", row_len, ref_row_id); /* For each PRIMARY_ALIGNMENT_ID in pa_ids: read its start, length and cache it if it intersects the slice */ rc = add_ref_row_to_cache ( pileup_state, cursor_pa, seq_start, ref_pos, pileup_state->alignment_ids, row_len, column_names_pa, column_index_pa, column_count_pa, 
error_buf, error_buf_size ); if ( rc != 0 ) return false; /*pileup_state -> seq_start = seq_start;*/ } else { /* read remaining alignment_ids and check if they must be cached */ size_t count = pileup_state->size_alignment_ids - pileup_state->next_alignment_idx; if (count > 0) { rc_t rc = add_ref_row_to_cache ( pileup_state, cursor_pa, pileup_state->current_seq_start, ref_pos, & pileup_state->alignment_ids[ pileup_state->next_alignment_idx ], (uint32_t)count, column_names_pa, column_index_pa, column_count_pa, error_buf, error_buf_size ); if ( rc != 0 ) return false; } } ++ pileup_state->ref_pos; return true; }
static rc_t build_scaffold_read_impl(self_t const *const self, void *const Dst, unsigned const components, INSDC_coord_one const Start[/* components */], INSDC_coord_len const Length[/* components */], NCBI_WGS_component_props const Props[/* components */], int64_t const join[/* components */]) { INSDC_4na_bin *const dst = Dst; unsigned i; unsigned j; unsigned id; rc_t rc; for (rc = 0, id = j = i = 0; rc == 0 && i != components; ++i) { INSDC_coord_len const length = Length[i]; int const props = Props[i]; if (props < 0) { /* gap */ memset(dst + j, READ_GAP_VALUE, length); } else if (self->curs == NULL) { memset(dst + j, 15, length); } else { int const type = props & 0x0F; int const strand = (props & ~(NCBI_WGS_strand_plus | NCBI_WGS_strand_minus)) >> 4; if (type != 0 || strand == 3) rc = RC(rcXF, rcFunction, rcExecuting, rcType, rcInvalid); else { int64_t const row = join[id++]; uint32_t elem_bits; uint32_t bit_offset; uint32_t elem_count; void const *base; rc = VCursorCellDataDirect(self->curs, row, self->col_idx, &elem_bits, &base, &bit_offset, &elem_count); assert(bit_offset == 0); if (rc == 0) { INSDC_coord_one const start = Start[i] - 1; if (elem_count < start + length) rc = RC(rcXF, rcFunction, rcExecuting, rcData, rcInsufficient); else { INSDC_4na_bin const *const src = base; if (strand == 2) { static INSDC_4na_bin const complement[] = { /* 0 0000 - 0000*/ 0, /* 1 0001 - 1000*/ 8, /* 2 0010 - 0100*/ 4, /* 3 0011 - 1100*/ 12, /* 4 0100 - 0010*/ 2, /* 5 0101 - 1010*/ 10, /* 6 0110 - 0110*/ 6, /* 7 0111 - 1110*/ 14, /* 8 1000 - 0001*/ 1, /* 9 1001 - 1001*/ 9, /*10 1010 - 0101*/ 5, /*11 1011 - 1101*/ 13, /*12 1100 - 0011*/ 3, /*13 1101 - 1011*/ 11, /*14 1110 - 0111*/ 7, /*15 1111 - 1111*/ 15 }; unsigned k; unsigned jj; for (jj = j + length, k = 0; k != length; ++k) { INSDC_4na_bin const elem = src[start + k]; assert(/* 0 <= elem && */ elem <= 15); --jj; dst[jj] = complement[elem]; } } else memcpy(&dst[j], &src[start], length); } } } } j += length; } return rc; }
/*
 * seq_restore_read_impl1
 *  Row function that rebuilds the complete bases of a spot from the locally
 *  stored bases and the bases stored with its alignments.
 *
 *  Inputs, as used by the code below:
 *   argv[ 0 ] ... bases stored locally ( 8 bit elements )
 *   argv[ 1 ] ... one alignment id per read ( 64 bit elements )
 *   argv[ 2 ] ... one length per read
 *   argv[ 3 ] ... one type per read ( forward / reverse flags )
 *
 *  For every read: an alignment id > 0 means the bases are fetched from row
 *  align_id of column self->read_idx in self->curs ( copied as they are for a
 *  forward read, reversed and translated through 'map' for a reverse read );
 *  otherwise the next read_len[ i ] bases are consumed from argv[ 0 ].
 *
 *  After more than 100 consecutively requested rows a prefetch of the
 *  alignment rows is issued via VCursorDataPrefetch().
 *
 *  Returns 0 on success, the rc of KDataBufferResize() or
 *  VCursorCellDataDirect(), or rcData / rcInconsistent if lengths or read
 *  types do not fit.
 */
static rc_t CC seq_restore_read_impl1 ( void *data, const VXformInfo *info, int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv [] )
{
    rc_t rc;
    int i;
    Read_Restorer *self = data;
    INSDC_4na_bin *dst;
    INSDC_coord_len len;
    uint32_t src_len = (uint32_t)argv[ 0 ].u.data.elem_count;
    const INSDC_4na_bin *src = argv[ 0 ].u.data.base;
    const uint32_t num_reads = (uint32_t)argv[ 1 ].u.data.elem_count;
    const int64_t *align_id = argv[ 1 ].u.data.base;
    const INSDC_coord_len *read_len = argv[ 2 ].u.data.base;
    const uint8_t *read_type = argv[ 3 ].u.data.base;
    bool is_sequential = false;

    assert( argv[ 0 ].u.data.elem_bits == 8 );
    assert( argv[ 1 ].u.data.elem_bits == 64 );
    assert( argv[ 2 ].u.data.elem_bits == sizeof( INSDC_coord_len ) * 8 );
    assert( argv[ 2 ].u.data.elem_count == num_reads );
    assert( argv[ 3 ].u.data.elem_count == num_reads );

    /* move all input pointers to the first element of this row */
    src += argv [ 0 ] . u . data . first_elem;
    align_id += argv [ 1 ] . u . data . first_elem;
    read_len += argv [ 2 ] . u . data . first_elem;
    read_type += argv [ 3 ] . u . data . first_elem;

    /* a row that is neither the last row nor its successor restarts the
       sequential run; the access counts as sequential once the run is more
       than 100 rows long */
    if ( row_id != self->last_row_id && row_id != self->last_row_id + 1 )
    {
        self->first_sequential_row_id = row_id;
        is_sequential = false;
    }
    else if ( row_id > self->first_sequential_row_id + 100 )
    {
        is_sequential = true;
    }
    self->last_row_id = row_id;
    /* is_sequential = false; forcing it to false ... Sept. 16th 2015 to analyze prefetching */

    /* the output row holds the bases of all reads */
    for ( i = 0, len = 0; i < (int)num_reads; i++ )
    {
        len += read_len[ i ];
    }

    /* resize output row */
    rslt->data->elem_bits = 8;
    rc = KDataBufferResize( rslt->data, len );
    rslt->elem_count = len;
    dst = rslt->data->base;
    if ( rc == 0 && len > 0 )
    {
        if ( len == src_len ) /*** shortcut - all data is local ***/
        {
            memmove( dst, src, len );
        }
        else
        {
            if ( is_sequential && ( row_id < self->prefetch_start_id || row_id > self->prefetch_stop_id ) )
            {
                /* do prefetch: row_id lies outside of the window recorded by
                   the previous prefetch */
                uint32_t num_rows = ( argv[ 1 ].u.data.base_elem_count - argv[ 1 ].u.data.first_elem );
                VCursorDataPrefetch( self->curs, align_id, self->read_idx, num_rows, 1, INT64_MAX, true );
                self->prefetch_start_id=row_id;
                self->prefetch_stop_id =argv[1].blob_stop_id;
            }

            for( i = 0; i < (int)num_reads && rc == 0; i++ ) /*** checking read by read ***/
            {
                if ( align_id[ i ] > 0 )
                {
                    /* the bases of this read are stored with its alignment */
                    const INSDC_4na_bin *r_src;
                    uint32_t r_src_len;

                    SUB_DEBUG( ( "SUB.Rd in 'seq-restore-read.c' at #%lu\n", align_id[ i ] ) );
                    rc = VCursorCellDataDirect( self -> curs, align_id[ i ], self -> read_idx, NULL, ( const void** ) &r_src, NULL, &r_src_len );
                    if ( rc == 0 )
                    {
                        if ( r_src_len == read_len[ i ] )
                        {
                            if ( read_type[ i ] & SRA_READ_TYPE_FORWARD )
                            {
                                memmove( dst, r_src, read_len[ i ] );
                            }
                            else if ( read_type[ i ] & SRA_READ_TYPE_REVERSE )
                            {
                                /* write the bases back to front, each one
                                   translated through 'map' */
                                int j, k;
                                for( j = 0, k = read_len[ i ] - 1; j < (int)read_len[ i ]; j++, k-- )
                                {
                                    dst[ j ] = map [ r_src[ k ] & 15 ];
                                }
                            }
                            else
                            {
                                rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                            }
                        }
                        else
                        {
                            rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                        }
                    }
                }
                else /*** data is in READ column **/
                {
                    if ( src_len >= read_len[ i ] )
                    {
                        /* consume the next bases of the local data */
                        memmove( dst, src, read_len[ i ] );
                        src_len -= read_len[ i ];
                        src += read_len[ i ];
                    }
                    else
                    {
                        return RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                    }
                }
                /* the output advances by the read length in every case */
                dst += read_len[ i ];
            }
        }
    }
    return rc;
}
/* --------------------------------------------------------------------------------------
    argv[ 0 ]   ... CMP_READ
    argv[ 1 ]   ... PRIM_ALIG_ID
    argv[ 2 ]   ... READ_LEN
    argv[ 3 ]   ... READ_TYPE

    Row function that rebuilds the complete bases of a spot. For every read an
    alignment id > 0 means the bases are taken from the read store of 'self'
    ( if there is one and it has an entry for that id ) or else from row
    align_id of column self->read_idx in self->curs; they are copied as they
    are for a forward read, reversed and translated through 'map' for a
    reverse read. For all other reads the next read_len[ i ] bases are
    consumed from argv[ 0 ].

    handle_caching() is called once per row, before the output is produced.

    Returns 0 on success, the rc of KDataBufferResize() or
    VCursorCellDataDirect(), or rcData / rcInconsistent if lengths or read
    types do not fit.
   -------------------------------------------------------------------------------------- */
static rc_t CC seq_restore_read_impl2 ( void *data, const VXformInfo *info, int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv [] )
{
    rc_t rc;
    Read_Restorer *self = data;
    INSDC_4na_bin *dst;
    INSDC_coord_len len;
    id_list align_ids;
    uint32_t i;
    uint32_t src_len = (uint32_t)argv[ 0 ] . u . data . elem_count;
    const INSDC_4na_bin * src = argv[ 0 ] . u . data.base;
    const uint32_t num_reads = (uint32_t)argv[ 1 ]. u . data . elem_count;
    const INSDC_coord_len * read_len = argv[ 2 ] . u . data.base;
    const uint8_t *read_type = argv[ 3 ] . u . data.base;
    int64_t last_row_id = argv[ 1 ] . blob_stop_id;

    /* the id list covers everything from this row's first element to the end
       of the data that came with argv[ 1 ] */
    align_ids.list = ( int64_t * )argv[ 1 ].u.data.base;
    align_ids.count = ( uint32_t )( argv[ 1 ].u.data.base_elem_count - argv[ 1 ].u.data.first_elem );

    assert( argv[ 0 ].u.data.elem_bits == 8 );
    assert( argv[ 1 ].u.data.elem_bits == 64 );
    assert( argv[ 2 ].u.data.elem_bits == sizeof( INSDC_coord_len ) * 8 );
    assert( argv[ 2 ].u.data.elem_count == num_reads );
    assert( argv[ 3 ].u.data.elem_count == num_reads );

    /* move all input pointers to the first element of this row */
    src += argv [ 0 ] . u . data . first_elem;
    align_ids.list += argv [ 1 ] . u . data . first_elem;
    read_len += argv [ 2 ] . u . data . first_elem;
    read_type += argv [ 3 ] . u . data . first_elem;

    handle_caching( self, &align_ids, row_id, last_row_id );

    /* the output row holds the bases of all reads */
    for ( i = 0, len = 0; i < num_reads; i++ )
        len += read_len[ i ];

    /* resize output row */
    rslt->data->elem_bits = 8;
    rc = KDataBufferResize( rslt->data, len );
    rslt->elem_count = len;
    dst = rslt->data->base;
    if ( rc == 0 && len > 0 )
    {
        if ( len == src_len ) /*** shortcut - all data is local ***/
            memmove( dst, src, len );
        else
        {
            rr_entry * ep;
            const INSDC_4na_bin * rd;
            uint32_t rd_len;
            bool found_in_cache;

            for ( i = 0; i < num_reads && rc == 0; i++ ) /*** checking read by read ***/
            {
                int64_t align_id = align_ids.list[ i ];
                if ( align_id > 0 )
                {
                    found_in_cache = false;
                    if ( self -> read_store != NULL )
                        found_in_cache = rr_get_read ( self -> read_store, align_id, &ep );

                    if ( found_in_cache )
                    {
                        /* we found it in the cache... */
                        rd = &( ep->read[ 0 ] );
                        rd_len = ep->read_len;
                    }
                    else
                    {
                        /* we did not find it in the cache, get it from the alignment-table... */
                        rc = VCursorCellDataDirect( self -> curs, align_id, self -> read_idx, NULL, ( const void** ) &rd, NULL, &rd_len );
                    }

                    if ( rc == 0 )
                    {
                        if ( rd_len == read_len[ i ] )
                        {
                            if ( read_type[ i ] & SRA_READ_TYPE_FORWARD )
                            {
                                memmove( dst, rd, read_len[ i ] );
                            }
                            else if ( read_type[ i ] & SRA_READ_TYPE_REVERSE )
                            {
                                /* write the bases back to front, each one
                                   translated through 'map' */
                                int j, k;
                                for( j = 0, k = read_len[ i ] - 1; j < (int)read_len[ i ]; j++, k-- )
                                {
                                    dst[ j ] = map [ rd[ k ] & 15 ];
                                }
                            }
                            else
                            {
                                rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                            }
                        }
                        else
                        {
                            rc = RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                        }
                    }
                }
                else /*** data is in READ column **/
                {
                    if ( src_len >= read_len[ i ] )
                    {
                        /* consume the next bases of the local data */
                        memmove( dst, src, read_len[ i ] );
                        src_len -= read_len[ i ];
                        src += read_len[ i ];
                    }
                    else
                    {
                        return RC( rcXF, rcFunction, rcExecuting, rcData, rcInconsistent );
                    }
                }
                /* the output advances by the read length in every case */
                dst += read_len[ i ];
            }
        }
    }
    return rc;
}
/* build_scaffold_qual_impl
 *  Writes the quality values of a scaffold made of `components` pieces into Dst,
 *  one piece after the other.
 *
 *  self       : provides the optional cursor (curs) and its column index (col_idx)
 *  Dst        : output buffer, one INSDC_quality_phred per base; the caller must have
 *               sized it for the sum of all Length[] values
 *  Start      : one-based start of each piece inside its source row
 *  Length     : length of each piece
 *  Props      : properties of each piece; negative values denote a gap
 *  join       : source row-ids, consumed in order by the pieces read via the cursor
 *
 *  Per piece:
 *    - gap (props < 0)          -> filled with QUAL_GAP_VALUE
 *    - no cursor (curs == NULL) -> filled with the constant 30
 *    - otherwise                -> copied from the cursor cell, reversed when strand == 2
 *
 *  Returns 0 on success, rcType/rcInvalid for an unsupported type or strand,
 *  rcData/rcInsufficient when the source cell does not cover [Start, Start + Length),
 *  or the rc of VCursorCellDataDirect.
 */
static rc_t build_scaffold_qual_impl(self_t const *const self, void *const Dst,
                                     unsigned const components,
                                     INSDC_coord_one const Start[/* components */],
                                     INSDC_coord_len const Length[/* components */],
                                     NCBI_WGS_component_props const Props[/* components */],
                                     int64_t const join[/* components */])
{
    INSDC_quality_phred *const dst = Dst;
    unsigned i;
    unsigned j;     /* write position in dst */
    unsigned id;    /* next unused entry of join[] */
    rc_t rc;

    for (rc = 0, id = j = i = 0; rc == 0 && i != components; ++i) {
        INSDC_coord_len const length = Length[i];
        int const props = Props[i];

        if (props < 0) {
            /* gap */
            memset(dst + j, QUAL_GAP_VALUE, length);
        }
        else if (self->curs == NULL) {
            memset(dst + j, 30, length);
        }
        else {
            int const type = props & 0x0F;
            /* NOTE(review): this mask REMOVES the strand bits before shifting, yet the
             * result is compared against strand values (2, 3) below. It looks like the
             * '~' is unintended - confirm against the NCBI_WGS_strand_* definitions.
             * Left unchanged here. */
            int const strand = (props & ~(NCBI_WGS_strand_plus | NCBI_WGS_strand_minus)) >> 4;

            if (type != 0 || strand == 3)
                rc = RC(rcXF, rcFunction, rcExecuting, rcType, rcInvalid);
            else {
                int64_t const row = join[id++];
                uint32_t elem_bits;
                uint32_t bit_offset;
                uint32_t elem_count;
                void const *base;

                rc = VCursorCellDataDirect(self->curs, row, self->col_idx, &elem_bits, &base, &bit_offset, &elem_count);
                if (rc == 0) {
                    /* bit_offset is only meaningful after a successful read */
                    assert(bit_offset == 0);

                    /* Start[] is one-based; compare in 64 bits so that neither the
                     * conversion to zero-based nor the addition can wrap */
                    if (Start[i] < 1 || (uint64_t)(Start[i] - 1) + length > elem_count)
                        rc = RC(rcXF, rcFunction, rcExecuting, rcData, rcInsufficient);
                    else {
                        /* full-width offset: a quality-sized (one byte) variable
                         * would truncate any start beyond 255 */
                        uint32_t const start = (uint32_t)(Start[i] - 1);
                        INSDC_quality_phred const *const src = base;

                        if (strand == 2) {
                            /* reverse strand: copy back to front */
                            unsigned k;
                            unsigned jj;

                            for (jj = j + length, k = 0; k != length; ++k) {
                                INSDC_quality_phred const elem = src[start + k];

                                --jj;
                                dst[jj] = elem;
                            }
                        }
                        else
                            memcpy(&dst[j], &src[start], length);
                    }
                }
            }
        }
        j += length;
    }
    return rc;
}
/** * returns true if checks are passed */ void runChecks ( const char * accession, const CheckCorruptConfig * config, const VCursor * pa_cursor, const VCursor * sa_cursor, const VCursor * seq_cursor ) { rc_t rc; uint32_t pa_has_ref_offset_idx; uint32_t sa_has_ref_offset_idx; uint32_t sa_seq_spot_id_idx; uint32_t sa_seq_read_id_idx; uint32_t sa_pa_id_idx; uint32_t sa_tmp_mismatch_idx; uint32_t seq_pa_id_idx; uint32_t seq_read_len_idx; uint32_t seq_cmp_read_idx; bool has_tmp_mismatch; /* add columns to cursor */ #define add_column(tbl_name, cursor, idx, col_spec) \ rc = VCursorAddColumn( cursor, &idx, col_spec ); \ if ( rc != 0 ) \ throw VDB_ERROR("VCursorAddColumn() failed for " tbl_name " table, " col_spec " column", rc); add_column( "PRIMARY_ALIGNMENT", pa_cursor, pa_has_ref_offset_idx, "(bool)HAS_REF_OFFSET" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_has_ref_offset_idx, "(bool)HAS_REF_OFFSET" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_spot_id_idx, "SEQ_SPOT_ID" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_seq_read_id_idx, "SEQ_READ_ID" ); add_column( "SECONDARY_ALIGNMENT", sa_cursor, sa_pa_id_idx, "PRIMARY_ALIGNMENT_ID" ); add_column( "SEQUENCE", seq_cursor, seq_pa_id_idx, "PRIMARY_ALIGNMENT_ID" ); add_column( "SEQUENCE", seq_cursor, seq_read_len_idx, "READ_LEN" ); add_column( "SEQUENCE", seq_cursor, seq_cmp_read_idx, "CMP_READ" ); // optional columns rc = VCursorAddColumn( sa_cursor, &sa_tmp_mismatch_idx, "TMP_MISMATCH" ); if ( rc == 0 ) has_tmp_mismatch = true; else { has_tmp_mismatch = false; rc = 0; } #undef add_column rc = VCursorOpen( pa_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for PRIMARY_ALIGNMENT table", rc); rc = VCursorOpen( sa_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for SECONDARY_ALIGNMENT table", rc); rc = VCursorOpen( seq_cursor ); if (rc != 0) throw VDB_ERROR("VCursorOpen() failed for SEQUENCE table", rc); int64_t sa_id_first; uint64_t sa_row_count; rc = VCursorIdRange( 
sa_cursor, sa_pa_id_idx, &sa_id_first, &sa_row_count ); if (rc != 0) throw VDB_ERROR("VCursorIdRange() failed for SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", rc); bool reported_about_no_pa = false; uint64_t pa_longer_sa_rows = 0; uint64_t pa_longer_sa_limit; if (config->pa_len_threshold_percent > 0) pa_longer_sa_limit = ceil( config->pa_len_threshold_percent * sa_row_count ); else if (config->pa_len_threshold_number == 0 || config->pa_len_threshold_number > sa_row_count) pa_longer_sa_limit = sa_row_count; else pa_longer_sa_limit = config->pa_len_threshold_number; uint64_t sa_row_limit; if (config->sa_cutoff_percent > 0) sa_row_limit = ceil( config->sa_cutoff_percent * sa_row_count ); else if (config->sa_cutoff_number == 0 || config->sa_cutoff_number > sa_row_count) sa_row_limit = sa_row_count; else sa_row_limit = config->sa_cutoff_number; for ( uint64_t i = 0; i < sa_row_count && i < sa_row_limit; ++i ) { int64_t sa_row_id = i + sa_id_first; const void * data_ptr = NULL; uint32_t data_len; uint32_t pa_row_len; uint32_t sa_row_len; uint32_t seq_read_len_len; // SA:HAS_REF_OFFSET rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_has_ref_offset_idx, NULL, (const void**)&data_ptr, NULL, &sa_row_len ); if ( rc != 0 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, HAS_REF_OFFSET column", sa_row_id, rc); const int64_t * p_seq_spot_id; uint32_t seq_spot_id_len; // SA:SEQ_SPOT_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_seq_spot_id_idx, NULL, (const void**)&p_seq_spot_id, NULL, &seq_spot_id_len ); if ( rc != 0 || p_seq_spot_id == NULL || seq_spot_id_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_SPOT_ID column", sa_row_id, rc); int64_t seq_spot_id = *p_seq_spot_id; if (seq_spot_id == 0) { std::stringstream ss; ss << "SECONDARY_ALIGNMENT:" << sa_row_id << " has SEQ_SPOT_ID = " << seq_spot_id; throw DATA_ERROR(ss.str()); } if ( has_tmp_mismatch ) { const char * 
p_sa_tmp_mismatch; // SA:TMP_MISMATCH rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_tmp_mismatch_idx, NULL, (const void**)&p_sa_tmp_mismatch, NULL, &data_len ); if ( rc != 0 || p_sa_tmp_mismatch == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, TMP_MISMATCH column", sa_row_id, rc); for ( uint32_t j = 0; j < data_len; ++j ) { if ( p_sa_tmp_mismatch[j] == '=' ) { std::stringstream ss; ss << "SECONDARY_ALIGNMENT:" << sa_row_id << " TMP_MISMATCH contains '='"; throw DATA_ERROR(ss.str()); } } } const int64_t * p_pa_row_id; // SA:PRIMARY_ALIGNMENT_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_pa_id_idx, NULL, (const void**)&p_pa_row_id, NULL, &data_len ); if ( rc != 0 || p_pa_row_id == NULL || data_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, PRIMARY_ALIGNMENT_ID column", sa_row_id, rc); int64_t pa_row_id = *p_pa_row_id; if (pa_row_id == 0) { if (!reported_about_no_pa) { PLOGMSG (klogInfo, (klogInfo, "$(ACC) has secondary alignments without primary", "ACC=%s", accession)); reported_about_no_pa = true; } continue; } // PA:HAS_REF_OFFSET rc = VCursorCellDataDirect ( pa_cursor, pa_row_id, pa_has_ref_offset_idx, NULL, &data_ptr, NULL, &pa_row_len ); if ( rc != 0 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on PRIMARY_ALIGNMENT table, HAS_REF_OFFSET column", pa_row_id, rc); // move on when PA.len equal to SA.len if (pa_row_len == sa_row_len) continue; if (pa_row_len < sa_row_len) { std::stringstream ss; ss << "PRIMARY_ALIGNMENT:" << pa_row_id << " HAS_REF_OFFSET length (" << pa_row_len << ") less than SECONDARY_ALIGNMENT:" << sa_row_id << " HAS_REF_OFFSET length (" << sa_row_len << ")"; throw DATA_ERROR(ss.str()); } // we already know that pa_row_len > sa_row_len ++pa_longer_sa_rows; const int32_t * p_seq_read_id; // SA:SEQ_READ_ID rc = VCursorCellDataDirect ( sa_cursor, sa_row_id, sa_seq_read_id_idx, NULL, (const void**)&p_seq_read_id, NULL, &data_len ); if ( 
rc != 0 || p_seq_read_id == NULL || data_len != 1 ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SECONDARY_ALIGNMENT table, SEQ_READ_ID column", sa_row_id, rc); // one-based read index int32_t seq_read_id = *p_seq_read_id; const uint32_t * p_seq_read_len; // SEQ:READ_LEN rc = VCursorCellDataDirect ( seq_cursor, seq_spot_id, seq_read_len_idx, NULL, (const void**)&p_seq_read_len, NULL, &seq_read_len_len ); if ( rc != 0 || p_seq_read_len == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", seq_spot_id, rc); if ( seq_read_id < 1 || (uint32_t)seq_read_id > seq_read_len_len ) { std::stringstream ss; ss << "SECONDARY:" << sa_row_id << " SEQ_READ_ID value (" << seq_read_id << ") - 1 based, is out of SEQUENCE:" << seq_spot_id << " READ_LEN range (" << seq_read_len_len << ")"; throw DATA_ERROR(ss.str()); } if (pa_row_len != p_seq_read_len[seq_read_id - 1]) { std::stringstream ss; ss << "PRIMARY_ALIGNMENT:" << pa_row_id << " HAS_REF_OFFSET length (" << pa_row_len << ") does not match its SEQUENCE:" << seq_spot_id << " READ_LEN[" << seq_read_id - 1 << "] value (" << p_seq_read_len[seq_read_id - 1] << ")"; throw DATA_ERROR(ss.str()); } if (pa_longer_sa_rows >= pa_longer_sa_limit) { std::stringstream ss; ss << "Limit violation (pa_longer_sa): there are at least " << pa_longer_sa_rows << " alignments where HAS_REF_OFFSET column is longer in PRIMARY_ALIGNMENT than in SECONDARY_ALIGNMENT"; throw DATA_ERROR(ss.str()); } } int64_t seq_id_first; uint64_t seq_row_count; rc = VCursorIdRange( seq_cursor, seq_pa_id_idx, &seq_id_first, &seq_row_count ); if (rc != 0) throw VDB_ERROR("VCursorIdRange() failed for SEQUENCE table, PRIMARY_ALIGNMENT_ID column", rc); uint64_t seq_row_limit; if (config->seq_cutoff_percent > 0) seq_row_limit = ceil( config->seq_cutoff_percent * seq_row_count ); else if (config->seq_cutoff_number == 0 || config->seq_cutoff_number > seq_row_count) seq_row_limit = seq_row_count; else seq_row_limit = 
config->seq_cutoff_number; for ( uint64_t i = 0; i < seq_row_count && i < seq_row_limit; ++i ) { int64_t seq_row_id = i + seq_id_first; const void * data_ptr = NULL; uint32_t data_len; const int64_t * p_seq_pa_id; uint32_t seq_pa_id_len; // SEQ:PRIMARY_ALIGNMENT_ID rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_pa_id_idx, NULL, (const void**)&p_seq_pa_id, NULL, &seq_pa_id_len ); if ( rc != 0 || p_seq_pa_id == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, PRIMARY_ALIGNMENT_ID column", seq_row_id, rc); const uint32_t * p_seq_read_len; // SEQ:READ_LEN rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_read_len_idx, NULL, (const void**)&p_seq_read_len, NULL, &data_len ); if ( rc != 0 || p_seq_read_len == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, READ_LEN column", seq_row_id, rc); if ( seq_pa_id_len != data_len ) { std::stringstream ss; ss << "SEQUENCE:" << seq_row_id << " PRIMARY_ALIGNMENT_ID length (" << seq_pa_id_len << ") does not match SEQUENCE:" << seq_row_id << " READ_LEN length (" << data_len << ")"; throw DATA_ERROR(ss.str()); } uint64_t sum_unaligned_read_len = 0; for ( uint32_t j = 0; j < seq_pa_id_len; ++j ) { if ( p_seq_pa_id[j] == 0 ) { sum_unaligned_read_len += p_seq_read_len[j]; } } // SEQ:CMP_READ rc = VCursorCellDataDirect ( seq_cursor, seq_row_id, seq_cmp_read_idx, NULL, (const void**)&data_ptr, NULL, &data_len ); if ( rc != 0 || data_ptr == NULL ) throw VDB_ROW_ERROR("VCursorCellDataDirect() failed on SEQUENCE table, SEQ:CMP_READ column", seq_row_id, rc); if ( sum_unaligned_read_len != data_len ) { std::stringstream ss; ss << "SEQUENCE:" << seq_row_id << " CMP_READ length (" << data_len << ") does not match sum of unaligned READ_LEN values (" << sum_unaligned_read_len << ")"; throw DATA_ERROR(ss.str()); } } if (sa_row_limit < sa_row_count || seq_row_limit < seq_row_count) PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good (based on first $(SA_CUTOFF) of 
SECONDARY_ALIGNMENT and $(SEQ_CUTOFF) SEQUENCE rows)", "ACC=%s,SA_CUTOFF=%lu,SEQ_CUTOFF=%lu", accession, sa_row_limit, seq_row_limit)); else PLOGMSG (klogInfo, (klogInfo, "$(ACC) looks good", "ACC=%s", accession)); }
/* function INSDC:coord:zero NCBI:align:ref_pos ( I64 ref_id, INSDC:coord:zero ref_start ); */ static rc_t CC align_ref_pos ( void *data, const VXformInfo *info, int64_t row_id, VRowResult *rslt, uint32_t argc, const VRowData argv[] ) { rc_t rc = 0; RefPos const *self = ( void const * )data; int64_t ref_row_id = 0; INSDC_coord_zero *ref_pos; unsigned const ploidy = ( unsigned const )argv[ REF_START ].u.data.elem_count; unsigned i; /* get start and length of reference segment */ int64_t const *ref_id = 0; INSDC_coord_zero const *ref_start; assert( argv[ REF_ID ].u.data.elem_bits == sizeof( *ref_id ) * 8 ); assert( argv[ REF_START ].u.data.elem_bits == sizeof( *ref_start ) * 8 ); ref_start = argv[ REF_START ].u.data.base; ref_start += argv[ REF_START ].u.data.first_elem; if ( self->curs != NULL ) { char const *name = NULL; uint32_t name_len; BSTRowRange *brr; ref_id = argv[ REF_ID ].u.data.base; ref_id += argv[ REF_ID ].u.data.first_elem; brr = ( BSTRowRange * )BSTreeFind( &self->tr_range, &ref_id[ 0 ], row_range_cmp ); if ( brr == NULL ) { RowRange *new_rr; SUB_DEBUG( ( "SUB.Rd in 'align-ref-pos.c' at #%lu\n", ref_id[ 0 ] ) ); rc = VCursorCellDataDirect( self->curs, ref_id[ 0 ], self->name_idx, NULL, (void const **)&name, NULL, &name_len ); if ( rc != 0 ) return rc; rc = VCursorParamsSet( ( struct VCursorParams const * )self->curs, "QUERY_SEQ_NAME", "%.*s", name_len, name ); if ( rc != 0 ) return rc; rc = VCursorCellDataDirect( self->curs, ref_id[ 0 ], self->name_range_idx, NULL, (void const **)&new_rr, NULL, NULL ); if ( rc != 0 ) return rc; brr = malloc( sizeof( *brr ) ); if ( brr == NULL ) { return RC( rcXF, rcFunction, rcConstructing, rcMemory, rcExhausted ); } else { memcpy( &brr->rr, new_rr, sizeof( *new_rr ) ); BSTreeInsert( ( BSTree* )&self->tr_range, ( BSTNode* )brr, row_range_sort ); } } ref_row_id = brr->rr.start_id; } rc = KDataBufferResize( rslt->data, ploidy ); if ( rc != 0 ) return rc; ref_pos = rslt->data->base; for ( i = 0; i != ploidy; ++i ) { 
ref_pos[ i ] = ref_start[ i ]; if ( self->curs != NULL ) { ref_pos[ i ] += ( INSDC_coord_zero )( ( ref_id[ 0 ] - ref_row_id ) * self->max_seq_len ); } } rslt->elem_count = ploidy; rslt->elem_bits = sizeof( ref_pos[ 0 ] ) * 8; return rc; }
/* cg_dump_write_spot
 *  Formats one spot (row_id) as two tab-separated text lines - one per 35-base
 *  half of the 70-base READ / QUALITY cells - and, unless opts->comp is oc_null,
 *  appends them to the lane's current output file. When the lane already holds
 *  opts->cutoff spots, the current file is released and the next chunk is
 *  started via make_read_file() before writing.
 *
 *  opts    : output options (comp, cutoff)
 *  cg_ctx  : open SEQUENCE cursor with its READ / QUALITY column indexes, plus
 *            the lookup and output directory handed to make_read_file()
 *  row_id  : row to dump
 *  l       : lane state (file, chunk number, spot count, write position)
 *
 *  Returns 0 on success; otherwise the rc of the failing call, or
 *  rcRange/rcInvalid when READ or QUALITY is not exactly 70 elements long.
 */
static rc_t cg_dump_write_spot( cg_dump_opts * opts, cg_dump_ctx * cg_ctx, uint64_t row_id, lane * l )
{
    uint32_t elem_bits, boff, read_len, qual_len;
    const char * read;
    const char * qual;
    char buffer[ 1024 ];
    size_t num_writ_buf;
    size_t num_writ_file;
    rc_t rc;

    rc = VCursorCellDataDirect( cg_ctx->seq_cur, row_id, cg_ctx->seq_read_idx, &elem_bits, (const void**)&read, &boff, &read_len );
    if ( rc != 0 )
    {
        (void)PLOGERR( klogErr, ( klogErr, rc, "cannot read READ in row #$(row_id)", "row_id=%lu", row_id ) );
        return rc;
    }

    rc = VCursorCellDataDirect( cg_ctx->seq_cur, row_id, cg_ctx->seq_qual_idx, &elem_bits, (const void**)&qual, &boff, &qual_len );
    if ( rc != 0 )
    {
        (void)PLOGERR( klogErr, ( klogErr, rc, "cannot read QUALITY in row #$(row_id)", "row_id=%lu", row_id ) );
        return rc;
    }

    /* BOTH cells must hold 2 x 35 elements: the format below reads 35 characters
       starting at offset 0 and at offset 35 of each of them, so accepting a row
       where only one of the two has the right length would read past the end of
       the other cell */
    if ( ( read_len != 70 ) || ( qual_len != 70 ) )
    {
        rc = RC( rcExe, rcDatabase, rcReading, rcRange, rcInvalid );
        (void)LOGERR( klogErr, rc, "len of read/quality columns do not match cg-length of 2 x 35" );
        return rc;
    }

    rc = string_printf ( buffer, sizeof buffer, &num_writ_buf,
                         "%lu\t0\t%.35s\t%.35s\n%lu\t1\t%.35s\t%.35s\n",
                         row_id, read, qual, row_id, &(read[35]), &(qual[35]) );
    if ( rc != 0 )
    {
        (void)PLOGERR( klogErr, ( klogErr, rc, "cannot generate output in row #$(row_id)", "row_id=%lu", row_id ) );
        return rc;
    }

    /* with oc_null nothing is written at all */
    if ( opts->comp == oc_null )
        return rc;

    if ( l->spot_count >= opts->cutoff )
    {
        /* the current chunk is full: close it and open the next one */
        KFileRelease( l->reads );
        l->chunk++;
        l->spot_count = 0;
        l->write_pos = 0;
        rc = make_read_file( opts, cg_ctx->lookup, cg_ctx->out_dir, l );
        if ( rc != 0 )
            return rc;
    }

    rc = KFileWrite ( l->reads, l->write_pos, buffer, num_writ_buf, &num_writ_file );
    if ( rc != 0 )
    {
        (void)PLOGERR( klogErr, ( klogErr, rc, "cannot write output in row #$(row_id)", "row_id=%lu", row_id ) );
    }
    else
    {
        l->write_pos += num_writ_file;
        l->spot_count ++;
    }
    return rc;
}
/*
 * initialize_ref_pos
 *  Prepares pileup_state for iterating over a slice of one reference:
 *    1. stores the REFERENCE row count, the name of the reference (read at
 *       reference_start_id) and MAX_SEQ_LEN in pileup_state;
 *    2. derives slice_start_id / slice_end_id from slice_start, slice_length
 *       and MAX_SEQ_LEN;
 *    3. walks the REFERENCE rows from at most 10 rows before slice_start_id
 *       until the row id passes slice_start_id or the reference name changes,
 *       and hands the PRIMARY_ALIGNMENT_IDS of every visited row to
 *       add_ref_row_to_cache().
 *
 *  column_index_ref / column_index_pa hold the cursor column indexes (COL_*);
 *  the column name/count parameters of the PA table are only forwarded to
 *  add_ref_row_to_cache().
 *
 *  On failure a message is written to error_buf (error_buf_size bytes) and a
 *  non-zero rc is returned: the rc of the failing VDB call, or (rc_t)(-1)
 *  when the table is empty or MAX_SEQ_LEN is missing or zero.
 */
rc_t initialize_ref_pos (
    PileupIteratorState* pileup_state,
    VCursor const* cursor_ref, VCursor const* cursor_pa,
    char const* const* column_names_ref, uint32_t* column_index_ref, size_t column_count_ref,
    char const* const* column_names_pa, uint32_t* column_index_pa, size_t column_count_pa,
    char* error_buf, size_t error_buf_size
    )
{
    int64_t row_id;
    uint64_t row_count;
    uint32_t max_seq_len, row_len;

    rc_t rc = VCursorIdRange ( cursor_ref, 0, & row_id, & row_count );
    if ( rc != 0 )
    {
        /* row_id / row_count are not valid after a failure - do not look at them */
        rc_t res = string_printf ( error_buf, error_buf_size, NULL,
            "ERROR: VCursorIdRange(ref) failed with error: 0x%08x (%u) [%R]", rc, rc, rc);
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return rc;
    }

    /*printf ("REFERENCE table: row_id=%lld, row_count=%llu\n", row_id, row_count);*/

    if ( row_count < 1 )
    {
        rc_t res = string_printf ( error_buf, error_buf_size, NULL, "There is no rows in REFERENCE table");
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return (rc_t)(-1);
    }

    pileup_state->total_row_count = row_count;

    /* We don't know the id of the last row of this reference; remember its name
       so that a change of name can be used to detect the end */
    rc = VCursorReadDirect ( cursor_ref, pileup_state->reference_start_id,
        column_index_ref [COL_NAME],
        sizeof (pileup_state->ref_name[0]) * 8,
        pileup_state->ref_name,
        countof(pileup_state->ref_name), & row_len );
    if ( rc != 0 )
    {
        rc_t res = string_printf ( error_buf, error_buf_size, NULL,
            "ERROR: VCursorReadDirect(ref) failed with error: 0x%08x (%u) [%R]", rc, rc, rc);
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return rc;
    }
    pileup_state->ref_name[ min ( countof(pileup_state->ref_name) - 1, row_len) ] = '\0';

    /* Read MAX_SEQ_LEN from the start row and assume that it is the same for all other rows */
    rc = VCursorReadDirect ( cursor_ref, pileup_state->reference_start_id,
        column_index_ref [COL_MAX_SEQ_LEN],
        sizeof (max_seq_len) * 8, & max_seq_len, 1, & row_len );
    if ( rc != 0 )
    {
        rc_t res = string_printf ( error_buf, error_buf_size, NULL,
            "ERROR: VCursorReadDirect(ref) failed with error: 0x%08x (%u) [%R]", rc, rc, rc);
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return rc;
    }

    /* validate before the value is stored or used: an empty cell leaves
       max_seq_len unset */
    if ( row_len < 1 )
    {
        rc_t res = string_printf ( error_buf, error_buf_size, NULL,
            "There is no MAX_SEQ_LEN column for row_id=%lld in REFERENCE table", row_id);
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return (rc_t)(-1);
    }

    /* max_seq_len is the divisor of the slice-id computation below */
    if ( max_seq_len == 0 )
    {
        rc_t res = string_printf ( error_buf, error_buf_size, NULL,
            "MAX_SEQ_LEN is 0 for row_id=%lld in REFERENCE table", row_id);
        if (res == rcBuffer || res == rcInsufficient)
            error_buf [ error_buf_size - 1 ] = '\0';
        return (rc_t)(-1);
    }

    pileup_state->max_seq_len = max_seq_len;

    /* the casts make the arguments match the conversion specifiers exactly */
    printf ("MAX_SEQ_LEN=%lu\n", (unsigned long)max_seq_len);

    pileup_state->slice_start_id = pileup_state->reference_start_id + pileup_state->slice_start/max_seq_len;
    pileup_state->slice_end_id = pileup_state->slice_length != 0 ?
        pileup_state->reference_start_id + (pileup_state->slice_start + (int64_t)pileup_state->slice_length)/max_seq_len :
        (int64_t)pileup_state->total_row_count;

    printf ("slice position range: [%lld, %llu]\n",
        (long long)pileup_state->slice_start,
        (unsigned long long)(pileup_state->slice_start + pileup_state->slice_length));
    /*printf ("slice id range: [%lld, %lld]\n", slice_start_id, slice_end_id);*/

    /* Read reference slice_start_id,
       read OVERLAP_*_POS to find out how many rows we need to read ahead of slice_start_id
       TODO: this is not implemented yet, instead we read just 10 rows ahead
    */

    /* Set cursor to <read_ahead_rows> rows ahead of slice_start_id
       and cache corresponding PRIMARY_ALIGNMENTS
    */
    {
        int64_t current_id = max (pileup_state->reference_start_id, pileup_state->slice_start_id - 10);
        int64_t stop_id = pileup_state->slice_start_id;
        uint32_t seq_start;
        uint32_t dummy;
#if USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS != 1
        int64_t const* alignment_ids;
#endif

        for (; ; ++current_id)
        {
            /* We don't know where the current reference ends:
               read the row's name and stop when it changes */
            char ref_name[ countof (pileup_state->ref_name) ];
            rc = VCursorReadDirect ( cursor_ref, current_id,
                column_index_ref [COL_NAME],
                sizeof (ref_name[0]) * 8,
                ref_name,
                countof(ref_name), & row_len );
            if ( rc != 0 )
            {
                rc_t res = string_printf ( error_buf, error_buf_size, NULL,
                    "ERROR: VCursorReadDirect(ref) failed with error: 0x%08x (%u) [%R]", rc, rc, rc);
                if (res == rcBuffer || res == rcInsufficient)
                    error_buf [ error_buf_size - 1 ] = '\0';
                return rc;
            }
            ref_name[ min ( countof(ref_name) - 1, row_len) ] = '\0';

            if ( current_id > stop_id || strcmp (ref_name, pileup_state->ref_name) )
                break;

#if USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS == 1
            rc = open_blob_for_current_id ( current_id,
                cursor_ref, & pileup_state->blob_alignment_ids,
                column_index_ref [COL_PRIMARY_ALIGNMENT_IDS],
                error_buf, error_buf_size );
            if (rc != 0)
                return rc;
#endif

            /* Read REFERENCE row's SEQ_START column to know the offset */
            rc = VCursorReadDirect ( cursor_ref, current_id,
                column_index_ref [COL_SEQ_START],
                sizeof (seq_start) * 8, & seq_start, 1, & row_len );
            if ( rc != 0 )
            {
                rc_t res = string_printf ( error_buf, error_buf_size, NULL,
                    "ERROR: VCursorReadDirect(ref-seq_start) failed with error: 0x%08x (%u) [%R]", rc, rc, rc);
                if (res == rcBuffer || res == rcInsufficient)
                    error_buf [ error_buf_size - 1 ] = '\0';
                return rc;
            }
            pileup_state->current_seq_start = seq_start;

            /* Read REFERENCE row's PRIMARY_ALIGNMENT_IDS column to iterate through them */
            /* elem_bits = sizeof (*pileup_state->alignment_ids) * 8;*/
#if USE_SINGLE_BLOB_FOR_ALIGNMENT_IDS == 1
            rc = VBlobCellData ( pileup_state->blob_alignment_ids, current_id,
                & dummy, & pileup_state->alignment_ids, NULL, & row_len );
            if ( rc != 0 )
            {
                rc_t res = string_printf ( error_buf, error_buf_size, NULL,
                    "ERROR: VBlobCellData(ref-pa_ids) failed with error: 0x%08x (%u) [%R], row_len=%u", rc, rc, rc, row_len);
                if (res == rcBuffer || res == rcInsufficient)
                    error_buf [ error_buf_size - 1 ] = '\0';
                return rc;
            }
            pileup_state -> size_alignment_ids = row_len;
#else
            rc = VCursorCellDataDirect ( cursor_ref, current_id,
                column_index_ref [COL_PRIMARY_ALIGNMENT_IDS],
                NULL, (void const**)(& alignment_ids), 0, & row_len );
            /*rc = VCursorReadDirect ( cursor_ref, current_id,
                column_index_ref [COL_PRIMARY_ALIGNMENT_IDS],
                sizeof (*pileup_state->alignment_ids) * 8,
                pileup_state->alignment_ids,
                countof (pileup_state->alignment_ids), & row_len );*/
            if ( rc != 0 )
            {
                rc_t res = string_printf ( error_buf, error_buf_size, NULL,
                    "ERROR: VCursorCellDataDirect(ref-pa_ids) failed with error: 0x%08x (%u) [%R], row_len=%u", rc, rc, rc, row_len);
                if (res == rcBuffer || res == rcInsufficient)
                    error_buf [ error_buf_size - 1 ] = '\0';
                return rc;
            }

            rc = PileupIteratorState_SetAlignmentIds ( pileup_state, alignment_ids, row_len );
            if ( rc != 0 )
            {
                rc_t res = string_printf ( error_buf, error_buf_size, NULL,
                    "ERROR: PileupIteratorState_SetAlignmentIds failed with error: 0x%08x (%u), row_len=%u", rc, rc, row_len);
                if (res == rcBuffer || res == rcInsufficient)
                    error_buf [ error_buf_size - 1 ] = '\0';
                return rc;
            }
#endif
            pileup_state->next_alignment_idx = 0;
            /*pileup_state->size_alignment_ids = row_len;*/

            /*printf ("Read %lu PRIMARY_ALIGNMENT_IDS for REFERENCE row_id=%lld:", row_len, current_id);*/
            {
                /*size_t i = 0;
                for (; i < row_len; ++i)
                    printf(" %lld", pa_ids [i]);*/
                /*printf ("\n");*/
            }

            /* For each PRIMARY_ALIGNMENT_ID in alignment_ids: read its start,
               length and cache it if it intersects the starting position */
            rc = add_ref_row_to_cache ( pileup_state, cursor_pa, seq_start,
                pileup_state->slice_start, pileup_state->alignment_ids, row_len,
                column_names_pa, column_index_pa, column_count_pa,
                error_buf, error_buf_size );
            if ( rc != 0 )
                return rc;
        }
    }

    return rc;
}