void copy(const CPU_MATRIX & cpu_matrix, compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix ) { if ( cpu_matrix.size1() > 0 && cpu_matrix.size2() > 0 ) { gpu_matrix.resize(static_cast<unsigned int>(cpu_matrix.size1()), static_cast<unsigned int>(cpu_matrix.size2()), false); //determine nonzeros: long num_entries = 0; for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it) { unsigned int entries_per_row = 0; for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it) { ++entries_per_row; } num_entries += viennacl::tools::roundUpToNextMultiple<unsigned int>(entries_per_row, ALIGNMENT); } //std::cout << "CPU->GPU, Number of entries: " << num_entries << std::endl; //set up matrix entries: std::vector<unsigned int> row_buffer(cpu_matrix.size1() + 1); std::vector<unsigned int> col_buffer(num_entries); std::vector<SCALARTYPE> elements(num_entries); unsigned int row_index = 0; unsigned int data_index = 0; for (typename CPU_MATRIX::const_iterator1 row_it = cpu_matrix.begin1(); row_it != cpu_matrix.end1(); ++row_it) { row_buffer[row_index] = data_index; ++row_index; for (typename CPU_MATRIX::const_iterator2 col_it = row_it.begin(); col_it != row_it.end(); ++col_it) { col_buffer[data_index] = static_cast<unsigned int>(col_it.index2()); elements[data_index] = *col_it; ++data_index; } data_index = viennacl::tools::roundUpToNextMultiple<unsigned int>(data_index, ALIGNMENT); //take care of alignment } row_buffer[row_index] = data_index; /*gpu_matrix._row_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, row_buffer); gpu_matrix._col_buffer = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, col_buffer); gpu_matrix._elements = viennacl::ocl::device().createMemory(CL_MEM_READ_WRITE, elements); gpu_matrix._nonzeros = num_entries;*/ gpu_matrix.set(&row_buffer[0], &col_buffer[0], &elements[0], static_cast<unsigned int>(cpu_matrix.size1()), num_entries); } }
// Computes output = weights * im2col(input) on the GPU for one sample,
// dispatching to CUDA (cuBLAS-style caffe_gpu_gemm) or GreenTea/OpenCL
// (greentea_gpu_gemm) depending on the device backend.
//
// input/output are device pointers; input_off/output_off are element offsets
// into those buffers. skip_im2col lets the caller reuse an already-populated
// column buffer. For 1x1 convolutions the input is used directly (no im2col),
// so the input offset is applied at GEMM time instead.
void BaseConvolutionLayer<Dtype>::forward_gpu_gemm(const Dtype* input,
    const int_tp input_off, const Dtype* weights, Dtype* output,
    const int_tp output_off, bool skip_im2col) {
  const Dtype* col_buff = input;
  if (this->device_->backend() == BACKEND_CUDA) {
#ifdef USE_CUDA
    if (!is_1x1_) {
      if (!skip_im2col) {
        // CUDA path applies the input offset via pointer arithmetic.
        conv_im2col_gpu(input + input_off, col_buffer()->mutable_gpu_data());
      }
      col_buff = col_buffer()->gpu_data();
    }
    // One GEMM per group; weights/columns/output are advanced by
    // per-group offsets. For 1x1, col_buff still points at input, so the
    // input offset is folded into the column operand here.
    for (int_tp g = 0; g < group_; ++g) {
      caffe_gpu_gemm<Dtype>(
          CblasNoTrans, CblasNoTrans, conv_out_channels_ / group_,
          conv_out_spatial_dim_, kernel_dim_, (Dtype) 1.,
          weights + weight_offset_ * g,
          col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g, (Dtype) 0.,
          output + output_off + output_offset_ * g);
    }
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
    if (!is_1x1_) {
      if (!skip_im2col) {
        // OpenCL path passes base pointer and offset separately, since
        // cl_mem handles do not support pointer arithmetic.
        greentea_conv_im2col_gpu(input, input_off,
                                 col_buffer()->mutable_gpu_data(), 0);
      }
      col_buff = col_buffer()->gpu_data();
    }
    for (int_tp g = 0; g < group_; ++g) {
      greentea_gpu_gemm<Dtype>(this->device_->id(), CblasNoTrans, CblasNoTrans,
                               conv_out_channels_ / group_,
                               conv_out_spatial_dim_, kernel_dim_, (Dtype) 1.,
                               (cl_mem) weights, weight_offset_ * g,
                               (cl_mem) col_buff,
                               (is_1x1_ ? input_off : 0) + col_offset_ * g,
                               (Dtype) 0., (cl_mem) output,
                               output_off + output_offset_ * g);
    }
#endif  // USE_GREENTEA
  }
}
void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix, CPU_MATRIX & cpu_matrix ) { if ( gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0 ) { cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2()); //get raw data from memory: std::vector<unsigned int> row_buffer(gpu_matrix.size1() + 1); std::vector<unsigned int> col_buffer(gpu_matrix.nnz()); std::vector<SCALARTYPE> elements(gpu_matrix.nnz()); //std::cout << "GPU->CPU, nonzeros: " << gpu_matrix.nnz() << std::endl; cl_int err; err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle1().get(), CL_TRUE, 0, sizeof(unsigned int)*(gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL); CL_ERR_CHECK(err); err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle2().get(), CL_TRUE, 0, sizeof(unsigned int)*gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL); CL_ERR_CHECK(err); err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE)*gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL); CL_ERR_CHECK(err); viennacl::ocl::finish(); //fill the cpu_matrix: unsigned int data_index = 0; for (unsigned int row = 1; row <= gpu_matrix.size1(); ++row) { while (data_index < row_buffer[row]) { if (col_buffer[data_index] >= gpu_matrix.size1()) { std::cerr << "ViennaCL encountered invalid data at colbuffer[" << data_index << "]: " << col_buffer[data_index] << std::endl; return; } if (elements[data_index] != static_cast<SCALARTYPE>(0.0)) cpu_matrix(row-1, col_buffer[data_index]) = elements[data_index]; ++data_index; } } } }
// Computes the gradient w.r.t. the layer input for one sample:
// col_buff = weights^T * output_grad, followed by col2im back into `input`.
// Dispatches to CUDA or GreenTea/OpenCL depending on the device backend.
//
// output holds the top gradient (device pointer, element offset output_off);
// input receives the bottom gradient (element offset input_off). For 1x1
// convolutions the GEMM writes straight into `input` and col2im is skipped.
//
// NOTE(review): this method reads this->device_context_ while
// forward_gpu_gemm above uses this->device_ — looks like two revisions of
// the same class; verify against the actual member declaration.
void BaseConvolutionLayer<Dtype>::backward_gpu_gemm(const Dtype* output,
    const int output_off, const Dtype* weights, Dtype* input,
    const int input_off) {
  Dtype* col_buff = col_buffer()->mutable_gpu_data();
  if (is_1x1_) {
    // 1x1: no im2col/col2im needed, GEMM result goes directly to input.
    col_buff = input;
  }
  if (this->device_context_->backend() == BACKEND_CUDA) {
#ifdef USE_CUDA
    // weights^T (transposed) times the per-group slice of the top gradient.
    for (int g = 0; g < group_; ++g) {
      caffe_gpu_gemm<Dtype>(
          CblasTrans, CblasNoTrans, kernel_dim_ / group_,
          conv_out_spatial_dim_, conv_out_channels_ / group_, (Dtype) 1.,
          weights + weight_offset_ * g,
          output + output_off + output_offset_ * g, (Dtype) 0.,
          col_buff + (is_1x1_ ? input_off : 0) + col_offset_ * g);
    }
    if (!is_1x1_) {
      // Fold the column buffer back into image layout at input + input_off.
      conv_col2im_gpu(col_buff, input + input_off);
    }
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
    // OpenCL path: cl_mem handles take base object + element offset
    // separately instead of pointer arithmetic.
    for (int g = 0; g < group_; ++g) {
      greentea_gpu_gemm<Dtype>(this->device_context_->id(), CblasTrans,
                               CblasNoTrans, kernel_dim_ / group_,
                               conv_out_spatial_dim_,
                               conv_out_channels_ / group_, (Dtype) 1.,
                               (cl_mem) weights, weight_offset_ * g,
                               (cl_mem) output,
                               output_off + output_offset_ * g, (Dtype) 0.,
                               (cl_mem) col_buff,
                               (is_1x1_ ? input_off : 0) + col_offset_ * g);
    }
    if (!is_1x1_) {
      greentea_conv_col2im_gpu(col_buff, 0, input, input_off);
    }
#endif  // USE_GREENTEA
  }
}
void copy(viennashe::math::sparse_matrix<NumericT> const & assembled_matrix,
          viennacl::compressed_matrix<NumericT> & vcl_matrix)
{
  // Converts an assembled viennashe sparse matrix into ViennaCL's
  // compressed (CSR) format and uploads it via compressed_matrix::set().
  typedef typename viennashe::math::sparse_matrix<NumericT>::const_iterator2 AlongRowIterator;
  typedef typename viennashe::math::sparse_matrix<NumericT>::row_type        RowType;

  std::size_t const nonzeros = assembled_matrix.nnz();

  // Typesafe host arrays match the element type of the device handles.
  viennacl::backend::typesafe_host_array<unsigned int> row_buffer(vcl_matrix.handle1(), assembled_matrix.size1() + 1);
  viennacl::backend::typesafe_host_array<unsigned int> col_buffer(vcl_matrix.handle2(), nonzeros);
  std::vector<NumericT> elements(nonzeros);

  // Serialize row by row into the CSR triple (row offsets, columns, values).
  std::size_t entry_index = 0;
  for (std::size_t row = 0; row < assembled_matrix.size1(); ++row)
  {
    row_buffer.set(row, entry_index);

    RowType const & current_row = assembled_matrix.row(row);
    for (AlongRowIterator entry_it = current_row.begin(); entry_it != current_row.end(); ++entry_it)
    {
      col_buffer.set(entry_index, entry_it->first);   // column index
      elements[entry_index] = entry_it->second;       // value
      ++entry_index;
    }
  }
  row_buffer.set(assembled_matrix.size1(), entry_index);  // closing sentinel

  vcl_matrix.set(row_buffer.get(),
                 col_buffer.get(),
                 &elements[0],
                 assembled_matrix.size1(),
                 assembled_matrix.size2(),
                 nonzeros);
}
/**
 * Pivot the rows, creating a new resultset
 *
 * Call dbpivot() immediately after dbresults(). It calls dbnextrow() as long as
 * it returns REG_ROW, transforming the results into a cross-tab report.
 * dbpivot() modifies the metadata such that DB-Library can be used transparently:
 * retrieve the rows as usual with dbnumcols(), dbnextrow(), etc.
 *
 * @dbproc, our old friend
 * @nkeys the number of left-edge columns to group by
 * @keys an array of left-edge columns to group by
 * @ncols the number of top-edge columns to group by
 * @cols an array of top-edge columns to group by
 * @func the aggregation function to use
 * @val the number of the column to which @func is applied
 *
 * @returns the return code from the final call to dbnextrow().
 * Success is normally indicated by NO_MORE_ROWS.
 */
RETCODE
dbpivot(DBPROCESS *dbproc, int nkeys, int *keys, int ncols, int *cols, DBPIVOT_FUNC func, int val)
{
	enum { logalot = 1 };
	struct pivot_t P, *pp;
	struct agg_t input, *pout = NULL;
	struct key_t *pacross;
	struct metadata_t *metadata, *pmeta;
	size_t i, nmeta = 0;

	tdsdump_log(TDS_DBG_FUNC, "dbpivot(%p, %d,%p, %d,%p, %p, %d)\n", dbproc, nkeys, keys, ncols, cols, func, val);
	if (logalot) {
		/* dump the key/col lists; the assert bounds the sprintf output */
		char buffer[1024] = {'\0'}, *s = buffer;
		const static char *names[2] = { "\tkeys (down)", "\n\tcols (across)" };
		int *p = keys, *pend = p + nkeys;

		for (i=0; i < 2; i++) {
			const char *sep = "";
			s += sprintf(s, "%s: ", names[i]);
			for ( ; p < pend; p++) {
				s += sprintf(s, "%s%d", sep, *p);
				sep = ", ";
			}
			p = cols;
			pend = p + ncols;
			assert(s < buffer + sizeof(buffer));
		}
		tdsdump_log(TDS_DBG_FUNC, "%s\n", buffer);
	}

	memset(&input, 0, sizeof(input));

	/* look up (or create) the pivot bound to this dbproc */
	P.dbproc = dbproc;
	if ((pp = tds_find(&P, pivots, npivots, sizeof(*pivots), pivot_key_equal)) == NULL ) {
		pp = realloc(pivots, (1 + npivots) * sizeof(*pivots));
		if (!pp)
			return FAIL;
		pivots = pp;
		pp += npivots++;
	} else {
		/* reuse the slot: release data from a prior pivot */
		agg_free(pp->output);
		key_free(pp->across);
	}
	memset(pp, 0, sizeof(*pp));

	/* bind the row-key (left edge) columns */
	if ((input.row_key.keys = calloc(nkeys, sizeof(*input.row_key.keys))) == NULL)
		return FAIL;
	input.row_key.nkeys = nkeys;
	for (i=0; i < nkeys; i++) {
		int type = dbcoltype(dbproc, keys[i]);
		int len = dbcollen(dbproc, keys[i]);
		assert(type && len);

		col_init(input.row_key.keys+i, type, len);
		if (FAIL == dbbind(dbproc, keys[i], bind_type(type), input.row_key.keys[i].len, col_buffer(input.row_key.keys+i)))
			return FAIL;
		if (FAIL == dbnullbind(dbproc, keys[i], &input.row_key.keys[i].null_indicator))
			return FAIL;
	}

	/* bind the col-key (top edge) columns */
	if ((input.col_key.keys = calloc(ncols, sizeof(*input.col_key.keys))) == NULL)
		return FAIL;
	input.col_key.nkeys = ncols;
	for (i=0; i < ncols; i++) {
		int type = dbcoltype(dbproc, cols[i]);
		int len = dbcollen(dbproc, cols[i]);
		assert(type && len);

		col_init(input.col_key.keys+i, type, len);
		if (FAIL == dbbind(dbproc, cols[i], bind_type(type), input.col_key.keys[i].len, col_buffer(input.col_key.keys+i)))
			return FAIL;
		if (FAIL == dbnullbind(dbproc, cols[i], &input.col_key.keys[i].null_indicator))
			return FAIL;
	}

	/* value: the column the aggregation function is applied to */
	{
		int type = dbcoltype(dbproc, val);
		int len = dbcollen(dbproc, val);
		assert(type && len);

		col_init(&input.value, type, len);
		if (FAIL == dbbind(dbproc, val, bind_type(type), input.value.len, col_buffer(&input.value)))
			return FAIL;
		if (FAIL == dbnullbind(dbproc, val, &input.value.null_indicator))
			return FAIL;
	}

	/* consume every row, aggregating into pp->output keyed by (row_key, col_key) */
	while ((pp->status = dbnextrow(dbproc)) == REG_ROW) {
		/* add to unique list of crosstab columns */
		if ((pacross = tds_find(&input.col_key, pp->across, pp->nacross, sizeof(*pp->across), key_equal)) == NULL ) {
			pacross = realloc(pp->across, (1 + pp->nacross) * sizeof(*pp->across));
			if (!pacross)
				return FAIL;
			pp->across = pacross;
			pacross += pp->nacross++;
			key_cpy(pacross, &input.col_key);
		}
		assert(pp->across);

		/* find (or append) the aggregate cell for this row/col key pair */
		if ((pout = tds_find(&input, pp->output, pp->nout, sizeof(*pp->output), agg_equal)) == NULL ) {
			pout = realloc(pp->output, (1 + pp->nout) * sizeof(*pp->output));
			if (!pout)
				return FAIL;
			pp->output = pout;
			pout += pp->nout++;

			if ((pout->row_key.keys = calloc(input.row_key.nkeys, sizeof(*pout->row_key.keys))) == NULL)
				return FAIL;
			key_cpy(&pout->row_key, &input.row_key);

			if ((pout->col_key.keys = calloc(input.col_key.nkeys, sizeof(*pout->col_key.keys))) == NULL)
				return FAIL;
			key_cpy(&pout->col_key, &input.col_key);

			col_init(&pout->value, input.value.type, input.value.len);
		}

		func(&pout->value, &input.value);
	}

	/* Mark this proc as pivoted, so that dbnextrow() sees it when the application calls it */
	pp->dbproc = dbproc;
	pp->dbresults_state = dbproc->dbresults_state;
	dbproc->dbresults_state = pp->output < pout? _DB_RES_RESULTSET_ROWS : _DB_RES_RESULTSET_EMPTY;

	/*
	 * Initialize new metadata
	 */
	nmeta = input.row_key.nkeys + pp->nacross;
	metadata = calloc(nmeta, sizeof(*metadata));
	if (!metadata)		/* bug fix: calloc was unchecked before being dereferenced below */
		return FAIL;

	assert(pp->across || pp->nacross == 0);

	/* key columns are passed through as-is, verbatim */
	for (i=0; i < input.row_key.nkeys; i++) {
		assert(i < nkeys);
		metadata[i].name = strdup(dbcolname(dbproc, keys[i]));
		metadata[i].pacross = NULL;
		col_cpy(&metadata[i].col, input.row_key.keys+i);
	}

	/* pivoted columns are found in the "across" data */
	for (i=0, pmeta = metadata + input.row_key.nkeys; i < pp->nacross; i++) {
		struct col_t col;
		col_init(&col, SYBFLT8, sizeof(double));
		assert(pmeta + i < metadata + nmeta);
		pmeta[i].name = make_col_name(pp->across+i);
		assert(pp->across);
		pmeta[i].pacross = pp->across + i;
		col_cpy(&pmeta[i].col, pp->nout? &pp->output[0].value : &col);
	}

	if (!reinit_results(dbproc->tds_socket, nmeta, metadata)) {
		return FAIL;
	}

	return SUCCEED;

#if 0
	for (pp->pout=pp->output; pp->pout < pp->output + pp->nout; pp->pout++) {
		char name[256] = {0};

		assert(pp->pout->col_key.keys[0].len < sizeof(name));
		memset(name, '\0', sizeof(name));
		memcpy(name, pp->pout->col_key.keys[0].s, pp->pout->col_key.keys[0].len),
		printf("%5d  %-30s  %5d\n", pp->pout->row_key.keys[0].i, name, pp->pout->value.i );
	}
	exit(1);
#endif
}
/*
 * Returns the next row of a pivoted resultset to the application.
 *
 * Scans pp->output for the first aggregate whose row_key.keys is still
 * non-NULL (NULL marks a consumed cell), copies its row key as the
 * "candidate", and fills each bound column: pass-through key columns come
 * straight from the candidate's row key, while cross-tab columns are looked
 * up by matching the column's key (stashed in pcol->bcp_terminator by the
 * pivot setup) against the remaining aggregates. Missing cells get a NULL
 * via dbgetnull(). Returns REG_ROW, or NO_MORE_ROWS when every aggregate
 * has been consumed.
 */
STATUS
dbnextrow_pivoted(DBPROCESS *dbproc, struct pivot_t *pp)
{
	int i;
	struct agg_t candidate, *pout;

	assert(pp);
	assert(dbproc && dbproc->tds_socket);
	assert(dbproc->tds_socket->res_info);
	assert(dbproc->tds_socket->res_info->columns || 0 == dbproc->tds_socket->res_info->num_cols);

	/* find the first not-yet-consumed output row (keys == NULL means used) */
	for (pout = pp->output; pout < pp->output + pp->nout; pout++) {
		if (pout->row_key.keys != NULL)
			break;
	}

	if (pout == pp->output + pp->nout) {
		/* all aggregates delivered; end of this resultset */
		dbproc->dbresults_state = _DB_RES_NEXT_RESULT;
		return NO_MORE_ROWS;
	}

	memset(&candidate, 0, sizeof(candidate));
	key_cpy(&candidate.row_key, &pout->row_key);

	/* "buffer_transfer_bound_data" */
	for (i = 0; i < dbproc->tds_socket->res_info->num_cols; i++) {
		struct col_t *pval = NULL;
		TDSCOLUMN *pcol = dbproc->tds_socket->res_info->columns[i];
		assert(pcol);

		/* update the application's null indicator, if bound */
		if (pcol->column_nullbind) {
			if (pcol->column_cur_size < 0) {
				*(DBINT *)(pcol->column_nullbind) = -1;
			} else {
				*(DBINT *)(pcol->column_nullbind) = 0;
			}
		}

		if (!pcol->column_varaddr) {
			fprintf(stderr, "no pcol->column_varaddr in col %d\n", i);
			continue;
		}

		/* find column in output */
		if (pcol->bcp_terminator == NULL) {
			/* not a cross-tab column: value comes from the row key itself.
			 * NOTE(review): assumes key columns occupy the leading result
			 * columns so index i is valid into row_key.keys — confirm. */
			pval = &candidate.row_key.keys[i];
		} else {
			/* cross-tab column: bcp_terminator holds this column's key_t */
			struct agg_t *pcan;
			key_cpy(&candidate.col_key, (struct key_t *) pcol->bcp_terminator);
			if ((pcan = tds_find(&candidate, pout, pp->output + pp->nout - pout,
						sizeof(*pp->output), agg_next)) != NULL) {
				/* flag this output as used */
				pout->row_key.keys = NULL;
				pval = &pcan->value;
			}
		}

		if (!pval || col_null(pval)) {
			/* nothing in output for this x,y location */
			dbgetnull(dbproc, pcol->column_bindtype, pcol->column_bindlen,
				  (BYTE *) pcol->column_varaddr);
			continue;
		}

		assert(pval);

#if 0
		printf("\ncopying col %d, type %d/%d, len %d to %p ", i, pval->type, pcol->column_type, pval->len, pcol->column_varaddr);
		switch (pval->type) {
		case 48:
			printf("value %d", (int)pval->ti);
			break;
		case 56:
			printf("value %d", (int)pval->si);
			break;
		}
		printf("\n");
#endif
		/* expose the aggregate's buffer as the column's current data,
		 * then convert/copy into the application's bound variable */
		pcol->column_size = pval->len;
		pcol->column_data = col_buffer(pval);

		copy_data_to_host_var(	dbproc,
					pval->type,
					col_buffer(pval),
					pval->len,
					(BYTE *) pcol->column_varaddr,
					pcol->column_bindlen,
					pcol->column_bindtype,
					(DBINT*) pcol->column_nullbind );
	}

	return REG_ROW;
}