void qrfactor(matrix & Q, matrix & R, matrix & betas, const matrix & A){ int i, j, k, l, N, M; i = 0; j = 0; k = 0; l = 0; N = 0; M = 0; fpp beta, temp; beta = 0.0; temp = 0.0; N = A.get_rows(); M = A.get_cols(); matrix bigV(N,1), temp_row(N,1); zeros(bigV); zeros(temp_row); submatrix x, v; if((Q.get_rows() != N) || (Q.get_cols() != M) ||(R.get_rows() != N) || (R.get_cols() != M) || (betas.get_rows() != A.get_rows()) || (betas.get_cols() != 1)){ std::cerr << "QR dimensions incompatible! Q(" << Q.get_rows() << "," << Q.get_cols() << "), R(" << R.get_rows() << "," << R.get_cols() << "), betas:(" << betas.get_rows() << "," << betas.get_cols() << "), A:(" << A.get_rows() << "," << A.get_cols() << ")." << std::endl; exit(-1); } R = A; zeros(Q); for(i = 0; i < N; i++){ Q(i,i) = 1.0; } for(i = 0; i < N-1; i++){ x.subcreate(R, i, i, N-i, 1); v.subcreate(bigV, i, 0, N-i, 1); house(v,beta,x); for(k = i; k < M; k++){ for(j = i; j < N; j++){ temp = 0.0; for(l = i; l < N; l++){ temp += beta*v(l-i,0)*v(j-i,0)*R(l,k); } temp_row(j,0) = R(j,k) - temp; } for(l = i; l < N; l++){ R(l,k) = temp_row(l,0); } } for(k = 0; k < M; k++){ for(j = i; j < N; j++){ temp = 0.0; for(l = i; l < N; l++){ temp += beta*v(l-i,0)*v(j-i,0)*Q(l,k); } temp_row(j,0) = Q(j,k) - temp; } for(l = i; l < N; l++){ Q(l,k) = temp_row(l,0); } } betas(i,0) = beta; } }
/**
 * Product of a variable matrix and a constant matrix.
 *
 * The numeric product value(m1) * cm2 is computed into a plain dmatrix and
 * wrapped with nograd_assign(). The positions of m1 and of the result, plus
 * the value and position of cm2, are then saved between the "TEST1" and
 * "TEST6" identifier strings, and dmcm_prod is registered on
 * gradient_structure::GRAD_STACK1 (presumably the routine that later
 * propagates derivatives for this product -- confirm against its definition).
 *
 * \param m1   Left operand. Its column bounds must equal the row bounds of
 *             cm2, otherwise a message is written to cerr and ad_exit(21)
 *             is called.
 * \param cm2  Right operand (constant).
 * \return     dvar_matrix indexed m1.rowmin()..m1.rowmax() by
 *             cm2.colmin()..cm2.colmax().
 * \throws std::bad_alloc  rethrown unchanged after logging.
 */
dvar_matrix operator*(const dvar_matrix& m1, const dmatrix& cm2)
{
  if (m1.colmin() != cm2.rowmin() || m1.colmax() != cm2.rowmax())
  {
    cerr << " Incompatible array bounds in "
      "dmatrix operator*(const dvar_matrix& x, const dmatrix& m)\n";
    ad_exit(21);
  }

  dmatrix cm1=value(m1);
  dmatrix tmp(m1.rowmin(),m1.rowmax(), cm2.colmin(), cm2.colmax());

#ifdef OPT_LIB
  const size_t rowsize = (size_t)cm2.rowsize();
#else
  const int _rowsize = cm2.rowsize();
  assert(_rowsize > 0);
  const size_t rowsize = (size_t)_rowsize;
#endif

  try
  {
    // Contiguous copy of one column of cm2, reused for every row of cm1.
    // It is indexed with an explicit offset rather than by shifting the
    // pointer below the start of the allocation, which is undefined.
    double* temp_col = new double[rowsize];
    const int row_offset = cm2.rowmin();

    for (int j=cm2.colmin(); j<=cm2.colmax(); j++)
    {
      for (int k=cm2.rowmin(); k<=cm2.rowmax(); k++)
      {
        temp_col[k - row_offset] = cm2.elem(k,j);
      }

      for (int i=cm1.rowmin(); i<=cm1.rowmax(); i++)
      {
        double sum=0.0;
        dvector& temp_row = cm1(i);
        // The bounds check above guarantees that the column range of cm1
        // equals the row range of cm2, so k - row_offset stays in range.
        for (int k=cm1.colmin(); k<=cm1.colmax(); k++)
        {
          sum+=temp_row(k) * temp_col[k - row_offset];
        }
        tmp(i,j)=sum;
      }
    }

    delete [] temp_col;
  }
  catch (const std::bad_alloc&)
  {
    cerr << "Error[" << __FILE__ << ':' << __LINE__
         << "]: Unable to allocate array.\n";
    // A bare throw keeps the dynamic type of the exception; `throw e;`
    // would rethrow a copy sliced to std::bad_alloc.
    throw;
  }

  dvar_matrix vtmp=nograd_assign(tmp);

  // Only the position of m1 is saved; its value is not.
  save_identifier_string("TEST1");
  m1.save_dvar_matrix_position();
  cm2.save_dmatrix_value();
  cm2.save_dmatrix_position();
  vtmp.save_dvar_matrix_position();
  save_identifier_string("TEST6");
  gradient_structure::GRAD_STACK1->
    set_gradient_stack(dmcm_prod);
  return vtmp;
}
/**
 * Orders the columns of every BTF block of M independently and records a
 * per-block nnz and work figure.
 *
 * When Options.incomplete is BASKER_TRUE, p is set to the natural ordering
 * and every block gets nnz = work = 1.
 *
 * Otherwise each block b spans columns btf_tabs(b) .. btf_tabs(b+1)-1:
 *  - blocks with fewer than 3 columns skip the ordering call; p receives
 *    the column index minus M.scol, work is blk_size^3 and nnz is
 *    .5*blk_size^2 + blk_size;
 *  - larger blocks are copied into a local column-pointer / row-index pair
 *    (rows above the block are dropped, the rest are rebased to the block
 *    start) and handed to BaskerSSWrapper<Int>::amd_order, whose
 *    permutation is scattered into p and whose two double outputs are
 *    stored in btf_nnz(b) and btf_work(b).
 *
 * \param M         Matrix, read through col_ptr / row_idx / ncol / nnz / scol.
 * \param p         Output permutation, indexed by global column.
 * \param btf_nnz   Output, one entry per block.
 * \param btf_work  Output, one entry per block.
 */
void Basker<Int,Entry,Exe_Space>::btf_blk_amd
(
 BASKER_MATRIX &M,
 INT_1DARRAY p,
 INT_1DARRAY btf_nnz,
 INT_1DARRAY btf_work
)
{
  if(Options.incomplete == BASKER_TRUE)
  {
    // The original author notes that AMD behaves badly with incomplete
    // ILUK (zeros on the diagonal), so the natural ordering is returned.
    for(Int col = 0; col < M.ncol; col++)
    {
      p(col) = col;
    }
    // BTF is not supported in the iluk path; use placeholder costs.
    for(Int blk = 0; blk < btf_nblks; blk++)
    {
      btf_nnz(blk)  = 1;
      btf_work(blk) = 1;
    }
    return;
  }

  // Scratch storage big enough for the largest possible block. The
  // per-block loop is serial (the original notes it should be parallel).
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int blk = 0; blk < btf_nblks; blk++)
  {
    const Int blk_begin = btf_tabs(blk);
    const Int blk_end   = btf_tabs(blk+1);
    Int blk_size        = blk_end - blk_begin;

    if(blk_size < 3)
    {
      // Tiny block: keep the columns in place.
      for(Int off = 0; off < blk_size; ++off)
      {
        p(off+blk_begin) = blk_begin+off-M.scol;
      }
      btf_work(blk) = blk_size*blk_size*blk_size;
      btf_nnz(blk)  = (.5*(blk_size*blk_size) + blk_size);
      continue;
    }

    INT_1DARRAY tempp;
    MALLOC_INT_1DARRAY(tempp, blk_size+1);

    // Copy the block into temp_col / temp_row with indices rebased so the
    // block starts at row 0 and column 0.
    Int nnz = 0;
    temp_col(0) = 0;
    for(Int k = blk_begin; k < blk_end; k++)
    {
      for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
      {
        const Int row = M.row_idx(i);
        if(row >= blk_begin)
        {
          temp_row(nnz) = row - blk_begin;
          nnz++;
        }
      }
      temp_col(k - blk_begin + 1) = nnz;
    }

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("col_ptr: ");
    for(Int i = 0 ; i < blk_size+1; i++)
    {
      printf("%d, ", temp_col(i));
    }
    printf("\n");
    printf("row_idx: ");
    for(Int i = 0; i < nnz; i++)
    {
      printf("%d, ", temp_row(i));
    }
    printf("\n");
#endif

    double blk_nnz  = 0;
    double blk_work = 0;
    BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                    &(temp_row(0)), &(tempp(0)),
                                    blk_nnz, blk_work);
    btf_nnz(blk)  = blk_nnz;
    btf_work(blk) = blk_work;

#ifdef BASKER_DEBUG_ORDER_AMD
    printf("blk: %d order: \n", blk);
    for(Int ii = 0; ii < blk_size; ii++)
    {
      printf("%d, ", tempp(ii));
    }
#endif

    // Scatter the block-local permutation into the global one.
    for(Int off = 0; off < blk_size; off++)
    {
      p(tempp(off)+blk_begin) = off+blk_begin;
    }

    FREE_INT_1DARRAY(tempp);
  }

#ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
  {
    printf("%d, ", p(ii));
  }
  printf("\n");
#endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
/**
 * Orders the columns of each BTF block of M independently, starting at
 * block btf_tabs_offset.
 *
 * Block b spans columns btf_tabs(b) .. btf_tabs(b+1)-1. Blocks with fewer
 * than 3 columns are left in place (p gets the column index minus M.scol).
 * Larger blocks are extracted into a local column-pointer / row-index
 * pair -- entries in rows above the block are skipped and the remaining
 * row indices are rebased to the block start -- and passed to
 * BaskerSSWrapper<Int>::amd_order; the resulting block-local permutation
 * is scattered into p.
 *
 * \param M  Matrix, read through col_ptr / row_idx / ncol / nnz / scol.
 * \param p  Output permutation, indexed by global column.
 */
void Basker<Int,Entry,Exe_Space>::blk_amd(BASKER_MATRIX &M, INT_1DARRAY p)
{
  // Work arrays sized for the whole matrix and shared by every block.
  // Blocks are processed one after another (the original flags this as a
  // candidate for parallelisation).
  INT_1DARRAY temp_col;
  MALLOC_INT_1DARRAY(temp_col, M.ncol+1);
  INT_1DARRAY temp_row;
  MALLOC_INT_1DARRAY(temp_row, M.nnz);

  for(Int b = btf_tabs_offset; b < btf_nblks; b++)
  {
    const Int first_col = btf_tabs(b);
    const Int past_col  = btf_tabs(b+1);
    Int blk_size        = past_col - first_col;

    if(blk_size < 3)
    {
      // Nothing to reorder in a 0-, 1- or 2-column block.
      for(Int j = 0; j < blk_size; ++j)
      {
        p(j+first_col) = first_col+j-M.scol;
      }
    }
    else
    {
      INT_1DARRAY tempp;
      MALLOC_INT_1DARRAY(tempp, blk_size+1);

      // Build the block in compressed-column form, local to the block.
      Int nnz      = 0;
      Int next_col = 1;
      temp_col(0)  = 0;
      for(Int k = first_col; k < past_col; k++)
      {
        for(Int i = M.col_ptr(k); i < M.col_ptr(k+1); i++)
        {
          if(M.row_idx(i) < first_col)
          {
            continue;  // entry lies above the diagonal block
          }
          temp_row(nnz) = M.row_idx(i) - first_col;
          nnz++;
        }
        temp_col(next_col) = nnz;
        next_col++;
      }

#ifdef BASKER_DEBUG_ORDER_AMD
      printf("col_ptr: ");
      for(Int i = 0 ; i < blk_size+1; i++)
      {
        printf("%d, ", temp_col(i));
      }
      printf("\n");
      printf("row_idx: ");
      for(Int i = 0; i < nnz; i++)
      {
        printf("%d, ", temp_row(i));
      }
      printf("\n");
#endif

      BaskerSSWrapper<Int>::amd_order(blk_size, &(temp_col(0)),
                                      &(temp_row(0)), &(tempp(0)));

#ifdef BASKER_DEBUG_ORDER_AMD
      printf("blk: %d order: \n", b);
      for(Int ii = 0; ii < blk_size; ii++)
      {
        printf("%d, ", tempp(ii));
      }
#endif

      // Merge the block-local ordering into the global permutation.
      for(Int j = 0; j < blk_size; j++)
      {
        p(tempp(j)+first_col) = j+first_col;
      }

      FREE_INT_1DARRAY(tempp);
    }
  }

#ifdef BASKER_DEBUG_AMD_ORDER
  printf("blk amd final order\n");
  for(Int ii = 0; ii < M.ncol; ii++)
  {
    printf("%d, ", p(ii));
  }
  printf("\n");
#endif

  FREE_INT_1DARRAY(temp_col);
  FREE_INT_1DARRAY(temp_row);
}//end blk_amd()
void SeparableConvolution2d(const RowMatrixXf& image, const Eigen::RowVectorXf& kernel_x, const Eigen::RowVectorXf& kernel_y, const BorderType& border_type, RowMatrixXf* out) { const int full_size = kernel_x.size(); const int half_size = full_size / 2; out->resize(image.rows(), image.cols()); // Convolving a vertical filter across rows is the same thing as transpose // multiply i.e. kernel_y^t * rows. This will give us the convoled value for // each row. However, care must be taken at the top and bottom borders. const RowVectorXf reverse_kernel_y = kernel_y.reverse(); if (border_type == REFLECT) { for (int i = 0; i < half_size; i++) { const int forward_size = i + half_size + 1; const int reverse_size = full_size - forward_size; out->row(i) = kernel_y.tail(forward_size) * image.block(0, 0, forward_size, image.cols()) + reverse_kernel_y.tail(reverse_size) * image.block(1, 0, reverse_size, image.cols()); // Apply the same technique for the end rows. // TODO(csweeney): Move this to its own loop for cache exposure? out->row(image.rows() - i - 1) = kernel_y.head(forward_size) * image.block(image.rows() - forward_size, 0, forward_size, image.cols()) + reverse_kernel_y.head(reverse_size) * image.block(image.rows() - reverse_size - 1, 0, reverse_size, image.cols()); } } else { // Perform border with REPLICATE as the option. for (int i = 0; i < half_size; i++) { const int forward_size = i + half_size + 1; const int reverse_size = full_size - forward_size; out->row(i) = kernel_y.tail(forward_size) * image.block(0, 0, forward_size, image.cols()) + reverse_kernel_y.tail(reverse_size) * image.row(0).replicate(reverse_size, 1); // Apply the same technique for the end rows. out->row(image.rows() - i - 1) = kernel_y.head(forward_size) * image.block(image.rows() - forward_size, 0, forward_size, image.cols()) + reverse_kernel_y.head(reverse_size) * image.row(image.rows() - 1).replicate(reverse_size, 1); } } // Applying the rest of the y filter. 
#ifdef AKAZE_USE_OPENMP #pragma omp parallel for #endif for (int row = half_size; row < image.rows() - half_size; row++) { out->row(row) = kernel_y * image.block(row - half_size, 0, full_size, out->cols()); } // Convolving with the horizontal filter is easy. Rather than using the kernel // as a sliding indow, we use the row pixels as a sliding window around the // filter. We prepend and append the proper border values so that we are sure // to end up with the correct convolved values. if (border_type == REFLECT) { RowVectorXf temp_row(image.cols() + full_size - 1); #ifdef AKAZE_USE_OPENMP #pragma omp parallel for firstprivate(temp_row) #endif for (int row = 0; row < out->rows(); row++) { temp_row.head(half_size) = out->row(row).segment(1, half_size).reverse(); temp_row.segment(half_size, image.cols()) = out->row(row); temp_row.tail(half_size) = out->row(row) .segment(image.cols() - 1 - half_size, half_size) .reverse(); // Convolve the row. We perform the first step here explicitly so that we // avoid setting the row equal to zero. out->row(row) = kernel_x(0) * temp_row.head(image.cols()); for (int i = 1; i < full_size; i++) { out->row(row) += kernel_x(i) * temp_row.segment(i, image.cols()); } } } else { RowVectorXf temp_row(image.cols() + full_size - 1); #ifdef AKAZE_USE_OPENMP #pragma omp parallel for firstprivate(temp_row) #endif for (int row = 0; row < out->rows(); row++) { temp_row.head(half_size).setConstant((*out)(row, 0)); temp_row.segment(half_size, image.cols()) = out->row(row); temp_row.tail(half_size).setConstant((*out)(row, out->cols() - 1)); // Convolve the row. We perform the first step here explicitly so that we // avoid setting the row equal to zero. out->row(row) = kernel_x(0) * temp_row.head(image.cols()); for (int i = 1; i < full_size; i++) { out->row(row) += kernel_x(i) * temp_row.segment(i, image.cols()); } } } }