/// Sparse matrix-vector product in double precision through cuSPARSE (HYB format).
/**
 * Calls cusparseDhybmv with the non-transposed operation, i.e.
 * y = alpha * A * x + beta * y.
 *
 * \param x      input vector.
 * \param y      output vector.
 * \param alpha  scaling applied to the product (defaults to 1).
 * \param append when true beta is 1 (the product is added to y),
 *               otherwise beta is 0.
 */
void mul(const device_vector<double> &x, device_vector<double> &y,
        double alpha = 1, bool append = false) const
{
    double beta = 0.0;
    if (append) {
        beta = 1.0;
    }

    cuda_check(
            cusparseDhybmv(
                handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                &alpha, desc.get(), mat.get(),
                x.raw_ptr(), &beta, y.raw_ptr()
                )
            );
}
/// Sparse matrix-vector product in single precision through cuSPARSE (HYB format).
/**
 * Calls cusparseShybmv with the non-transposed operation, i.e.
 * y = alpha * A * x + beta * y.
 *
 * \param x      input vector.
 * \param y      output vector.
 * \param alpha  scaling applied to the product (defaults to 1).
 * \param append when true beta is 1 (the product is added to y),
 *               otherwise beta is 0.
 */
void mul(const device_vector<float> &x, device_vector<float> &y,
        float alpha = 1, bool append = false) const
{
    float beta = 0.0f;
    if (append) {
        beta = 1.0f;
    }

    cuda_check(
            cusparseShybmv(
                handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                &alpha, desc.get(), mat.get(),
                x.raw_ptr(), &beta, y.raw_ptr()
                )
            );
}
/* Copy the texture info table to the device, but only when it has been
 * flagged as changed since the last copy. The flag is cleared afterwards. */
void load_texture_info()
{
	if (!need_texture_info) {
		return;
	}

	texture_info.copy_to_device();
	need_texture_info = false;
}
/* Register a texture with the device.
 *
 * Memory without interpolation is handed to kernel_tex_copy() as a plain data
 * texture. Anything else is treated as an image texture: its slot index is
 * parsed from the digits following the last '_' of the "__tex_image..." name
 * and the matching texture_info entry is filled in.
 *
 * In both cases the host pointer is reused as the device pointer and the
 * allocation is accounted for in the statistics. */
void tex_alloc(device_memory &mem)
{
	VLOG(1) << "Texture allocate: " << mem.name << ", "
	        << string_human_readable_number(mem.memory_size()) << " bytes. ("
	        << string_human_readable_size(mem.memory_size()) << ")";

	const bool is_image_texture = !(mem.interpolation == INTERPOLATION_NONE);

	if (is_image_texture) {
		/* Image texture: work out which slot it belongs to. */
		int flat_slot = 0;
		if (!string_startswith(mem.name, "__tex_image")) {
			assert(0);
		}
		else {
			int underscore = string(mem.name).rfind("_");
			flat_slot = atoi(mem.name + underscore + 1);
		}

		/* Grow the table with some head room, so that not every new slot
		 * triggers a re-allocation. */
		if (flat_slot >= texture_info.size()) {
			texture_info.resize(flat_slot + 128);
		}

		TextureInfo &slot_info = texture_info[flat_slot];
		slot_info.data = (uint64_t)mem.host_pointer;
		slot_info.cl_buffer = 0;
		slot_info.interpolation = mem.interpolation;
		slot_info.extension = mem.extension;
		slot_info.width = mem.data_width;
		slot_info.height = mem.data_height;
		slot_info.depth = mem.data_depth;

		/* The table has to be copied to the device again. */
		need_texture_info = true;
	}
	else {
		/* Data texture. */
		kernel_tex_copy(&kernel_globals, mem.name, mem.host_pointer, mem.data_size);
	}

	mem.device_pointer = (device_ptr)mem.host_pointer;
	mem.device_size = mem.memory_size();
	stats.mem_alloc(mem.device_size);
}
bool ImageManager::file_load_float_image(Image *img, device_vector<float4>& tex_img) { if(img->filename == "") return false; /* load image from file through OIIO */ ImageInput *in = ImageInput::create(img->filename); if(!in) return false; ImageSpec spec; if(!in->open(img->filename, spec)) { delete in; return false; } /* we only handle certain number of components */ int width = spec.width; int height = spec.height; int components = spec.nchannels; if(!(components == 1 || components == 3 || components == 4)) { in->close(); delete in; return false; } printf("loading float image: '%s' %dx%d\n", img->filename.c_str(), width, height); /* read RGBA pixels */ float *pixels = (float*)tex_img.resize(width, height); int scanlinesize = width*components*sizeof(float); in->read_image(TypeDesc::FLOAT, (uchar*)pixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); in->close(); delete in; if(components == 3) { for(int i = width*height-1; i >= 0; i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i*3+2]; pixels[i*4+1] = pixels[i*3+1]; pixels[i*4+0] = pixels[i*3+0]; } } else if(components == 1) { for(int i = width*height-1; i >= 0; i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i]; pixels[i*4+1] = pixels[i]; pixels[i*4+0] = pixels[i]; } } return true; }
bool ImageManager::file_load_image(Image *img, ImageDataType type, int texture_limit, device_vector<DeviceType>& tex_img) { const StorageType alpha_one = (FileFormat == TypeDesc::UINT8)? 255 : 1; ImageInput *in = NULL; int width, height, depth, components; if(!file_load_image_generic(img, &in, width, height, depth, components)) { return false; } /* Read RGBA pixels. */ vector<StorageType> pixels_storage; StorageType *pixels; const size_t max_size = max(max(width, height), depth); if(max_size == 0) { /* Don't bother with invalid images. */ return false; } if(texture_limit > 0 && max_size > texture_limit) { pixels_storage.resize(((size_t)width)*height*depth*4); pixels = &pixels_storage[0]; } else { thread_scoped_lock device_lock(device_mutex); pixels = (StorageType*)tex_img.alloc(width, height, depth); } if(pixels == NULL) { /* Could be that we've run out of memory. */ return false; } bool cmyk = false; const size_t num_pixels = ((size_t)width) * height * depth; if(in) { StorageType *readpixels = pixels; vector<StorageType> tmppixels; if(components > 4) { tmppixels.resize(((size_t)width)*height*components); readpixels = &tmppixels[0]; } if(depth <= 1) { size_t scanlinesize = ((size_t)width)*components*sizeof(StorageType); in->read_image(FileFormat, (uchar*)readpixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); } else { in->read_image(FileFormat, (uchar*)readpixels); } if(components > 4) { size_t dimensions = ((size_t)width)*height; for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) { pixels[i*4+3] = tmppixels[i*components+3]; pixels[i*4+2] = tmppixels[i*components+2]; pixels[i*4+1] = tmppixels[i*components+1]; pixels[i*4+0] = tmppixels[i*components+0]; } tmppixels.clear(); } cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4; in->close(); delete in; } else { if(FileFormat == TypeDesc::FLOAT) { builtin_image_float_pixels_cb(img->filename, img->builtin_data, (float*)&pixels[0], num_pixels * components, 
img->builtin_free_cache); } else if(FileFormat == TypeDesc::UINT8) { builtin_image_pixels_cb(img->filename, img->builtin_data, (uchar*)&pixels[0], num_pixels * components, img->builtin_free_cache); } else { /* TODO(dingto): Support half for ImBuf. */ } } /* Check if we actually have a float4 slot, in case components == 1, * but device doesn't support single channel textures. */ bool is_rgba = (type == IMAGE_DATA_TYPE_FLOAT4 || type == IMAGE_DATA_TYPE_HALF4 || type == IMAGE_DATA_TYPE_BYTE4); if(is_rgba) { if(cmyk) { /* CMYK */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255; pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255; pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255; pixels[i*4+3] = alpha_one; } } else if(components == 2) { /* grayscale + alpha */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = pixels[i*2+1]; pixels[i*4+2] = pixels[i*2+0]; pixels[i*4+1] = pixels[i*2+0]; pixels[i*4+0] = pixels[i*2+0]; } } else if(components == 3) { /* RGB */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = alpha_one; pixels[i*4+2] = pixels[i*3+2]; pixels[i*4+1] = pixels[i*3+1]; pixels[i*4+0] = pixels[i*3+0]; } } else if(components == 1) { /* grayscale */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = alpha_one; pixels[i*4+2] = pixels[i]; pixels[i*4+1] = pixels[i]; pixels[i*4+0] = pixels[i]; } } if(img->use_alpha == false) { for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = alpha_one; } } } /* Make sure we don't have buggy values. */ if(FileFormat == TypeDesc::FLOAT) { /* For RGBA buffers we put all channels to 0 if either of them is not * finite. This way we avoid possible artifacts caused by fully changed * hue. 
*/ if(is_rgba) { for(size_t i = 0; i < num_pixels; i += 4) { StorageType *pixel = &pixels[i*4]; if(!isfinite(pixel[0]) || !isfinite(pixel[1]) || !isfinite(pixel[2]) || !isfinite(pixel[3])) { pixel[0] = 0; pixel[1] = 0; pixel[2] = 0; pixel[3] = 0; } } } else { for(size_t i = 0; i < num_pixels; ++i) { StorageType *pixel = &pixels[i]; if(!isfinite(pixel[0])) { pixel[0] = 0; } } } } /* Scale image down if needed. */ if(pixels_storage.size() > 0) { float scale_factor = 1.0f; while(max_size * scale_factor > texture_limit) { scale_factor *= 0.5f; } VLOG(1) << "Scaling image " << img->filename << " by a factor of " << scale_factor << "."; vector<StorageType> scaled_pixels; size_t scaled_width, scaled_height, scaled_depth; util_image_resize_pixels(pixels_storage, width, height, depth, is_rgba ? 4 : 1, scale_factor, &scaled_pixels, &scaled_width, &scaled_height, &scaled_depth); StorageType *texture_pixels; { thread_scoped_lock device_lock(device_mutex); texture_pixels = (StorageType*)tex_img.alloc(scaled_width, scaled_height, scaled_depth); } memcpy(texture_pixels, &scaled_pixels[0], scaled_pixels.size() * sizeof(StorageType)); } return true; }
bool ImageManager::file_load_half_image(Image *img, ImageDataType type, device_vector<T>& tex_img) { ImageInput *in = NULL; int width, height, depth, components; if(!file_load_image_generic(img, &in, width, height, depth, components)) return false; /* read RGBA pixels */ half *pixels = (half*)tex_img.resize(width, height, depth); if(pixels == NULL) { return false; } if(in) { half *readpixels = pixels; vector<half> tmppixels; if(components > 4) { tmppixels.resize(((size_t)width)*height*components); readpixels = &tmppixels[0]; } if(depth <= 1) { size_t scanlinesize = ((size_t)width)*components*sizeof(half); in->read_image(TypeDesc::HALF, (uchar*)readpixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); } else { in->read_image(TypeDesc::HALF, (uchar*)readpixels); } if(components > 4) { size_t dimensions = ((size_t)width)*height; for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) { pixels[i*4+3] = tmppixels[i*components+3]; pixels[i*4+2] = tmppixels[i*components+2]; pixels[i*4+1] = tmppixels[i*components+1]; pixels[i*4+0] = tmppixels[i*components+0]; } tmppixels.clear(); } in->close(); delete in; } #if 0 /* TODO(dingto): Support half for ImBuf. */ else { builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels); } #endif /* Check if we actually have a half4 slot, in case components == 1, but device * doesn't support single channel textures. 
*/ if(type == IMAGE_DATA_TYPE_HALF4) { size_t num_pixels = ((size_t)width) * height * depth; if(components == 2) { /* grayscale + alpha */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = pixels[i*2+1]; pixels[i*4+2] = pixels[i*2+0]; pixels[i*4+1] = pixels[i*2+0]; pixels[i*4+0] = pixels[i*2+0]; } } else if(components == 3) { /* RGB */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i*3+2]; pixels[i*4+1] = pixels[i*3+1]; pixels[i*4+0] = pixels[i*3+0]; } } else if(components == 1) { /* grayscale */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i]; pixels[i*4+1] = pixels[i]; pixels[i*4+0] = pixels[i]; } } if(img->use_alpha == false) { for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; } } } return true; }
bool ImageManager::file_load_float_image(Image *img, ImageDataType type, device_vector<T>& tex_img) { ImageInput *in = NULL; int width, height, depth, components; if(!file_load_image_generic(img, &in, width, height, depth, components)) return false; /* read RGBA pixels */ float *pixels = (float*)tex_img.resize(width, height, depth); if(pixels == NULL) { return false; } bool cmyk = false; if(in) { float *readpixels = pixels; vector<float> tmppixels; if(components > 4) { tmppixels.resize(((size_t)width)*height*components); readpixels = &tmppixels[0]; } if(depth <= 1) { size_t scanlinesize = ((size_t)width)*components*sizeof(float); in->read_image(TypeDesc::FLOAT, (uchar*)readpixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); } else { in->read_image(TypeDesc::FLOAT, (uchar*)readpixels); } if(components > 4) { size_t dimensions = ((size_t)width)*height; for(size_t i = dimensions-1, pixel = 0; pixel < dimensions; pixel++, i--) { pixels[i*4+3] = tmppixels[i*components+3]; pixels[i*4+2] = tmppixels[i*components+2]; pixels[i*4+1] = tmppixels[i*components+1]; pixels[i*4+0] = tmppixels[i*components+0]; } tmppixels.clear(); } cmyk = strcmp(in->format_name(), "jpeg") == 0 && components == 4; in->close(); delete in; } else { builtin_image_float_pixels_cb(img->filename, img->builtin_data, pixels); } /* Check if we actually have a float4 slot, in case components == 1, but device * doesn't support single channel textures. 
*/ if(type == IMAGE_DATA_TYPE_FLOAT4) { size_t num_pixels = ((size_t)width) * height * depth; if(cmyk) { /* CMYK */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 255; pixels[i*4+2] = (pixels[i*4+2]*pixels[i*4+3])/255; pixels[i*4+1] = (pixels[i*4+1]*pixels[i*4+3])/255; pixels[i*4+0] = (pixels[i*4+0]*pixels[i*4+3])/255; } } else if(components == 2) { /* grayscale + alpha */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = pixels[i*2+1]; pixels[i*4+2] = pixels[i*2+0]; pixels[i*4+1] = pixels[i*2+0]; pixels[i*4+0] = pixels[i*2+0]; } } else if(components == 3) { /* RGB */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i*3+2]; pixels[i*4+1] = pixels[i*3+1]; pixels[i*4+0] = pixels[i*3+0]; } } else if(components == 1) { /* grayscale */ for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; pixels[i*4+2] = pixels[i]; pixels[i*4+1] = pixels[i]; pixels[i*4+0] = pixels[i]; } } if(img->use_alpha == false) { for(size_t i = num_pixels-1, pixel = 0; pixel < num_pixels; pixel++, i--) { pixels[i*4+3] = 1.0f; } } } return true; }
void push_arg(device_vector<T> arg)
{
    // Bind the raw handle of the vector to the kernel argument slot `argpos`;
    // the post-increment moves on to the next slot for the following call.
    K.set_arg(
            argpos++,
            arg.raw()
            );
}
void push_arg(const device_vector<T> &arg)
{
    // A device vector is passed on as its raw handle; the push_arg overload
    // taking the type returned by raw() does the actual work.
    push_arg(
            arg.raw()
            );
}
/* Tear down the device: the task pool is stopped first, then the texture
 * info table is freed. */
~CPUDevice()
{
	task_pool.stop();
	texture_info.free();
}
bool ImageManager::file_load_image(Image *img, device_vector<uchar4>& tex_img) { if(img->filename == "") return false; fprintf(stderr, "image input disabled\n"); #if 0 /* load image from file through OIIO */ ImageInput *in = ImageInput::create(img->filename); if(!in) return false; ImageSpec spec; if(!in->open(img->filename, spec)) { delete in; return false; } /* we only handle certain number of components */ int width = spec.width; int height = spec.height; int components = spec.nchannels; if(!(components == 1 || components == 3 || components == 4)) { in->close(); delete in; return false; } /* read RGBA pixels */ uchar *pixels = (uchar*)tex_img.resize(width, height); int scanlinesize = width*components*sizeof(uchar); in->read_image(TypeDesc::UINT8, (uchar*)pixels + (height-1)*scanlinesize, AutoStride, -scanlinesize, AutoStride); in->close(); delete in; if(components == 3) { for(int i = width*height-1; i >= 0; i--) { pixels[i*4+3] = 255; pixels[i*4+2] = pixels[i*3+2]; pixels[i*4+1] = pixels[i*3+1]; pixels[i*4+0] = pixels[i*3+0]; } } else if(components == 1) { for(int i = width*height-1; i >= 0; i--) { pixels[i*4+3] = 255; pixels[i*4+2] = pixels[i]; pixels[i*4+1] = pixels[i]; pixels[i*4+0] = pixels[i]; } } return true; #endif return false; }