/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelNUM; cl_event kernel_event, finish_event; cl_mem objPARTICULAS, objPESOS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "numparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 
CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelNUM = clCreateKernel(program, "calc_num_particulas", &ret); // Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); int *pesos = (int*) _aligned_malloc(N * sizeof(int), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(particle), NULL, &ret); objPESOS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(int), NULL, &ret); float sum = 0.0f; const size_t global = 2; const size_t local_work_size = 1; // Inicializamos las partículas (Me interesan los pesos) srand(time(NULL)); for(unsigned index = 0; index < N; ++index) { particulas[index].x = 0.0; particulas[index].y = 0.0; particulas[index].s = 0.0; particulas[index].xp = 0.0; particulas[index].yp = 0.0; particulas[index].sp = 0.0; particulas[index].x0 = 0.0; particulas[index].y0 = 0.0; particulas[index].width = 0; particulas[index].height = 0; particulas[index].w = (float) (rand() % 2000); sum+=particulas[index].w; } // Normalizamos los datos for(int i = 0; i < N; ++i) particulas[i].w /= sum; // Transferimos las partículas al dispositivo y los pesos cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelNUM, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelNUM, 1, sizeof(int), &N); ret = clSetKernelArg(kernelNUM, 2, sizeof(cl_mem), &objPESOS); // Ejecutamos el kernel. 
Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelNUM, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPESOS, CL_FALSE, 0, N * sizeof(int), pesos, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseMemObject(objPESOS); clReleaseKernel(kernelNUM); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
/// Default aligned-allocation hook: forwards the request to the CRT's _aligned_malloc.
/// @param size       Number of bytes requested.
/// @param alignment  Alignment, passed through unchanged to _aligned_malloc.
/// @return The pointer returned by _aligned_malloc for this request.
void* DefaultAllocateAligned(size_t size, size_t alignment)
{
    void* const block = _aligned_malloc(size, alignment);
    return block;
}
int _tmain(int argc, _TCHAR* argv[]) { const size_t max_num = 100000000; const int buffer_element_count = 10000; const int max_float_digits = 8; // Init critical section; InitializeCriticalSection(&g_write_queue_cs); g_write_queue_has_more_data_event = CreateEvent(NULL, FALSE, FALSE, NULL); g_write_queue_accepts_more_data_event = CreateEvent(NULL, FALSE, TRUE, NULL); HANDLE hFile = ::CreateFile(L"output.txt", GENERIC_WRITE, 0, 0, CREATE_ALWAYS, 0, NULL); if (hFile == INVALID_HANDLE_VALUE) { printf("Oppps"); exit(-1); } // Launch a writer thread. HANDLE hThread = CreateThread(NULL, 0, &WriteThreadProc, hFile,0, 0); sfmt_t sfmt; sfmt_init_gen_rand(&sfmt, 1234); uint32_t* randoms = (uint32_t*) _aligned_malloc(sizeof(uint32_t)*buffer_element_count, 32); g_begin_ticks = GetTickCount64(); // std::ofstream output(); int finish = max_num / buffer_element_count; for (size_t i=0; i < finish; ++i) { // Prepare a block of numbers for writing. WriteBuffer* write_buffer = new WriteBuffer(buffer_element_count, max_float_digits); char* write_ptr = write_buffer->ptr_; sfmt_fill_array32(&sfmt, randoms, buffer_element_count); for (int k = 0; k < buffer_element_count; ++k) { // Format each float to string and append to buffer. // float random = float(rand()) / RAND_MAX; float random = float(randoms[k]) / 4294967296.0f; write_ptr += modp_dtoa(random, write_ptr, max_float_digits); *(write_ptr++) = '\r'; *(write_ptr++) = '\n'; } // Compute how many bytes to write. write_buffer->useful_data_size_ = write_ptr - write_buffer->ptr_; // Enqueue for writing. while (write_buffer) { EnterCriticalSection(&g_write_queue_cs); if (g_write_queue.size() < kMaxQueue) { // ops. 
g_write_queue.push(write_buffer); SetEvent(g_write_queue_has_more_data_event); write_buffer = NULL; } LeaveCriticalSection(&g_write_queue_cs); if (write_buffer) { // slow down writing, queue is full printf("S"); WaitForSingleObject(g_write_queue_accepts_more_data_event, 200); } } } g_end_ticks = GetTickCount64(); // Let the writing thread know we are done. EnterCriticalSection(&g_write_queue_cs); g_done = true; LeaveCriticalSection(&g_write_queue_cs); SetEvent(g_write_queue_has_more_data_event); // Wait for writing thread to finish. WaitForSingleObject(hThread, INFINITE); _aligned_free(randoms); __int64 delta = g_end_ticks - g_begin_ticks; printf("Speed %f Mb per sec\n", (g_total_bytes_written * 1000.0) / (1024.0 * 1024 * delta)); char c; scanf("%c", &c); ::CloseHandle(hFile); return 0; }
/**
 * Allocates an aligned block through _aligned_malloc.
 *
 * @param amount    Number of bytes requested.
 * @param alignment Requested alignment; asserted to equal FIBITMAP_ALIGNMENT.
 * @return The pointer returned by _aligned_malloc.
 */
void* FreeImage_Aligned_Malloc(size_t amount, size_t alignment) {
	assert(alignment == FIBITMAP_ALIGNMENT);

	void* const memory = _aligned_malloc(amount, alignment);
	return memory;
}
/**
 * Creates the client-side surface for a CreateSurface PDU: allocates the GDI
 * pixel buffer (plus a staging buffer when the surface format differs from the
 * GDI destination format), wraps the buffer that X11 will read in an XImage,
 * and registers the surface with the graphics channel context.
 *
 * @param context        graphics channel client context; context->custom holds the rdpGdi
 * @param createSurface  PDU carrying surface id, width, height and pixel format
 *
 * @return 0 on success, otherwise a Win32 error code
 */
static UINT xf_CreateSurface(RdpgfxClientContext* context,
                             const RDPGFX_CREATE_SURFACE_PDU* createSurface)
{
	UINT ret = CHANNEL_RC_NO_MEMORY;
	size_t size;
	xfGfxSurface* surface;
	rdpGdi* gdi = (rdpGdi*)context->custom;
	xfContext* xfc = (xfContext*) gdi->context;
	surface = (xfGfxSurface*) calloc(1, sizeof(xfGfxSurface));

	if (!surface)
		return CHANNEL_RC_NO_MEMORY;

	surface->gdi.codecs = gdi->context->codecs;

	if (!surface->gdi.codecs)
	{
		WLog_ERR(TAG, "%s: global GDI codecs aren't set", __FUNCTION__);
		goto out_free;
	}

	surface->gdi.surfaceId = createSurface->surfaceId;
	surface->gdi.width = (UINT32) createSurface->width;
	surface->gdi.height = (UINT32) createSurface->height;

	switch (createSurface->pixelFormat)
	{
		case GFX_PIXEL_FORMAT_ARGB_8888:
			surface->gdi.format = PIXEL_FORMAT_BGRA32;
			break;

		case GFX_PIXEL_FORMAT_XRGB_8888:
			surface->gdi.format = PIXEL_FORMAT_BGRX32;
			break;

		default:
			WLog_ERR(TAG, "%s: unknown pixelFormat 0x%"PRIx32"", __FUNCTION__,
			         createSurface->pixelFormat);
			ret = ERROR_INTERNAL_ERROR;
			goto out_free;
	}

	surface->gdi.scanline = surface->gdi.width * GetBytesPerPixel(surface->gdi.format);
	surface->gdi.scanline = x11_pad_scanline(surface->gdi.scanline, xfc->scanline_pad);
	/* Widen before multiplying: scanline * height can exceed 32 bits, and a
	 * wrapped size would yield a buffer smaller than the image describes. */
	size = (size_t) surface->gdi.scanline * surface->gdi.height;
	surface->gdi.data = (BYTE*)_aligned_malloc(size, 16);

	if (!surface->gdi.data)
	{
		WLog_ERR(TAG, "%s: unable to allocate GDI data", __FUNCTION__);
		goto out_free;
	}

	ZeroMemory(surface->gdi.data, size);

	if (AreColorFormatsEqualNoAlpha(gdi->dstFormat, surface->gdi.format))
	{
		/* Formats match: the XImage reads the GDI buffer directly. */
		surface->image = XCreateImage(xfc->display, xfc->visual, xfc->depth,
		                              ZPixmap, 0, (char*) surface->gdi.data,
		                              surface->gdi.width, surface->gdi.height,
		                              xfc->scanline_pad, surface->gdi.scanline);
	}
	else
	{
		/* Formats differ: the XImage reads a separate staging buffer laid out
		 * in the GDI destination format. */
		UINT32 width = surface->gdi.width;
		UINT32 bytes = GetBytesPerPixel(gdi->dstFormat);
		surface->stageScanline = width * bytes;
		surface->stageScanline = x11_pad_scanline(surface->stageScanline, xfc->scanline_pad);
		size = (size_t) surface->stageScanline * surface->gdi.height;
		surface->stage = (BYTE*) _aligned_malloc(size, 16);

		if (!surface->stage)
		{
			WLog_ERR(TAG, "%s: unable to allocate stage buffer", __FUNCTION__);
			goto out_free_gdidata;
		}

		ZeroMemory(surface->stage, size);
		surface->image = XCreateImage(xfc->display, xfc->visual, xfc->depth,
		                              ZPixmap, 0, (char*) surface->stage,
		                              surface->gdi.width, surface->gdi.height,
		                              xfc->scanline_pad, surface->stageScanline);
	}

	if (!surface->image)
	{
		WLog_ERR(TAG, "%s: an error occurred when creating the XImage", __FUNCTION__);
		goto error_surface_image;
	}

	surface->image->byte_order = LSBFirst;
	surface->image->bitmap_bit_order = LSBFirst;
	surface->gdi.outputMapped = FALSE;
	region16_init(&surface->gdi.invalidRegion);

	/* Keep the callback's own status so a failure is reported with its real
	 * code rather than as an out-of-memory condition. */
	ret = context->SetSurfaceData(context, surface->gdi.surfaceId, (void*) surface);

	if (ret != CHANNEL_RC_OK)
	{
		WLog_ERR(TAG, "%s: an error occurred during SetSurfaceData", __FUNCTION__);
		goto error_set_surface_data;
	}

	return CHANNEL_RC_OK;
error_set_surface_data:
	/* Detach the pixel buffer first: it is released with _aligned_free below,
	 * so XDestroyImage must not free it as well. */
	surface->image->data = NULL;
	XDestroyImage(surface->image);
error_surface_image:
	_aligned_free(surface->stage);
out_free_gdidata:
	_aligned_free(surface->gdi.data);
out_free:
	free(surface);
	return ret;
}
/// Allocates dwSize bytes through _aligned_malloc with a fixed 16-byte alignment.
/// @param dwSize Number of bytes requested.
/// @return The pointer returned by _aligned_malloc.
void* __restrict DefaultAlloc::_Allocate(size_t dwSize)
{
    void* const pBlock = _aligned_malloc(dwSize, 16);
    return pBlock;
}
/**
 * Decodes every slice queued on the given worker thread.
 *
 * Compression 7 slices are handed to LJpegPlain. Compression 0x884c slices are
 * each a complete JPEG image: they are decompressed with libjpeg into a
 * temporary buffer and then copied, widened to 16 bits per sample, into the
 * raw image at the slice offset. Decoder and I/O exceptions are recorded on
 * the raw image via setError() instead of being propagated.
 *
 * @param t worker thread descriptor whose `slices` queue is drained
 */
void DngDecoderSlices::decodeSlice(DngDecoderThread* t) {
  if (compression == 7) {
    while (!t->slices.empty()) {
      LJpegPlain l(mFile, mRaw);
      l.mDNGCompatible = mFixLjpeg;
      DngSliceElement e = t->slices.front();
      l.mUseBigtable = e.mUseBigtable;
      t->slices.pop();
      try {
        l.startDecoder(e.byteOffset, e.byteCount, e.offX, e.offY);
      } catch (RawDecoderException &err) {
        mRaw->setError(err.what());
      } catch (IOException &err) {
        mRaw->setError(err.what());
      }
    }
    /* Lossy DNG */
  } else if (compression == 0x884c) {
    /* Each slice is a JPEG image */
    struct jpeg_decompress_struct dinfo;
    struct jpeg_error_mgr jerr;
    while (!t->slices.empty()) {
      DngSliceElement e = t->slices.front();
      t->slices.pop();
      uchar8 *complete_buffer = NULL;
      // jpeg_read_scanlines wants an array of row pointers; one row at a time
      // is decoded, so a single-element stack array is enough.
      JSAMPROW buffer[1];
      // Only destroy the decompressor if it was actually created.
      bool created = false;

      try {
        uint32 size = mFile->getSize();
        // The error manager must be installed before the decompress object is
        // created, because creation itself may report errors through it.
        dinfo.err = jpeg_std_error(&jerr);
        jerr.error_exit = my_error_throw;
        jpeg_create_decompress(&dinfo);
        created = true;

        CHECKSIZE(e.byteOffset);
        CHECKSIZE(e.byteOffset+e.byteCount);
        JPEG_MEMSRC(&dinfo, (unsigned char*)mFile->getData(e.byteOffset, e.byteCount), e.byteCount);

        if (JPEG_HEADER_OK != jpeg_read_header(&dinfo, TRUE))
          ThrowRDE("DngDecoderSlices: Unable to read JPEG header");

        jpeg_start_decompress(&dinfo);
        if (dinfo.output_components != (int)mRaw->getCpp())
          ThrowRDE("DngDecoderSlices: Component count doesn't match");

        // Do the size arithmetic in size_t so large slices cannot wrap a 32-bit int.
        const size_t row_stride = (size_t)dinfo.output_width * dinfo.output_components;
        const size_t pic_size = (size_t)dinfo.output_height * row_stride;
        complete_buffer = (uchar8*)_aligned_malloc(pic_size, 16);
        if (!complete_buffer)
          ThrowRDE("DngDecoderSlices: Unable to allocate JPEG output buffer");

        while (dinfo.output_scanline < dinfo.output_height) {
          buffer[0] = (JSAMPROW)(&complete_buffer[dinfo.output_scanline*row_stride]);
          if (0 == jpeg_read_scanlines(&dinfo, buffer, 1))
            ThrowRDE("DngDecoderSlices: JPEG Error while decompressing image.");
        }
        jpeg_finish_decompress(&dinfo);

        // Now the image is decoded, and we copy the image data
        int copy_w = min(mRaw->dim.x-e.offX, dinfo.output_width);
        int copy_h = min(mRaw->dim.y-e.offY, dinfo.output_height);
        for (int y = 0; y < copy_h; y++) {
          uchar8* src = &complete_buffer[row_stride*y];
          ushort16* dst = (ushort16*)mRaw->getData(e.offX, y+e.offY);
          for (int x = 0; x < copy_w; x++) {
            for (int c=0; c < dinfo.output_components; c++)
              *dst++ = (*src++);
          }
        }
      } catch (RawDecoderException &err) {
        mRaw->setError(err.what());
      } catch (IOException &err) {
        mRaw->setError(err.what());
      }
      if (complete_buffer)
        _aligned_free(complete_buffer);
      if (created)
        jpeg_destroy_decompress(&dinfo);
    }
  }
  else
    mRaw->setError("DngDecoderSlices: Unknown compression");
}
/**
 * Stand-in for the debug aligned allocator: the request is forwarded to
 * _aligned_malloc and the source-location arguments are not used.
 *
 * @param size    Number of bytes requested.
 * @param align   Alignment, passed through to _aligned_malloc.
 * @param f_name  Caller's file name (ignored).
 * @param line_n  Caller's line number (ignored).
 * @return The pointer returned by _aligned_malloc.
 */
void * __cdecl _aligned_malloc_dbg(size_t size, size_t align, const char * f_name, int line_n)
{
    (void)f_name;
    (void)line_n;

    void * const block = _aligned_malloc(size, align);
    return block;
}
// Builds a texture wrapper of the given kind.
//
// type     - one of the GSTexture kinds (Offscreen, Texture, RenderTarget,
//            DepthStencil, Backbuffer)
// w, h     - requested size; each dimension is clamped to at least 1
// format   - OpenGL internal format (0 for the backbuffer)
// fbo_read - value stored in m_fbo_read
//
// The constructor derives the transfer format/type (m_int_format, m_int_type)
// and m_int_shift from the internal format, then creates the GL texture
// storage for every kind except the backbuffer. Offscreen textures also get a
// host-side buffer of 4 bytes per pixel.
GSTextureOGL::GSTextureOGL(int type, int w, int h, int format, GLuint fbo_read)
	: m_pbo_size(0), m_clean(false), m_local_buffer(NULL), m_r_x(0), m_r_y(0), m_r_w(0), m_r_h(0)
{
	// OpenGL didn't like dimensions of size 0
	m_size.x = max(1,w);
	m_size.y = max(1,h);
	m_format = format;
	m_type   = type;
	m_fbo_read = fbo_read;
	m_texture_id = 0;

	// Bunch of constant parameter
	switch (m_format) {
		// 1 Channel integer
		case GL_R32UI:
		case GL_R32I:
			m_int_format    = GL_RED_INTEGER;
			m_int_type      = (m_format == GL_R32UI) ? GL_UNSIGNED_INT : GL_INT;
			m_int_shift     = 2;
			break;
		case GL_R16UI:
			m_int_format    = GL_RED_INTEGER;
			m_int_type      = GL_UNSIGNED_SHORT;
			m_int_shift     = 1;
			break;

		// 1 Channel normalized
		case GL_R8:
			m_int_format    = GL_RED;
			m_int_type      = GL_UNSIGNED_BYTE;
			m_int_shift     = 0;
			break;

		// 4 channel normalized
		case GL_RGBA16:
			m_int_format    = GL_RGBA;
			m_int_type      = GL_UNSIGNED_SHORT;
			m_int_shift     = 3;
			break;
		case GL_RGBA8:
			m_int_format    = GL_RGBA;
			m_int_type      = GL_UNSIGNED_BYTE;
			m_int_shift     = 2;
			break;

		// 4 channel integer
		case GL_RGBA16I:
		case GL_RGBA16UI:
			m_int_format    = GL_RGBA_INTEGER;
			// Select on the unsigned 4-channel format handled by this case.
			// Testing GL_R16UI here could never match, which made the
			// unsigned format transfer as signed shorts.
			m_int_type      = (m_format == GL_RGBA16UI) ? GL_UNSIGNED_SHORT : GL_SHORT;
			m_int_shift     = 3;
			break;

		// 4 channel float
		case GL_RGBA32F:
			m_int_format    = GL_RGBA;
			m_int_type      = GL_FLOAT;
			m_int_shift     = 4;
			break;
		case GL_RGBA16F:
			m_int_format    = GL_RGBA;
			m_int_type      = GL_HALF_FLOAT;
			m_int_shift     = 3;
			break;

		// Depth buffer
		case GL_DEPTH32F_STENCIL8:
			m_int_format    = GL_DEPTH_STENCIL;
			m_int_type      = GL_FLOAT_32_UNSIGNED_INT_24_8_REV;
			m_int_shift     = 0;
			break;

		// Backbuffer
		case 0:
			m_int_format    = 0;
			m_int_type      = 0;
			m_int_shift     = 0;
			break;

		default:
			m_int_format    = 0;
			m_int_type      = 0;
			m_int_shift     = 0;
			ASSERT(0);
	}

	// Generate & Allocate the buffer
	switch (m_type) {
		case GSTexture::Offscreen:
			// Offscreen is only used to read color. So it only requires 4B by pixel
			m_local_buffer = (uint8*)_aligned_malloc(m_size.x * m_size.y * 4, 32);
			// fall through: there is no break here in the original, so an
			// offscreen texture also gets GL storage created below.
		case GSTexture::Texture:
		case GSTexture::RenderTarget:
		case GSTexture::DepthStencil:
			glCreateTextures(GL_TEXTURE_2D, 1, &m_texture_id);
			glTextureStorage2D(m_texture_id, 1+GL_TEX_LEVEL_0, m_format, m_size.x, m_size.y);
			if (m_format == GL_R8) {
				// Emulate DX behavior, beside it avoid special code in shader to differentiate
				// palette texture from a GL_RGBA target or a GL_R texture.
				glTextureParameteri(m_texture_id, GL_TEXTURE_SWIZZLE_A, GL_RED);
			}
			break;
		case GSTexture::Backbuffer:
		default:
			break;
	}
}
void GSClut::init() { g_pbyGSClut = (u8*)_aligned_malloc(256 * 8, 1024); // need 512 alignment! memset(g_pbyGSClut, 0, 256*8); }
// Sets up a conversion of the source clip to Y8.
//
// Planar and YUY2 inputs only record which path to take. For RGB input a
// 4-entry fixed-point weight table (B, G, R, alpha=0; scaled by 32768) is
// built for the requested matrix and passed to the RGB32/RGB24 generator.
//
// src       - source clip
// in_matrix - one of Rec601, PC_601, Rec709, PC_709, AVERAGE (RGB input only)
// env       - script environment, used to raise errors
//
// Raises a script error for an unknown matrix, an unknown input format, or
// when the weight table cannot be allocated.
ConvertToY8::ConvertToY8(PClip src, int in_matrix, IScriptEnvironment* env) : GenericVideoFilter(src), matrix(0) {
  yuy2_input = blit_luma_only = rgb_input = false;

  if (vi.IsPlanar()) {
    blit_luma_only = true;
    vi.pixel_type = VideoInfo::CS_Y8;
    return;
  }

  if (vi.IsYUY2()) {
    yuy2_input = true;
    vi.pixel_type = VideoInfo::CS_Y8;
    return;
  }

  if (vi.IsRGB()) {
    rgb_input = true;
    // Must be read before pixel_type is overwritten below.
    pixel_step = vi.BytesFromPixels(1);
    vi.pixel_type = VideoInfo::CS_Y8;

    matrix = (signed short*)_aligned_malloc(sizeof(short)*4, 16);
    if (!matrix) {
      // The table is written immediately below, so fail here instead of
      // dereferencing a null pointer.
      env->ThrowError("ConvertToY8: failed to allocate matrix.");
    }
    signed short* m = matrix;

    if (in_matrix == Rec601) {
      *m++ = (signed short)((219.0/255.0)*0.114*32768.0+0.5);  //B
      *m++ = (signed short)((219.0/255.0)*0.587*32768.0+0.5);  //G
      *m++ = (signed short)((219.0/255.0)*0.299*32768.0+0.5);  //R
      offset_y = 16;
    } else if (in_matrix == PC_601) {
      *m++ = (signed short)(0.114*32768.0+0.5);  //B
      *m++ = (signed short)(0.587*32768.0+0.5);  //G
      *m++ = (signed short)(0.299*32768.0+0.5);  //R
      offset_y = 0;
    } else if (in_matrix == Rec709) {
      *m++ = (signed short)((219.0/255.0)*0.0722*32768.0+0.5);  //B
      *m++ = (signed short)((219.0/255.0)*0.7152*32768.0+0.5);  //G
      *m++ = (signed short)((219.0/255.0)*0.2126*32768.0+0.5);  //R
      offset_y = 16;
    } else if (in_matrix == PC_709) {
      *m++ = (signed short)(0.0722*32768.0+0.5);  //B
      *m++ = (signed short)(0.7152*32768.0+0.5);  //G
      *m++ = (signed short)(0.2126*32768.0+0.5);  //R
      offset_y = 0;
    } else if (in_matrix == AVERAGE) {
      *m++ = (signed short)(32768.0/3 + 0.5);  //B
      *m++ = (signed short)(32768.0/3 + 0.5);  //G
      *m++ = (signed short)(32768.0/3 + 0.5);  //R
      offset_y = 0;
    } else {
      // Release the table before raising the error so it is not leaked.
      _aligned_free(matrix);
      matrix = 0;
      env->ThrowError("ConvertToY8: Unknown matrix.");
    }
    *m = 0;  // Alpha

    if (pixel_step == 4)
      genRGB32toY8(vi.width, vi.height, offset_y, matrix, env);
    else if (pixel_step == 3)
      genRGB24toY8(vi.width, vi.height, offset_y, matrix, env);

    return;
  }

  env->ThrowError("ConvertToY8: Unknown input format");
}
/*
 * posix_memalign look-alike built on _aligned_malloc.
 *
 * p     - receives the allocated pointer on success; left untouched on failure
 * align - requested alignment (second argument of _aligned_malloc)
 * size  - number of bytes requested
 *
 * Returns 0 on success, otherwise the current value of errno.
 */
static int posix_memalign(void **p, size_t align, size_t size)
{
    void *const block = _aligned_malloc(size, align);

    if (block != NULL) {
        *p = block;
        return 0;
    }

    return errno;
}
/// Portable-name wrapper that forwards to the CRT's _aligned_malloc.
/// @param size  Number of bytes requested.
/// @param align Alignment, passed through to _aligned_malloc.
/// @return The pointer returned by _aligned_malloc.
void* aligned_malloc(size_t size, size_t align)
{
    void* const block = _aligned_malloc(size, align);
    return block;
}
// Configures the filter: validates the colour format, converts the luma and
// chroma thresholds (thy, thc) to integers with a scaling that depends on the
// operator type, selects the map-calculation and plane-linking functions, and
// allocates the aligned working buffer.
//
// Raises a script error for non-planar input or when the buffer allocation fails.
TEMmod::TEMmod(PClip c, double thy, double thc, int tp, int chroma, int lnk,
               bool inv, float sc, IScriptEnvironment* env)
    : GenericVideoFilter(c), link(lnk), invert(inv), type(tp), scale(sc)
{
    if (!vi.IsPlanar()) {
        env->ThrowError("TEMmod: Planar format only.");
    }

    // Y8 has no chroma planes, so there is nothing to link or to process there.
    if (vi.IsY8()) {
        link = 0;
        chroma = 0;
    }

    process[0] = 1;
    process[2] = chroma;
    process[1] = process[2];

    // Convert each threshold to an integer; the scaling depends on the type.
    const double inputs[2] = { thy, thc };
    for (int plane = 0; plane < 2; ++plane) {
        const double t = inputs[plane];
        double scaled;
        switch (type) {
        case 1:
            scaled = t * t * 4 + 0.5;
            break;
        case 2:
            scaled = t * t * 10000 + 0.5;
            break;
        case 3:
            scaled = t * 2 + 0.5;
            break;
        case 4:
            scaled = t * 100 / 3.0 + 0.5;
            break;
        default:
            scaled = t * 4 + 0.5;
            break;
        }
        threshold[plane] = static_cast<int>(scaled);
    }
    threshold[2] = threshold[1];

    // Linking is disabled when either threshold collapsed to zero.
    if (threshold[0] == 0 || threshold[1] == 0) {
        link = 0;
    }

    // Types 1 and 2 each have two variants, chosen by whether the luma
    // threshold is positive; the remaining types index the table directly.
    const int positive_luma = threshold[0] > 0 ? 1 : 0;
    switch (type) {
    case 1:
        calc_map = calc_maps[positive_luma];
        break;
    case 2:
        calc_map = calc_maps[2 + positive_luma];
        break;
    default:
        calc_map = calc_maps[type + 1];
        break;
    }

    // Pick the linking function table, then the entry for the chroma layout.
    const link_planes_func* links = link == 1 ? link_y_to_uv : link_all;
    int layout = 3;
    if (vi.IsYV24()) {
        layout = 0;
    } else if (vi.IsYV16()) {
        layout = 1;
    } else if (vi.IsYV12()) {
        layout = 2;
    }
    link_planes = links[layout];

    buff_pitch = ((vi.width + 47) / 16) * 16;
    buff = (uint8_t*)_aligned_malloc(buff_pitch * (type * 2 + 1), 16);
    if (!buff) {
        env->ThrowError("TEMmod: failed to allocate buffer.");
    }
}
/*---------------------------------------------------------------------------
// 16-byte aligned calloc
//
// Allocates nitems*size bytes with _aligned_malloc (16-byte alignment) and
// clears them to zero. With SSE available the bulk of the block is cleared
// with non-temporal stores, in progressively smaller steps down to single
// bytes; otherwise memset is used.
//
// Returns NULL when the allocation fails or when nitems*size does not fit in
// a size_t.
//-------------------------------------------------------------------------*/
void* xmm_calloc(size_t nitems, size_t size)
{
	unsigned char* t_RetPtr;
	size_t t_Total;

	/* Like calloc, refuse a request whose byte count overflows instead of
	   returning a block that is smaller than the caller asked for. */
	if(size != 0 && nitems > ((size_t)-1) / size)
		return NULL;

	t_Total = nitems*size;
	t_RetPtr = (unsigned char*)_aligned_malloc(t_Total, 16);

	if(t_RetPtr)
	{
#ifdef __SSE__
		size_t i,j, k;
		__m128 XMM0, XMM1, XMM2, XMM3;
		XMM0 = XMM1 = XMM2 = XMM3 = _mm_setzero_ps();
		k = t_Total;

		/* 128 bytes per iteration */
		j = k&(~127);
		for(i=0;i<j;i+=128)
		{
			_mm_stream_ps((float*)(t_RetPtr+i    ), XMM0);
			_mm_stream_ps((float*)(t_RetPtr+i+ 16), XMM1);
			_mm_stream_ps((float*)(t_RetPtr+i+ 32), XMM2);
			_mm_stream_ps((float*)(t_RetPtr+i+ 48), XMM3);
			_mm_stream_ps((float*)(t_RetPtr+i+ 64), XMM0);
			_mm_stream_ps((float*)(t_RetPtr+i+ 80), XMM1);
			_mm_stream_ps((float*)(t_RetPtr+i+ 96), XMM2);
			_mm_stream_ps((float*)(t_RetPtr+i+112), XMM3);
		}

		/* remaining 64-, 32- and 16-byte pieces */
		j = k&(~63);
		for(;i<j;i+=64)
		{
			_mm_stream_ps((float*)(t_RetPtr+i    ), XMM0);
			_mm_stream_ps((float*)(t_RetPtr+i+ 16), XMM1);
			_mm_stream_ps((float*)(t_RetPtr+i+ 32), XMM2);
			_mm_stream_ps((float*)(t_RetPtr+i+ 48), XMM3);
		}
		j = k&(~31);
		for(;i<j;i+=32)
		{
			_mm_stream_ps((float*)(t_RetPtr+i    ), XMM0);
			_mm_stream_ps((float*)(t_RetPtr+i+ 16), XMM1);
		}
		j = k&(~15);
		for(;i<j;i+=16)
		{
			_mm_stream_ps((float*)(t_RetPtr+i    ), XMM0);
		}

		/* tail: 8 bytes, 4 bytes, then single bytes */
		j = k&(~7);
		for(;i<j;i+=8)
		{
			_mm_storel_pi((__m64*)(t_RetPtr+i    ), XMM0);
		}
		j = k&(~3);
		for(;i<j;i+=4)
		{
			_mm_store_ss((float*)(t_RetPtr+i)    , XMM0);
		}
		for(;i<k;i++)
			*(t_RetPtr+i    ) = 0;

		/* make the streaming stores globally visible before returning */
		_mm_sfence();
#else
		memset(t_RetPtr, 0, t_Total);
#endif
	}
	return (void*)t_RetPtr;
}
/// Array allocation for types derived from align_base_64: storage is obtained
/// from _aligned_malloc with a 64-byte alignment.
/// @param bytes Number of bytes requested by the new[] expression.
/// @return The pointer returned by _aligned_malloc.
void* align_base_64::operator new[](size_t bytes)
{
    // NOTE(review): the result is returned unchecked, so a failed allocation
    // yields a null pointer rather than an exception - confirm callers cope.
    void* const storage = _aligned_malloc(bytes, 64);
    return storage;
}
// Initializes the Direct3D 9 display for windowed visualization.
//
// hWnd   - target window; a null handle is rejected
// width  - back buffer width
// height - back buffer height
// blur   - stored in m_Blur
//
// Allocates and clears the level/waveform buffers (16-byte aligned when SSE is
// available), creates the Direct3D object and device, and calls Restore().
//
// Returns S_OK on success, E_OUTOFMEMORY if an audio buffer cannot be
// allocated, and E_FAIL for every other failure.
HRESULT CGraphics::InitializeDisplay(HWND hWnd,UINT width,UINT height,BOOL blur)
{
	// Check if windowed visualization (skin mode not supported)
	if(!hWnd)
		return E_FAIL;

	// Safe to assume that if device is not null display is initialized
	if(m_Device)
		UninitializeDisplay();

	// Get the address of the create function
	if(!m_Direct3DCreate9)
	{
		TRACE(TEXT("Error: Failed to find \"Direct3DCreate9\" in \"%s\".\n"),D3DDLL);
		return E_FAIL;
	}

	// Reset audio data
	const size_t levelsBytes = sizeof(FLOAT) * VISUALIZATION_BARCOUNT;
	const size_t waveformBytes = sizeof(FLOAT) * SA_BUFFER_SIZE;

	if(IsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE))
	{
		TRACE(TEXT("Info: Using SSE instruction set.\n"));

		m_Levels = (PFLOAT)_aligned_malloc(levelsBytes,16);
		m_LevelsBuffer = (PFLOAT)_aligned_malloc(levelsBytes,16);
		m_Waveform = (PFLOAT)_aligned_malloc(waveformBytes,16);
		m_WaveformBuffer = (PFLOAT)_aligned_malloc(waveformBytes,16);
	}
	else
	{
		m_Levels = (PFLOAT)malloc(levelsBytes);
		m_LevelsBuffer = (PFLOAT)malloc(levelsBytes);
		m_Waveform = (PFLOAT)malloc(waveformBytes);
		m_WaveformBuffer = (PFLOAT)malloc(waveformBytes);
	}

	// The buffers are cleared right below, so stop here if any allocation failed.
	// NOTE(review): buffers that did get allocated stay owned by the members,
	// as on the other failure returns; presumably UninitializeDisplay releases
	// them - confirm.
	if(!m_Levels || !m_LevelsBuffer || !m_Waveform || !m_WaveformBuffer)
	{
		TRACE(TEXT("Error: Failed to allocate audio buffers.\n"));
		return E_OUTOFMEMORY;
	}

	ZeroMemory(m_Levels,levelsBytes);
	ZeroMemory(m_LevelsBuffer,levelsBytes);
	ZeroMemory(m_Waveform,waveformBytes);
	ZeroMemory(m_WaveformBuffer,waveformBytes);
	ZeroMemory(m_Peaks,sizeof(m_Peaks));

	m_Hwnd = hWnd;
	m_Blur = blur;

	m_Direct3D = m_Direct3DCreate9(D3D_SDK_VERSION);
	if(!m_Direct3D)
	{
		TRACE(TEXT("Error: Failed to create direct 3d.\n"));
		return E_FAIL;
	}

	m_Direct3D->GetDeviceCaps(D3DADAPTER_DEFAULT,D3DDEVTYPE_HAL,&m_Caps);
	m_Direct3D->GetAdapterIdentifier(D3DADAPTER_DEFAULT,NULL,&m_AdapterIdentifier);

	ZeroMemory(&m_PresentParameters,sizeof(m_PresentParameters));
	m_PresentParameters.Windowed = TRUE;
	m_PresentParameters.SwapEffect = D3DSWAPEFFECT_DISCARD;
	m_PresentParameters.BackBufferFormat = D3DFMT_X8R8G8B8;
	//m_PresentParameters.EnableAutoDepthStencil = TRUE;
	m_PresentParameters.AutoDepthStencilFormat = D3DFMT_D16;
	m_PresentParameters.PresentationInterval = D3DPRESENT_INTERVAL_DEFAULT;
	//m_PresentParameters.PresentationInterval = D3DPRESENT_INTERVAL_IMMEDIATE;
	m_PresentParameters.BackBufferWidth = width;
	m_PresentParameters.BackBufferHeight = height;
	//m_PresentParameters.MultiSampleType = D3DMULTISAMPLE_4_SAMPLES;

	// Behaviour flags for device creation, derived from the device caps
	DWORD vp = 0;

	if(m_Caps.DevCaps & D3DDEVCAPS_PUREDEVICE)
	{
		vp |= D3DCREATE_PUREDEVICE;
		TRACE(TEXT("Info: Using pure device.\n"));
	}

	if(m_Caps.DevCaps & D3DDEVCAPS_HWTRANSFORMANDLIGHT)
	{
		vp |= D3DCREATE_HARDWARE_VERTEXPROCESSING;
		TRACE(TEXT("Info: Using hardware vertex processing.\n"));
	}
	else
	{
		vp |= D3DCREATE_SOFTWARE_VERTEXPROCESSING;
		TRACE(TEXT("Info: Using software vertex processing.\n"));
	}

	if(FAILED(m_Direct3D->CreateDevice(D3DADAPTER_DEFAULT,D3DDEVTYPE_HAL,m_Hwnd,vp|D3DCREATE_MULTITHREADED,&m_PresentParameters,&m_Device)))
	{
		TRACE(TEXT("Error: Failed to create direct 3d device.\n"));
		return E_FAIL;
	}

	if(FAILED(Restore()))
	{
		TRACE(TEXT("Error: Failed to initaly restore device.\n"));
		return E_FAIL;
	}

	return S_OK;
}
// Allocates the buffers of the component's output port.
//
// use_buffers - when true, the buffer memory is allocated here with
//               _aligned_malloc and handed to the component via OMX_UseBuffer;
//               when false, the component allocates it via OMX_AllocateBuffer.
//
// Reads the port definition, moves the component to the idle state if needed,
// starts enabling the port, creates nBufferCountActual buffers and finally
// waits for the port-enable command to complete.
//
// Returns OMX_ErrorNone on success, OMX_ErrorUndefined without a component
// handle, OMX_ErrorInsufficientResources if client-side memory cannot be
// allocated, or the error reported by the failing OMX call.
OMX_ERRORTYPE COMXCoreComponent::AllocOutputBuffers(bool use_buffers /* = false */)
{
  OMX_ERRORTYPE omx_err = OMX_ErrorNone;

  if(!m_handle)
    return OMX_ErrorUndefined;

  m_omx_output_use_buffers = use_buffers;

  OMX_PARAM_PORTDEFINITIONTYPE portFormat;
  OMX_INIT_STRUCTURE(portFormat);
  portFormat.nPortIndex = m_output_port;

  omx_err = OMX_GetParameter(m_handle, OMX_IndexParamPortDefinition, &portFormat);
  if(omx_err != OMX_ErrorNone)
    return omx_err;

  if(GetState() != OMX_StateIdle)
  {
    if(GetState() != OMX_StateLoaded)
      SetStateForComponent(OMX_StateLoaded);

    SetStateForComponent(OMX_StateIdle);
  }

  omx_err = EnablePort(m_output_port, false);
  if(omx_err != OMX_ErrorNone)
    return omx_err;

  m_output_alignment     = portFormat.nBufferAlignment;
  m_output_buffer_count  = portFormat.nBufferCountActual;
  m_output_buffer_size   = portFormat.nBufferSize;

  CLog::Log(LOGDEBUG, "COMXCoreComponent::AllocOutputBuffers component(%s) - port(%d), nBufferCountMin(%lu), nBufferCountActual(%lu), nBufferSize(%lu) nBufferAlignmen(%lu)\n",
            m_componentName.c_str(), m_output_port, portFormat.nBufferCountMin,
            portFormat.nBufferCountActual, portFormat.nBufferSize, portFormat.nBufferAlignment);

  for (size_t i = 0; i < portFormat.nBufferCountActual; i++)
  {
    OMX_BUFFERHEADERTYPE *buffer = NULL;
    OMX_U8* data = NULL;

    if(m_omx_output_use_buffers)
    {
      data = (OMX_U8*)_aligned_malloc(portFormat.nBufferSize, m_output_alignment);
      if(!data)
      {
        // Do not hand a null buffer to the component; report the shortage directly.
        CLog::Log(LOGERROR, "COMXCoreComponent::AllocOutputBuffers component(%s) - failed to allocate %lu bytes\n",
                  m_componentName.c_str(), portFormat.nBufferSize);
        return OMX_ErrorInsufficientResources;
      }
      omx_err = OMX_UseBuffer(m_handle, &buffer, m_output_port, NULL, portFormat.nBufferSize, data);
    }
    else
    {
      omx_err = OMX_AllocateBuffer(m_handle, &buffer, m_output_port, NULL, portFormat.nBufferSize);
    }

    if(omx_err != OMX_ErrorNone)
    {
      // Name the call that actually failed.
      CLog::Log(LOGERROR, "COMXCoreComponent::AllocOutputBuffers component(%s) - %s failed with omx_err(0x%x)\n",
                m_componentName.c_str(),
                m_omx_output_use_buffers ? "OMX_UseBuffer" : "OMX_AllocateBuffer",
                omx_err);

      if(m_omx_output_use_buffers && data)
        _aligned_free(data);

      return omx_err;
    }

    buffer->nOutputPortIndex = m_output_port;
    buffer->nFilledLen       = 0;
    buffer->nOffset          = 0;
    // The buffer's index is stored in the application-private slot.
    buffer->pAppPrivate      = (void*)i;
    m_omx_output_buffers.push_back(buffer);
    m_omx_output_available.push(buffer);
  }

  omx_err = WaitForCommand(OMX_CommandPortEnable, m_output_port);

  m_flush_output = false;

  return omx_err;
}
/// Resizes a 16-byte aligned block, or allocates a new one when there is no
/// existing block.
/// @param lpData Existing block, or null to allocate a fresh one.
/// @param dwSize Requested size in bytes.
/// @return The pointer returned by _aligned_realloc (existing block) or
///         _aligned_malloc (no existing block).
void* DefaultAlloc::_ReAllocate(LPVOID lpData, size_t dwSize)
{
    if (lpData)
    {
        return _aligned_realloc(lpData, dwSize, 16);
    }

    return _aligned_malloc(dwSize, 16);
}
/**
 * Allocation hook for the MPEG-2 decoder: every request is served by
 * _aligned_malloc with a 64-byte alignment.
 *
 * @param size   Number of bytes requested.
 * @param reason Allocation reason supplied by the caller (not used).
 * @return The pointer returned by _aligned_malloc.
 */
void* mpeg2_malloc(size_t size, mpeg2_alloc_t reason)
{
    (void)reason;

    void* const block = _aligned_malloc(size, 64);
    return block;
}
// Converts an amount of work done `iterations` times in `elapsed` clock ticks
// into the rate figure printed by the benchmark. An interval of zero ticks is
// treated as one tick so the division is always defined.
static int GSBenchmarkRate(float amount, int iterations, clock_t elapsed)
{
	if(elapsed <= 0)
	{
		elapsed = 1;
	}

	return (int)(amount * iterations / elapsed / 1000);
}

// Local-memory benchmark entry point.
//
// For texture widths 2^5..2^10 and each pixel storage format in the table it
// times the format's write-image, read-image and read-texture routines (plus
// the paletted read-texture routine when the format has a palette) and appends
// the rates to c:\temp1\log.txt. Posts a quit message when done.
//
// None of the parameters are used.
EXPORT_C GSBenchmark(HWND hwnd, HINSTANCE hinst, LPSTR lpszCmdLine, int nCmdShow)
{
	::SetPriorityClass(::GetCurrentProcess(), HIGH_PRIORITY_CLASS);

	FILE* file = fopen("c:\\temp1\\log.txt", "a");

	if(file == NULL)
	{
		// Nothing can be reported without the log file.
		PostQuitMessage(0);

		return;
	}

	fprintf(file, "-------------------------\n\n");

	if(1)
	{
		GSLocalMemory * pMem = new GSLocalMemory();
		GSLocalMemory& mem(*pMem);

		static struct {int psm; const char* name;} s_format[] =
		{
			{PSM_PSMCT32, "32"},
			{PSM_PSMCT24, "24"},
			{PSM_PSMCT16, "16"},
			{PSM_PSMCT16S, "16S"},
			{PSM_PSMT8, "8"},
			{PSM_PSMT4, "4"},
			{PSM_PSMT8H, "8H"},
			{PSM_PSMT4HL, "4HL"},
			{PSM_PSMT4HH, "4HH"},
			{PSM_PSMZ32, "32Z"},
			{PSM_PSMZ24, "24Z"},
			{PSM_PSMZ16, "16Z"},
			{PSM_PSMZ16S, "16ZS"},
		};

		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);

		if(ptr == NULL)
		{
			delete pMem;
			fclose(file);
			PostQuitMessage(0);

			return;
		}

		for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;

		//

		for(int tbw = 5; tbw <= 10; tbw++)
		{
			// Fewer iterations for larger textures
			int n = 256 << ((10 - tbw) * 2);

			int w = 1 << tbw;
			int h = 1 << tbw;

			fprintf(file, "%d x %d\n\n", w, h);

			for(size_t i = 0; i < countof(s_format); i++)
			{
				const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[s_format[i].psm];

				GSLocalMemory::writeImage wi = psm.wi;
				GSLocalMemory::readImage ri = psm.ri;
				GSLocalMemory::readTexture rtx = psm.rtx;
				GSLocalMemory::readTexture rtxP = psm.rtxP;

				GIFRegBITBLTBUF BITBLTBUF;

				BITBLTBUF.SBP = 0;
				BITBLTBUF.SBW = w / 64;
				BITBLTBUF.SPSM = s_format[i].psm;
				BITBLTBUF.DBP = 0;
				BITBLTBUF.DBW = w / 64;
				BITBLTBUF.DPSM = s_format[i].psm;

				GIFRegTRXPOS TRXPOS;

				TRXPOS.SSAX = 0;
				TRXPOS.SSAY = 0;
				TRXPOS.DSAX = 0;
				TRXPOS.DSAY = 0;

				GIFRegTRXREG TRXREG;

				TRXREG.RRW = w;
				TRXREG.RRH = h;

				GSVector4i r(0, 0, w, h);

				GIFRegTEX0 TEX0;

				TEX0.TBP0 = 0;
				TEX0.TBW = w / 64;
				// GetOffset below reads TEX0.PSM, so it must be set to the
				// format under test rather than left uninitialised.
				TEX0.PSM = s_format[i].psm;

				GIFRegTEXA TEXA;

				TEXA.TA0 = 0;
				TEXA.TA1 = 0x80;
				TEXA.AEM = 0;

				int trlen = w * h * psm.trbpp / 8;
				int len = w * h * psm.bpp / 8;

				clock_t start, end;

				// The format name is a narrow string, so print it with the
				// narrow formatter like every other line of the log.
				fprintf(file, "[%4s] ", s_format[i].name);

				// write image

				start = clock();

				for(int j = 0; j < n; j++)
				{
					int x = 0;
					int y = 0;

					(mem.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
				}

				end = clock();

				fprintf(file, "%6d %6d | ", GSBenchmarkRate((float)trlen, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));

				// read image

				start = clock();

				for(int j = 0; j < n; j++)
				{
					int x = 0;
					int y = 0;

					(mem.*ri)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);
				}

				end = clock();

				fprintf(file, "%6d %6d | ", GSBenchmarkRate((float)trlen, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));

				// read texture

				const GSOffset* o = mem.GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);

				start = clock();

				for(int j = 0; j < n; j++)
				{
					(mem.*rtx)(o, r, ptr, w * 4, TEXA);
				}

				end = clock();

				fprintf(file, "%6d %6d ", GSBenchmarkRate((float)len, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));

				// read texture, paletted variant

				if(psm.pal > 0)
				{
					start = clock();

					for(int j = 0; j < n; j++)
					{
						(mem.*rtxP)(o, r, ptr, w, TEXA);
					}

					end = clock();

					fprintf(file, "| %6d %6d ", GSBenchmarkRate((float)len, n, end - start), GSBenchmarkRate((float)(w * h), n, end - start));
				}

				fprintf(file, "\n");

				fflush(file);
			}

			fprintf(file, "\n");
		}

		_aligned_free(ptr);

		delete pMem;
	}

	//

	if(0)
	{
		// Disabled single-shot write test, kept for reference.
		GSLocalMemory * pMem2 = new GSLocalMemory();
		GSLocalMemory& mem2(*pMem2);

		uint8* ptr = (uint8*)_aligned_malloc(1024 * 1024 * 4, 32);

		if(ptr != NULL)
		{
			for(int i = 0; i < 1024 * 1024 * 4; i++) ptr[i] = (uint8)i;

			const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[PSM_PSMCT32];

			GSLocalMemory::writeImage wi = psm.wi;

			GIFRegBITBLTBUF BITBLTBUF;

			BITBLTBUF.DBP = 0;
			BITBLTBUF.DBW = 32;
			BITBLTBUF.DPSM = PSM_PSMCT32;

			GIFRegTRXPOS TRXPOS;

			TRXPOS.DSAX = 0;
			TRXPOS.DSAY = 1;

			GIFRegTRXREG TRXREG;

			TRXREG.RRW = 256;
			TRXREG.RRH = 256;

			int trlen = 256 * 256 * psm.trbpp / 8;

			int x = 0;
			int y = 0;

			(mem2.*wi)(x, y, ptr, trlen, BITBLTBUF, TRXPOS, TRXREG);

			_aligned_free(ptr);
		}

		delete pMem2;
	}

	//

	fclose(file);

	PostQuitMessage(0);
}
/*
 * Starts watching `filename` for changes.
 *
 * If `filename` names a directory, that directory itself is watched. Otherwise
 * the path is split into directory and file parts (for both the long and the
 * short path form) and the containing directory is watched. The directory
 * handle is associated with the loop's completion port and the first
 * ReadDirectoryChangesW request is issued (non-recursive).
 *
 * `flags` must be 0 (asserted). Returns 0 on success. On failure every
 * resource acquired here is released, the loop error is set from the failing
 * system call and -1 is returned. Out-of-memory conditions are fatal.
 */
int uv_fs_event_init(uv_loop_t* loop, uv_fs_event_t* handle,
    const char* filename, uv_fs_event_cb cb, int flags) {
  int name_size, is_path_dir;
  DWORD attr, last_error;
  wchar_t* dir = NULL, *dir_to_watch, *filenamew = NULL;
  wchar_t short_path[MAX_PATH];

  /* We don't support any flags yet. */
  assert(!flags);

  uv_fs_event_init_handle(loop, handle, filename, cb);

  /* Convert name to UTF16. */
  name_size = uv_utf8_to_utf16(filename, NULL, 0) * sizeof(wchar_t);
  filenamew = (wchar_t*)malloc(name_size);
  if (!filenamew) {
    uv_fatal_error(ERROR_OUTOFMEMORY, "malloc");
  }

  if (!uv_utf8_to_utf16(filename, filenamew,
      name_size / sizeof(wchar_t))) {
    /* Go through the common cleanup so that filenamew is not leaked. */
    last_error = GetLastError();
    goto error;
  }

  /* Determine whether filename is a file or a directory. */
  attr = GetFileAttributesW(filenamew);
  if (attr == INVALID_FILE_ATTRIBUTES) {
    last_error = GetLastError();
    goto error;
  }

  is_path_dir = (attr & FILE_ATTRIBUTE_DIRECTORY) ? 1 : 0;

  if (is_path_dir) {
    /* filename is a directory, so that's the directory that we will watch. */
    handle->dirw = filenamew;
    dir_to_watch = filenamew;
  } else {
    /*
     * filename is a file. So we split filename into dir & file parts, and
     * watch the dir directory.
     */

    /* Convert to short path. */
    if (!GetShortPathNameW(filenamew, short_path, ARRAY_SIZE(short_path))) {
      last_error = GetLastError();
      goto error;
    }

    if (uv_split_path(filenamew, &dir, &handle->filew) != 0) {
      last_error = GetLastError();
      goto error;
    }

    if (uv_split_path(short_path, NULL, &handle->short_filew) != 0) {
      last_error = GetLastError();
      goto error;
    }

    dir_to_watch = dir;
    free(filenamew);
    filenamew = NULL;
  }

  handle->dir_handle = CreateFileW(dir_to_watch,
                                   FILE_LIST_DIRECTORY,
                                   FILE_SHARE_READ | FILE_SHARE_DELETE |
                                     FILE_SHARE_WRITE,
                                   NULL,
                                   OPEN_EXISTING,
                                   FILE_FLAG_BACKUP_SEMANTICS |
                                     FILE_FLAG_OVERLAPPED,
                                   NULL);

  /* The directory name is only needed to open the handle. */
  if (dir) {
    free(dir);
    dir = NULL;
  }

  if (handle->dir_handle == INVALID_HANDLE_VALUE) {
    last_error = GetLastError();
    goto error;
  }

  if (CreateIoCompletionPort(handle->dir_handle,
                             loop->iocp,
                             (ULONG_PTR)handle,
                             0) == NULL) {
    last_error = GetLastError();
    goto error;
  }

  handle->buffer = (char*)_aligned_malloc(uv_directory_watcher_buffer_size,
    sizeof(DWORD));
  if (!handle->buffer) {
    uv_fatal_error(ERROR_OUTOFMEMORY, "malloc");
  }

  memset(&(handle->req.overlapped), 0, sizeof(handle->req.overlapped));

  if (!ReadDirectoryChangesW(handle->dir_handle,
                             handle->buffer,
                             uv_directory_watcher_buffer_size,
                             FALSE,
                             FILE_NOTIFY_CHANGE_FILE_NAME      |
                               FILE_NOTIFY_CHANGE_DIR_NAME     |
                               FILE_NOTIFY_CHANGE_ATTRIBUTES   |
                               FILE_NOTIFY_CHANGE_SIZE         |
                               FILE_NOTIFY_CHANGE_LAST_WRITE   |
                               FILE_NOTIFY_CHANGE_LAST_ACCESS  |
                               FILE_NOTIFY_CHANGE_CREATION     |
                               FILE_NOTIFY_CHANGE_SECURITY,
                             NULL,
                             &handle->req.overlapped,
                             NULL)) {
    last_error = GetLastError();
    goto error;
  }

  handle->req_pending = 1;
  return 0;

error:
  if (handle->filename) {
    free(handle->filename);
    handle->filename = NULL;
  }

  if (handle->filew) {
    free(handle->filew);
    handle->filew = NULL;
  }

  if (handle->short_filew) {
    free(handle->short_filew);
    handle->short_filew = NULL;
  }

  /*
   * In the directory case handle->dirw aliases filenamew; clear it before the
   * buffer is freed so the handle is not left with a dangling pointer.
   */
  if (filenamew && handle->dirw == filenamew) {
    handle->dirw = NULL;
  }

  free(filenamew);

  /* Still set if a failure happened between uv_split_path and CreateFileW. */
  free(dir);

  if (handle->dir_handle != INVALID_HANDLE_VALUE) {
    CloseHandle(handle->dir_handle);
    handle->dir_handle = INVALID_HANDLE_VALUE;
  }

  if (handle->buffer) {
    _aligned_free(handle->buffer);
    handle->buffer = NULL;
  }

  uv__set_sys_error(loop, last_error);
  return -1;
}
/**
 * Decode one bitmap update into a freshly allocated 32-bit-per-pixel buffer.
 *
 * A 16-byte aligned buffer of width * height * 4 bytes is allocated and stored
 * in bitmap->data. Compressed input is decoded with the interleaved codec when
 * bpp < 32 and with the planar codec otherwise; uncompressed input is
 * converted with freerdp_image_copy(). On success bitmap->compressed,
 * bitmap->length and bitmap->bpp are updated.
 *
 * @param context    client context (an xfContext)
 * @param bitmap     bitmap whose data/length/bpp/compressed fields are filled in
 * @param data       source pixel data
 * @param width      bitmap width in pixels
 * @param height     bitmap height in pixels
 * @param bpp        bits per pixel of the source data
 * @param length     size of the source data in bytes
 * @param compressed TRUE if the source data is compressed
 * @param codecId    not used by this implementation
 *
 * @return TRUE on success; FALSE on invalid dimensions, allocation failure,
 *         codec preparation failure or decompression failure.
 */
BOOL xf_Bitmap_Decompress(rdpContext* context, rdpBitmap* bitmap,
		BYTE* data, int width, int height, int bpp, int length,
		BOOL compressed, int codecId)
{
	int status;
	UINT32 size;
	BYTE* pSrcData;
	BYTE* pDstData;
	UINT32 SrcSize;
	UINT32 SrcFormat;
	xfContext* xfc = (xfContext*) context;

	if ((width < 0) || (height < 0))
		return FALSE;

	/* Reject dimensions whose 32bpp buffer size does not fit in 32 bits. */
	if ((height != 0) && ((UINT32) width > (0xFFFFFFFF / 4) / (UINT32) height))
		return FALSE;

	/*
	 * The size was previously held in a UINT16, which silently truncated for
	 * anything of 16384 pixels or more (a 128x128 bitmap produced size 0).
	 */
	size = (UINT32) width * (UINT32) height * 4;

	bitmap->data = (BYTE*) _aligned_malloc(size, 16);

	if (!bitmap->data)
		return FALSE;

	pSrcData = data;
	SrcSize = (UINT32) length;
	pDstData = bitmap->data;

	if (compressed)
	{
		if (bpp < 32)
		{
			if (!freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_INTERLEAVED))
				return FALSE;

			status = interleaved_decompress(xfc->codecs->interleaved, pSrcData, SrcSize, bpp,
					&pDstData, xfc->format, -1, 0, 0, width, height, xfc->palette);
		}
		else
		{
			if (!freerdp_client_codecs_prepare(xfc->codecs, FREERDP_CODEC_PLANAR))
				return FALSE;

			status = planar_decompress(xfc->codecs->planar, pSrcData, SrcSize,
					&pDstData, xfc->format, -1, 0, 0, width, height, TRUE);
		}

		if (status < 0)
		{
			WLog_ERR(TAG, "Bitmap Decompression Failed");
			return FALSE;
		}
	}
	else
	{
		SrcFormat = gdi_get_pixel_format(bpp, TRUE);

		/* NOTE(review): the copy status is not checked, as before. */
		status = freerdp_image_copy(pDstData, xfc->format, -1, 0, 0,
				width, height, pSrcData, SrcFormat, -1, 0, 0, xfc->palette);
	}

	bitmap->compressed = FALSE;
	bitmap->length = size;
	bitmap->bpp = (xfc->depth >= 24) ? 32 : xfc->depth;

	return TRUE;
}
/**
 * Allocate and initialise an RPC client object bound to the given transport.
 *
 * The object is zero-filled, then its NTLM contexts, fragment and stub
 * buffers, PDU, send/receive lists, keep-alive settings, virtual connection
 * and cookie table are created, and finally the RPC client is set up in
 * synchronous send/receive mode.
 *
 * @param transport transport the RPC layer runs on; its settings are shared
 *
 * @return the new rdpRpc, or NULL if the rdpRpc structure itself could not be
 *         allocated.
 *
 * NOTE(review): the results of the nested allocations (buffers, PDU, list
 * headers, rpc->client) are used without being checked, exactly as before.
 */
rdpRpc* rpc_new(rdpTransport* transport)
{
	rdpRpc* rpc = (rdpRpc*) malloc(sizeof(rdpRpc));

	if (rpc == NULL)
		return NULL;

	ZeroMemory(rpc, sizeof(rdpRpc));

	rpc->State = RPC_CLIENT_STATE_INITIAL;

	rpc->transport = transport;
	rpc->settings = transport->settings;

	rpc->send_seq_num = 0;
	rpc->ntlm = ntlm_new();

	rpc->NtlmHttpIn = ntlm_http_new();
	rpc->NtlmHttpOut = ntlm_http_new();

	rpc_ntlm_http_init_channel(rpc, rpc->NtlmHttpIn, TSG_CHANNEL_IN);
	rpc_ntlm_http_init_channel(rpc, rpc->NtlmHttpOut, TSG_CHANNEL_OUT);

	rpc->FragBufferSize = 20;
	rpc->FragBuffer = (BYTE*) malloc(rpc->FragBufferSize);

	rpc->StubOffset = 0;
	rpc->StubBufferSize = 20;
	rpc->StubLength = 0;
	rpc->StubFragCount = 0;
	/*
	 * Size the stub buffer from its own size field; it used to be sized from
	 * FragBufferSize, which only worked because both start at 20.
	 */
	rpc->StubBuffer = (BYTE*) malloc(rpc->StubBufferSize);

	rpc->rpc_vers = 5;
	rpc->rpc_vers_minor = 0;

	/* little-endian data representation */
	rpc->packed_drep[0] = 0x10;
	rpc->packed_drep[1] = 0x00;
	rpc->packed_drep[2] = 0x00;
	rpc->packed_drep[3] = 0x00;

	rpc->max_xmit_frag = 0x0FF8;
	rpc->max_recv_frag = 0x0FF8;

	rpc->pdu = (RPC_PDU*) _aligned_malloc(sizeof(RPC_PDU), MEMORY_ALLOCATION_ALIGNMENT);

	rpc->SendQueue = (PSLIST_HEADER) _aligned_malloc(sizeof(SLIST_HEADER), MEMORY_ALLOCATION_ALIGNMENT);
	InitializeSListHead(rpc->SendQueue);

	rpc->ReceiveQueue = (PSLIST_HEADER) _aligned_malloc(sizeof(SLIST_HEADER), MEMORY_ALLOCATION_ALIGNMENT);
	InitializeSListHead(rpc->ReceiveQueue);

	rpc->ReceiveWindow = 0x00010000;

	rpc->ChannelLifetime = 0x40000000;
	rpc->ChannelLifetimeSet = 0;

	rpc->KeepAliveInterval = 300000;
	rpc->CurrentKeepAliveInterval = rpc->KeepAliveInterval;
	rpc->CurrentKeepAliveTime = 0;

	rpc->VirtualConnection = rpc_client_virtual_connection_new(rpc);
	rpc->VirtualConnectionCookieTable = rpc_virtual_connection_cookie_table_new(rpc);

	rpc->call_id = 1;

	rpc_client_new(rpc);

	rpc->client->SynchronousSend = TRUE;
	rpc->client->SynchronousReceive = TRUE;

	return rpc;
}
/// Builds a search context over the points in [points_begin, points_end).
///
/// The points are copied into mPoints and sorted by ascending rank, a memory
/// pool sized from the point count and kBinSize is created for the KD-tree
/// nodes, a fixed block of 2000 SearchIterator slots is allocated, and the
/// KD-tree is built (bulk-filled when _BALANCE_KDTREE is defined, otherwise
/// point by point) and finalised.
///
/// @param points_begin first point of the input range
/// @param points_end   one past the last point of the input range
SearchContext::SearchContext(const Point* points_begin, const Point* points_end)
    : mTree(nullptr)
    , mKDTreeMemPool(nullptr)
    , mIteratorMemPool(nullptr)
{
#ifdef _ENABLE_STATS_LOGGING
    spxCurrentContext = this;
#endif // _ENABLE_STATS_LOGGING

    int64_t pointCount = points_end - points_begin;

    {
        // Add and sort all points on rank
        mPoints.assign(points_begin, points_end);
        std::sort(mPoints.begin(), mPoints.end(), [] (const Point& inLHS, const Point& inRHS)
        {
            return inLHS.rank < inRHS.rank;
        });
    }

    {
        // Create KDTree Mem Pool
        int minimumNodeCount = (int) (pointCount / kBinSize) + 1;
        int byteCount = sizeof(KdTree<Axis_X>) * minimumNodeCount;
        mKDTreeMemPool = new MemoryPool(byteCount);
    }

    {
        // Create iterator mem pool
        // DANGEROUS: Can overrun memory here,
        // but it's marginally faster than using a checked mem pool...
        static const int kIteratorMemPoolMaxCount = 2000;
        mIteratorMemPool = (SearchIterator*) _aligned_malloc(sizeof(SearchIterator)*kIteratorMemPoolMaxCount, __alignof(SearchIterator));
    }

    {
        // Create KDTree
        mTree = mKDTreeMemPool->Alloc< KdTree<Axis_X> >();
        INC_KDTREE_COUNT;

#ifdef _BALANCE_KDTREE
        std::vector<CoordPairAndRank> coords;
        coords.reserve(mPoints.size());
        for (auto& p : mPoints)
        {
            coords.emplace_back( p );
        }

        mTree->Fill(*mKDTreeMemPool, coords);
#else
        for (auto& p : mPoints)
        {
            mTree->Add(*mKDTreeMemPool, p);
        }
#endif // _BALANCE_KDTREE

        mTree->Finalise();
    }

#ifdef _PRINT_POINTS_AND_QUERIES
    if (pointCount > 1) // Avoid robustness test.
    {
        // Pass [data(), data() + size()) ranges: the previous &(*end())
        // dereferenced the past-the-end iterator, which is undefined behaviour
        // (and asserts with checked iterators).
        DataPrinter::DeleteFiles();
        DataPrinter::PrintPoints(mPoints.data(), mPoints.data() + mPoints.size());

        auto copyPoints = mPoints;

        std::sort(copyPoints.begin(), copyPoints.end(), [] (const Point& inLHS, const Point& inRHS)
        {
            return inLHS.x < inRHS.x;
        });
        DataPrinter::PrintPoints(copyPoints.data(), copyPoints.data() + copyPoints.size(), "InputSortedOnX.txt");

        std::sort(copyPoints.begin(), copyPoints.end(), [] (const Point& inLHS, const Point& inRHS)
        {
            return inLHS.y < inRHS.y;
        });
        DataPrinter::PrintPoints(copyPoints.data(), copyPoints.data() + copyPoints.size(), "InputSortedOnY.txt");
    }
#endif // _PRINT_POINTS_AND_QUERIES
}
/**
 * Allocate a block of memory, using whichever aligned allocator the build
 * configuration selected.
 *
 * @param size number of bytes to allocate
 * @return pointer to the block, or NULL if size exceeds max_alloc_size - 32
 *         or the underlying allocator fails. A request for 0 bytes that yields
 *         NULL is retried as a 1-byte allocation.
 */
void *av_malloc(size_t size)
{
    void *ptr = NULL;
#if CONFIG_MEMALIGN_HACK
    long diff;
#endif

    /* let's disallow possibly ambiguous cases */
    if (size > (max_alloc_size - 32))
        return NULL;

#if CONFIG_MEMALIGN_HACK
    /* Manual alignment: over-allocate by ALIGN bytes, advance the pointer by
     * 1..ALIGN bytes to the next ALIGN boundary (assuming ALIGN is a power of
     * two) and record that offset in the byte just before the returned
     * address -- presumably so the matching free routine can recover the
     * original pointer. */
    ptr = malloc(size + ALIGN);
    if (!ptr)
        return ptr;
    diff              = ((~(long)ptr)&(ALIGN - 1)) + 1;
    ptr               = (char *)ptr + diff;
    ((char *)ptr)[-1] = diff;
#elif HAVE_POSIX_MEMALIGN
    /* A zero size skips the call entirely and leaves ptr NULL. */
    if (size) //OS X on SDK 10.6 has a broken posix_memalign implementation
    if (posix_memalign(&ptr, ALIGN, size))
        ptr = NULL;
#elif HAVE_ALIGNED_MALLOC
    ptr = _aligned_malloc(size, ALIGN);
#elif HAVE_MEMALIGN
    /* The two branches pass the arguments to memalign in opposite order. */
#ifndef __DJGPP__
    ptr = memalign(ALIGN, size);
#else
    ptr = memalign(size, ALIGN);
#endif
    /* Why 64?
     * Indeed, we should align it:
     *   on  4 for 386
     *   on 16 for 486
     *   on 32 for 586, PPro - K6-III
     *   on 64 for K7 (maybe for P3 too).
     * Because L1 and L2 caches are aligned on those values.
     * But I don't want to code such logic here!
     */
    /* Why 32?
     * For AVX ASM. SSE / NEON needs only 16.
     * Why not larger? Because I did not see a difference in benchmarks ...
     */
    /* benchmarks with P3
     * memalign(64) + 1          3071, 3051, 3032
     * memalign(64) + 2          3051, 3032, 3041
     * memalign(64) + 4          2911, 2896, 2915
     * memalign(64) + 8          2545, 2554, 2550
     * memalign(64) + 16         2543, 2572, 2563
     * memalign(64) + 32         2546, 2545, 2571
     * memalign(64) + 64         2570, 2533, 2558
     *
     * BTW, malloc seems to do 8-byte alignment by default here.
     */
#else
    /* No aligned allocator configured: fall back to plain malloc. */
    ptr = malloc(size);
#ifdef USE_MEM_STATS
    /* Optional bookkeeping: log the allocation and track the peak usage. */
    printf("malloc(%ld) -> %p\n", size, ptr);
    if (ptr) {
        mem_cur += malloc_usable_size(ptr);
        if (mem_cur > mem_max) {
            mem_max = mem_cur;
            printf("mem_max=%d\n", mem_max);
        }
    }
#endif
#endif
    /* A zero-byte request that produced NULL is retried with one byte; size is
     * bumped as well so the poisoning below covers that byte. */
    if(!ptr && !size) {
        size = 1;
        ptr= av_malloc(1);
    }
#if CONFIG_MEMORY_POISONING
    /* Fill fresh memory with the poison pattern. */
    if (ptr)
        memset(ptr, FF_MEMORY_POISON, size);
#endif
    return ptr;
}
/**
 * Create the client-side surface for a graphics pipeline CreateSurface PDU.
 *
 * A zero-filled XRGB32 pixel buffer is allocated for the surface. When the X11
 * depth is 24 or 32 the XImage wraps that buffer directly; otherwise a second
 * zero-filled "stage" buffer in the display's pixel format is allocated and
 * the XImage wraps the stage buffer. Row strides are rounded up to a multiple
 * of xfc->scanline_pad / 8 bytes. The surface is registered with
 * context->SetSurfaceData().
 *
 * @param context       graphics pipeline client context (custom is the xfContext)
 * @param createSurface PDU carrying surface id, width, height and pixel format
 *
 * @return 1 on success, -1 if an allocation fails.
 */
int xf_CreateSurface(RdpgfxClientContext* context, RDPGFX_CREATE_SURFACE_PDU* createSurface)
{
	size_t size;
	UINT32 align;
	UINT32 bytesPerPixel;
	xfGfxSurface* surface;
	xfContext* xfc = (xfContext*) context->custom;

	surface = (xfGfxSurface*) calloc(1, sizeof(xfGfxSurface));

	if (!surface)
		return -1;

	surface->surfaceId = createSurface->surfaceId;
	surface->width = (UINT32) createSurface->width;
	surface->height = (UINT32) createSurface->height;
	surface->alpha = (createSurface->pixelFormat == PIXEL_FORMAT_ARGB_8888) ? TRUE : FALSE;
	surface->format = PIXEL_FORMAT_XRGB32;

	/* Row alignment in bytes; 0 (pad < 8 bits) means no padding is applied. */
	align = (UINT32) (xfc->scanline_pad / 8);

	/*
	 * Round the stride UP to the next multiple of align. The previous formula
	 * added (stride % align), which is only correct when the remainder happens
	 * to be exactly align / 2, and divided by zero when align was 0.
	 */
	surface->scanline = surface->width * 4;

	if (align && (surface->scanline % align))
		surface->scanline += align - (surface->scanline % align);

	size = (size_t) surface->scanline * surface->height;

	surface->data = (BYTE*) _aligned_malloc(size, 16);

	if (!surface->data)
	{
		free(surface);
		return -1;
	}

	ZeroMemory(surface->data, size);

	if ((xfc->depth == 24) || (xfc->depth == 32))
	{
		surface->image = XCreateImage(xfc->display, xfc->visual, xfc->depth, ZPixmap, 0,
				(char*) surface->data, surface->width, surface->height,
				xfc->scanline_pad, surface->scanline);
	}
	else
	{
		bytesPerPixel = (FREERDP_PIXEL_FORMAT_BPP(xfc->format) / 8);

		surface->stageStep = surface->width * bytesPerPixel;

		if (align && (surface->stageStep % align))
			surface->stageStep += align - (surface->stageStep % align);

		size = (size_t) surface->stageStep * surface->height;

		surface->stage = (BYTE*) _aligned_malloc(size, 16);

		if (!surface->stage)
		{
			/* data came from _aligned_malloc, so it must not go to free(). */
			_aligned_free(surface->data);
			free(surface);
			return -1;
		}

		ZeroMemory(surface->stage, size);

		surface->image = XCreateImage(xfc->display, xfc->visual, xfc->depth, ZPixmap, 0,
				(char*) surface->stage, surface->width, surface->height,
				xfc->scanline_pad, surface->stageStep);
	}

	context->SetSurfaceData(context, surface->surfaceId, (void*) surface);

	return 1;
}
/*---------------------------------------------------------------------------
// 16-byte aligned malloc
//
// Returns a block of `size` bytes whose address is a multiple of 16, or
// whatever _aligned_malloc reports on failure.
//-------------------------------------------------------------------------*/
void* xmm_malloc(size_t size)
{
	const size_t alignment = 16;
	void* block = _aligned_malloc(size, alignment);

	return block;
}
/* Default aligned allocator: forwards the request to the CRT's
 * _aligned_malloc, converting the alignment to size_t first. */
static void *b3AlignedAllocDefault(size_t size, int alignment)
{
	const size_t boundary = (size_t)alignment;
	void *block = _aligned_malloc(size, boundary);

	return block;
}
////////////////////////////////////////////////////////////////////////////////////////////////// /// Resize buffers and opengl texture void SoXipCPUMprRender::resizeBuffers(SbVec2s &size) { mMPRSize = size; #ifdef _CRT_ALLOCATION_DEFINED if (mMPRBuf) _aligned_free(mMPRBuf); if (mMPRCache) _aligned_free(mMPRCache); #else if (mMPRBuf) delete[] mMPRBuf; if (mMPRCache) delete[] mMPRCache; #endif if (!mMPRTexId) glGenTextures(1, &mMPRTexId); int volBytes = 1; if (mLutBuf) { volBytes = sizeof(float) * 4; mTexInternalFormat = GL_RGBA8; mTexType = GL_FLOAT; } else switch (mVolDataType) { case SbXipImage::UNSIGNED_BYTE: mTexInternalFormat = GL_LUMINANCE8; mTexType = GL_UNSIGNED_BYTE; break; case SbXipImage::BYTE: mTexInternalFormat = GL_LUMINANCE8; mTexType = GL_BYTE; break; case SbXipImage::UNSIGNED_SHORT: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_UNSIGNED_SHORT; volBytes = 2; break; case SbXipImage::SHORT: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_SHORT; volBytes = 2; break; case SbXipImage::UNSIGNED_INT: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_UNSIGNED_INT; volBytes = 4; break; case SbXipImage::INT: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_INT; volBytes = 4; break; case SbXipImage::FLOAT: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_FLOAT; volBytes = 4; break; case SbXipImage::DOUBLE: mTexInternalFormat = GL_LUMINANCE16; mTexType = GL_DOUBLE; volBytes = 8; break; default: mTexInternalFormat = 0; mTexType = 0; SoDebugError::postInfo("SoXipCPUMprRender::resizeBuffers", "Unsupported image type: %d!", mVolDataType); return; } #ifdef _CRT_ALLOCATION_DEFINED mMPRBuf = _aligned_malloc(size[0] * size[1] * volBytes, 16); mMPRCache = (mprCacheElem*) _aligned_malloc(sizeof(mprCacheElem) * size[0] * size[1], 16); #else mMPRBuf = (void*) new char[size[0] * size[1] * volBytes]; mMPRCache = new mprCacheElem[size[0] * size[1]]; #endif glBindTexture(GL_TEXTURE_2D, mMPRTexId); glTexImage2D(GL_TEXTURE_2D, 0, mTexInternalFormat, size[0], 
size[1], 0, mLutBuf ? GL_RGBA : GL_LUMINANCE, mTexType, 0); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); }