static BOOL WIN_Init(void) { WAVEFORMATEX wfe; WORD samplesize; MMRESULT mmr; int n; samplesize=1; if (md_mode&DMODE_STEREO) samplesize<<=1; if (md_mode&DMODE_16BITS) samplesize<<=1; wfe.wFormatTag=WAVE_FORMAT_PCM; wfe.nChannels=md_mode&DMODE_STEREO?2:1; wfe.nSamplesPerSec=md_mixfreq; wfe.nAvgBytesPerSec=md_mixfreq*samplesize; wfe.nBlockAlign=samplesize; wfe.wBitsPerSample=md_mode&DMODE_16BITS?16:8; wfe.cbSize=sizeof(wfe); mmr=waveOutOpen(&hwaveout,WAVE_MAPPER,&wfe,(DWORD)WIN_CallBack,0,CALLBACK_FUNCTION); if (mmr!=MMSYSERR_NOERROR) { _mm_errno=WIN_GetError(mmr); return 1; } buffersize=md_mixfreq*samplesize*BUFFERSIZE/1000; for (n=0;n<NUMBUFFERS;n++) { buffer[n]=_mm_malloc(buffersize); header[n].lpData=buffer[n]; header[n].dwBufferLength=buffersize; mmr=waveOutPrepareHeader(hwaveout,&header[n],sizeof(WAVEHDR)); if (!buffer[n]||mmr!=MMSYSERR_NOERROR) { if (!buffer[n]) _mm_errno=MMERR_OUT_OF_MEMORY; else _mm_errno=WIN_GetError(mmr); return 1; } } md_mode|=DMODE_SOFT_MUSIC|DMODE_SOFT_SNDFX; buffersout=nextbuffer=0; return VC_Init(); }
// Sums an array of floats; needed in replacement of Python sum()
//
// a            - pointer to num_elements floats (only read)
// num_elements - number of floats to add up
//
// Returns the single-precision sum accumulated in index order,
// or 0.0f when num_elements is 0.
float sum(float* a, uint_fast32_t num_elements)
{
    // The running total lives in the low lane of an SSE register. The
    // previous version seeded it from uninitialised heap memory, never fed
    // the partial result back into the next addition and leaked the buffer.
    __m128 acc = _mm_setzero_ps();

    for (uint_fast32_t i = 0; i < num_elements; i++)
    {
        acc = _mm_add_ss(acc, _mm_set_ss(a[i]));
    }

    return _mm_cvtss_f32(acc);
}
/*
 * Times NRUNS calls of dgemm() and returns the achieved rate in MFLOP/s.
 *
 * Each run works on a private 32-byte aligned copy of C so every iteration
 * starts from the same input; only the dgemm call itself (and the optional
 * PAPI start/stop macros) lies between the two mysecond() samples.  After
 * the last run the result of that run is copied back into C.
 *
 * Returns -1.0 if the scratch copy of C cannot be allocated (C is left
 * untouched in that case).
 */
double time_dgemm (const int M, const int N, const unsigned K,
                   const double alpha, const double *A, const int lda,
                   const double *B, const int ldb,
                   const double beta, double *C, const unsigned ldc)
{
  double mflops, mflop_s;
  double secs = -1;

  int num_iterations = NRUNS;
  int i;

  /* Widen before multiplying so the byte count is not computed in 32 bits. */
  const size_t c_bytes = (size_t)N * ldc * sizeof(double);
  double* Ca = (double*) _mm_malloc(c_bytes, 32);
  double cpu_time = 0;

  if (Ca == NULL)
    return -1.0;

  for (i = 0; i < num_iterations; ++i)
  {
    memcpy(Ca, C, c_bytes);
    cpu_time -= mysecond();
#ifdef PAPI
    PAPI_START;
#endif
    dgemm (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
#ifdef PAPI
    PAPI_STOP;
    PAPI_PRINT;
#endif
    cpu_time += mysecond();
  }

  /* 2*M*N*K floating point operations per dgemm call */
  mflops  = 2.0 * num_iterations*M*N*K/1.0e6;
  secs    = cpu_time;
  mflop_s = mflops/secs;

  memcpy(C, Ca, c_bytes);

#ifdef PAPI
  PAPI_FLUSH;
#endif

  _mm_free(Ca);

  return mflop_s;
}
int scanhash_lyra2z(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done) { size_t size = (int64_t) ((int64_t) 16 * 16 * 96); uint64_t *wholeMatrix = _mm_malloc(size, 64); uint32_t _ALIGN(128) hash[8]; uint32_t _ALIGN(128) endiandata[20]; uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t Htarg = ptarget[7]; const uint32_t first_nonce = pdata[19]; uint32_t nonce = first_nonce; if (opt_benchmark) ptarget[7] = 0x0000ff; for (int i=0; i < 19; i++) { be32enc(&endiandata[i], pdata[i]); } do { be32enc(&endiandata[19], nonce); lyra2z_hash(wholeMatrix, hash, endiandata); // lyra2z_hash(0, hash, endiandata); if (hash[7] <= Htarg && fulltest(hash, ptarget)) { work_set_target_ratio(work, hash); pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce; _mm_free(wholeMatrix); return 1; } nonce++; } while (nonce < max_nonce && !work_restart[thr_id].restart); pdata[19] = nonce; *hashes_done = pdata[19] - first_nonce + 1; _mm_free(wholeMatrix); return 0; }
void RendererLF::init(int w, int h, int pnum, int num, int var) { // Create scene scene = new Scene(w, h, pnum); scene->setScene(num, var); // set processor number and threads procnum = pnum; bar = new boost::barrier(procnum); t = new boost::thread *[procnum]; inc = ceilf(float(scene->height) / (float)procnum); // HDR values image = (Colour*)_mm_malloc(sizeof(Colour) * scene->width * scene->height, 16); for (int i = 0; i < scene->width * scene->height; i++) image[i].Set(0.0f, 0.0f, 0.0f); // Initalise framebuffers for data final = (uint8_t*)_mm_malloc(sizeof(uint8_t) * scene->width * scene->height * 3, 16);
/*! read texture from disk
 *
 *  Files whose lower-cased extension is "ptx" are delegated to
 *  loadPtexTexture(). Everything else is read through loadImage() and
 *  converted into a freshly allocated, 64-byte aligned RGBA8 buffer.
 *  The returned Texture is created with new. */
OBJScene::Texture *loadTexture(const FileName& fileName)
{
  std::string ext = strlwr(fileName.ext());
  if (ext == "ptx" ) return loadPtexTexture(fileName);

  Ref<Image> img = loadImage(fileName);

  /* Allocate only once we know this function produces the texture itself;
     creating it before the ptx check leaked it on that path. */
  OBJScene::Texture *texture = new OBJScene::Texture();

  texture->width         = img.ptr->width;
  texture->height        = img.ptr->height;
  texture->format        = OBJScene::Texture::RGBA8;
  texture->bytesPerTexel = 4;
  texture->data          = _mm_malloc(sizeof(int)*texture->width*texture->height,64);

  /* masks are only usable when the dimension is a power of two, else 0 */
  texture->width_mask    = isPowerOf2(texture->width) ? texture->width-1 : 0;
  texture->height_mask   = isPowerOf2(texture->height) ? texture->height-1 : 0;

  img.ptr->convertToRGBA8((unsigned char*)texture->data);
  return texture;
}
int init_find_index_array(int array_size, int *input_array) { int *temp_array_low = (int*) _mm_malloc(array_size * sizeof(int), 64); #pragma omp parallel { #pragma omp single { find_index_array_using_merge_sort(input_array, (input_array + array_size), temp_array_low, true); } } _mm_free(temp_array_low); return 1; }
// Fills info.memory with info.size bytes.
//
// When `enabled` is true a large-page VirtualAlloc is tried first and, on
// success, info.hugePages is set to info.pages.  In every other case
// (large pages not requested, or the request failed) the memory comes from
// a 4096-byte aligned _mm_malloc and info.hugePages stays 0.
void Mem::allocate(MemInfo &info, bool enabled)
{
    info.hugePages = 0;

    if (enabled) {
        void *largePages = VirtualAlloc(nullptr, info.size, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE);

        info.memory = static_cast<uint8_t*>(largePages);
        if (info.memory) {
            info.hugePages = info.pages;
            return;
        }
    }

    // regular allocation: either requested directly or used as fallback
    info.memory = static_cast<uint8_t*>(_mm_malloc(info.size, 4096));
}
// Allocates the eight 4096-byte aligned buffers used by the fixed-point
// convolution test: a reference set (*_ref) and a second set with the same
// input/output/bias sizes.  The output buffers include a border of center_x /
// center_y elements on each side.  Note that the non-reference kernel buffer
// is sized with sizeof(int32_t) per element although it is addressed as
// int16_t; that is kept exactly as before.
// All eight pointer references are overwritten; nothing is checked for NULL.
static void ult_nn_convolution_fixedpoint_comp_both_alloc(
    int16_t* &input,
    int16_t* &output,
    int32_t* &biases,
    int16_t* &kernel,
    int16_t* &input_ref,
    int16_t* &output_ref,
    int32_t* &biases_ref,
    int16_t* &kernel_ref,
    uint_least32_t num_output_feature_maps,
    uint_least32_t num_input_feature_maps,
    uint_least32_t output_width,
    uint_least32_t output_height,
    uint_least32_t input_width,
    uint_least32_t input_height,
    uint_least32_t kernel_width,
    uint_least32_t kernel_height,
    uint_least32_t center_x,
    uint_least32_t center_y
    )
{
    const size_t page_alignment = 4096;

    // element counts, computed once in the same integer width as the byte sizes
    const uint_least32_t input_elems  = input_width * input_height * num_input_feature_maps;
    const uint_least32_t output_elems = (output_width + 2 * center_x) * (output_height + 2 * center_y) * num_output_feature_maps;
    const uint_least32_t kernel_elems = num_input_feature_maps * num_output_feature_maps * kernel_width * kernel_height;

    const uint_least32_t input_bytes      = input_elems * sizeof(int16_t);
    const uint_least32_t output_bytes     = output_elems * sizeof(int16_t);
    const uint_least32_t bias_bytes       = num_output_feature_maps * sizeof(int32_t);
    const uint_least32_t ref_kernel_bytes = kernel_elems * sizeof(int16_t);
    const uint_least32_t opt_kernel_bytes = kernel_elems * sizeof(int32_t);

    // reference buffers first, in the original order
    input_ref  = (int16_t*)_mm_malloc(input_bytes, page_alignment);
    output_ref = (int16_t*)_mm_malloc(output_bytes, page_alignment);
    biases_ref = (int32_t*)_mm_malloc(bias_bytes, page_alignment);
    kernel_ref = (int16_t*)_mm_malloc(ref_kernel_bytes, page_alignment);

    // buffers for the implementation under test
    input  = (int16_t*)_mm_malloc(input_bytes, page_alignment);
    output = (int16_t*)_mm_malloc(output_bytes, page_alignment);
    biases = (int32_t*)_mm_malloc(bias_bytes, page_alignment);
    kernel = (int16_t*)_mm_malloc(opt_kernel_bytes, page_alignment);
}
static SWORD NDS_HW_SampleLoad(struct SAMPLOAD* sload, int type) { ASSERT(sload != NULL); SAMPLE *s = sload->sample; int handle; /* Find empty slot to put sample address in */ for(handle = 0; handle < NDS_HW_MAXSAMPLES; handle++) { if(ipc->samples[handle] == 0) { break; } } if(handle == MAXSAMPLEHANDLES) { _mm_errno = MMERR_OUT_OF_HANDLES; return -1; } /* Reality check for loop settings */ if (s->loopend > s->length) s->loopend = s->length; if (s->loopstart >= s->loopend) s->flags &= ~SF_LOOP; /* TODO difference between 8 and 16 bits? */ SL_SampleSigned(sload); ipc->samples[handle] = _mm_malloc(s->length * ((s->flags & SF_16BITS) ? 2 : 1)); if(ipc->samples[handle] == NULL) { _mm_errno = MMERR_SAMPLE_TOO_BIG; return -1; } /* read sample into buffer */ if (SL_Load(ipc->samples[handle], sload, s->length)) return -1; DC_FlushRange(ipc->samples[handle], s->length * ((s->flags & SF_16BITS) ? 2 : 1)); return handle; }
/*
 * Builds a newline separated list of the version strings of all registered
 * module loaders (no trailing newline after the last entry).
 *
 * Returns a buffer obtained from _mm_malloc, or NULL when no loader is
 * registered or the allocation fails.  The list is walked while holding the
 * `lists` mutex.
 */
MIKMODAPI CHAR* MikMod_InfoLoader(void)
{
	int len=0;
	MLOADER *l;
	CHAR *list=NULL;

	MUTEX_LOCK(lists);
	/* compute size of buffer */
	for(l=firstloader;l;l=l->next) len+=1+(l->next?1:0)+strlen(l->version);

	if(len)
		if((list=_mm_malloc(len*sizeof(CHAR)))) {
			/* Append at a moving write position.  Passing `list` as both
			   the destination and a "%s" argument of sprintf makes source
			   and destination overlap, which is undefined behaviour. */
			CHAR *tail=list;

			list[0]=0;
			/* list all registered module loaders */
			for(l=firstloader;l;l=l->next)
				tail+=sprintf(tail,(l->next)?"%s\n":"%s",l->version);
		}
	MUTEX_UNLOCK(lists);
	return list;
}
/*
 * Creates an MREADER whose callbacks are the _mm_RWopsReader_* functions
 * operating on the given SDL_RWops.  The stream end is determined once by
 * seeking to SEEK_END and the stream is then rewound to where it was.
 *
 * Returns NULL when the reader structure cannot be allocated.
 */
MREADER *_mm_new_rwops_reader(SDL_RWops * rw)
{
	int start;
	MRWOPSREADER* reader=(MRWOPSREADER*)_mm_malloc(sizeof(MRWOPSREADER));

	/* allocation failed: hand the NULL straight back */
	if (!reader)
		return (MREADER*)reader;

	reader->rw=rw;

	reader->core.Eof =&_mm_RWopsReader_Eof;
	reader->core.Read=&_mm_RWopsReader_Read;
	reader->core.Get =&_mm_RWopsReader_Get;
	reader->core.Seek=&_mm_RWopsReader_Seek;
	reader->core.Tell=&_mm_RWopsReader_Tell;

	/* RWops has no explicit end-of-file query, so locate the end by hand;
	   this needs a seekable RWop */
	start = SDL_RWtell(rw);
	reader->end = SDL_RWseek(rw, 0, SEEK_END);
	SDL_RWseek(rw, start, SEEK_SET);	/* restore the original position */

	return (MREADER*)reader;
}
/*! Returns `size` bytes aligned to `align` by forwarding to _mm_malloc.
 *  `align` is expected to be a power of two; this is only verified by an
 *  assert. The result of _mm_malloc is returned unchecked. */
void* alignedMalloc(size_t size, size_t align)
{
  // clearing the lowest set bit leaves zero only if at most one bit was set
  const size_t withoutLowestBit = align & (align - 1);
  assert(withoutLowestBit == 0);
  (void)withoutLowestBit; // keeps builds without asserts warning-free

  // Two alternatives are intentionally disabled here:
  //  - TBB's scalable_aligned_malloc (FIXME from the original author: the TBB
  //    allocator seemed to access an uninitialized value under valgrind),
  //  - a USE_MADVISE path that switched to 2M alignment plus os_madvise for
  //    requests of at least 16*PAGE_SIZE_2M bytes.
  return _mm_malloc(size, align);
}
/*
 * Reads a module comment of `len` bytes from modreader into a freshly
 * allocated, NUL terminated of.comment, translating '\r' to '\n'.
 * A comment that turns out to be empty is freed again and of.comment is
 * reset to NULL.
 *
 * Returns 0 only when the buffer cannot be allocated, 1 otherwise
 * (including len == 0, in which case nothing is read).
 */
BOOL ReadComment(UWORD len)
{
	if(len) {
		int i;

		if(!(of.comment=(CHAR*)_mm_malloc(len+1))) return 0;
		_mm_read_UBYTES(of.comment,len,modreader);

		/* translate IT linefeeds */
		for(i=0;i<len;i++)
			if(of.comment[i]=='\r') of.comment[i]='\n';

		of.comment[len]=0;	/* just in case */
	}

	/* of.comment may still be NULL here (len == 0 and nothing stored
	   earlier), so it must be tested before its first byte is inspected */
	if(of.comment && !of.comment[0]) {
		free(of.comment);
		of.comment=NULL;
	}
	return 1;
}
// Creates a dot product object from _n coefficients.
//
//  _h : coefficient array, _n values (only read)
//  _n : number of coefficients
//
// The coefficients are stored twice each in a 16-byte aligned array:
//   h = { _h[0], _h[0], _h[1], _h[1], ... _h[n-1], _h[n-1] }
//
// Returns the new object, or NULL if either allocation fails.
dotprod_crcf dotprod_crcf_create(float * _h,
                                 unsigned int _n)
{
    dotprod_crcf q = (dotprod_crcf)malloc(sizeof(struct dotprod_crcf_s));
    if (q == NULL)
        return NULL;

    q->n = _n;

    // allocate memory for coefficients, 16-byte aligned; widen first so
    // 2*n is not evaluated in unsigned int
    q->h = (float*) _mm_malloc( (size_t)2*q->n*sizeof(float), 16 );

    // a NULL result is only an error when something was actually requested
    if (q->h == NULL && q->n > 0) {
        free(q);
        return NULL;
    }

    // set coefficients, repeated
    unsigned int i;
    for (i=0; i<q->n; i++) {
        q->h[2*i+0] = _h[i];
        q->h[2*i+1] = _h[i];
    }

    // return object
    return q;
}
void myConvKernel_simd() { #pragma omp parallel { int tid = omp_get_thread_num(); int nthreads = omp_get_num_threads(); float *intermediate = (float*) _mm_malloc(sizeof(float) * nOutputPlanes, 512); assert(intermediate != NULL); #pragma omp for for (int opY = 0; opY < ioHeight; opY++) { for (int opX = 0; opX < ioWidth; opX++) { convolve3x3withPad_1elem(opY, opX, intermediate); } } _mm_free(intermediate); } }
void * tpcc_wl::threadInitWarehouse(void * This) { tpcc_wl * wl = (tpcc_wl *) This; int tid = ATOM_FETCH_ADD(wl->next_tid, 1); uint32_t wid = tid + 1; tpcc_buffer[tid] = (drand48_data *) _mm_malloc(sizeof(drand48_data), 64); assert((uint64_t)tid < g_num_wh); srand48_r(wid, tpcc_buffer[tid]); if (tid == 0) wl->init_tab_item(); wl->init_tab_wh( wid ); wl->init_tab_dist( wid ); wl->init_tab_stock( wid ); for (uint64_t did = 1; did <= DIST_PER_WARE; did++) { wl->init_tab_cust(did, wid); wl->init_tab_order(did, wid); for (uint64_t cid = 1; cid <= g_cust_per_dist; cid++) wl->init_tab_hist(cid, did, wid); } return NULL; }
// Allocates and zeroes the complex sample storage of an SSE3 buffer.
//
// The storage holds maxFilterPartLength * maxFilterParts elements of
// LXC_SSE3cpxFloat, aligned to LXC_SSE3_ALIGN, and is published through
// Buffer->buffer.
//
// Returns LXC_ERR_INVALID_INPUT when Buffer is NULL or either dimension is 0,
// LXC_ERR_DYNAMIC_MEMORY when the allocation fails (the buffer handle is
// reset in that case), LXC_NO_ERR otherwise.
LXC_ERROR_CODE LXC_SSE3Buffer_create(LXC_BUFFER *Buffer)
{
	if(!Buffer)
	{
		return LXC_ERR_INVALID_INPUT;
	}
	if(!Buffer->maxFilterPartLength || !Buffer->maxFilterParts)
	{
		return LXC_ERR_INVALID_INPUT;
	}

	const uint maxElements = Buffer->maxFilterPartLength*Buffer->maxFilterParts;

	// same aligned allocation on both targets, only the API name differs
#if defined(TARGET_WINDOWS)
	LXC_SSE3cpxFloat *p = (LXC_SSE3cpxFloat*)_aligned_malloc(sizeof(LXC_SSE3cpxFloat)*maxElements, LXC_SSE3_ALIGN);
#elif defined(TARGET_LINUX)
	LXC_SSE3cpxFloat *p = (LXC_SSE3cpxFloat*)_mm_malloc(sizeof(LXC_SSE3cpxFloat)*maxElements, LXC_SSE3_ALIGN);
#endif

	if(p)
	{
		// clear both components of every element
		for(uint idx = 0; idx != maxElements; ++idx)
		{
			p[idx][0] = 0.0f;
			p[idx][1] = 0.0f;
		}

		Buffer->buffer = (void*)p;
		return LXC_NO_ERR;
	}

	// allocation failed: reset the state of the buffer handle
	Buffer->maxFilterLength = 0;
	Buffer->maxFilterPartLength = 0;
	Buffer->maxFilterPartLength_NonZero = 0;
	Buffer->maxFilterParts = 0;
	Buffer->sampleFrequency = 0;
	Buffer->buffer = NULL;

	return LXC_ERR_DYNAMIC_MEMORY;
}
void DummyDataNode::forwardPropagate() { #ifdef RETURNALL return; #endif for(int i=0; i<tenTopData_.size(); i++) { int dtype = tenTopData_[i]->getDataType(); long long int bytes = tenTopData_[i]->getBufferSize(); if(dtype == DT_FLOAT) { float* top = (float*)(tenTopData_[i]->getBuffer()); fillData(top, bytes/sizeof(float)); #ifdef DEBUG printf("Executing FP %s: Data %p\n",node_name_.c_str(), top); #endif } else if(dtype == DT_BF16) { libxsmm_bfloat16* top = (libxsmm_bfloat16*)(tenTopData_[i]->getLPBuffer()); if(top == NULL) top = (libxsmm_bfloat16*)_mm_malloc(bytes/sizeof(libxsmm_bfloat16), 64); tenTopData_[i]->setLPBuffer(top); float *bot = (float*)tenTopData_[i]->getBuffer(); fillData(bot, top, bytes/sizeof(float)); #ifdef DEBUG printf("Executing FP %s: Data %p\n",node_name_.c_str(), top); #endif } else if(dtype == DT_INT) { int* top = (int*)(tenTopData_[i]->getBuffer()); for(long long int i=0; i<bytes/sizeof(int); i++) top[i] = rand()%1000; } } }
/*
 * Builds a newline separated, numbered list ("%2d <Version>") of all
 * registered device drivers, numbered from 1 in list order (no trailing
 * newline after the last entry).
 *
 * Returns a buffer obtained from _mm_malloc, or NULL when no driver is
 * registered or the allocation fails.  The list is walked while holding the
 * `lists` mutex.
 */
MIKMODAPI CHAR* MikMod_InfoDriver(void)
{
	int t,len=0;
	MDRIVER *l;
	CHAR *list=NULL;

	MUTEX_LOCK(lists);
	/* compute size of buffer */
	for(l=firstdriver;l;l=l->next) len+=4+(l->next?1:0)+strlen(l->Version);

	if(len)
		if((list=_mm_malloc(len*sizeof(CHAR)))) {
			/* Append at a moving write position.  Passing `list` as both
			   the destination and a "%s" argument of sprintf makes source
			   and destination overlap, which is undefined behaviour. */
			CHAR *tail=list;

			list[0]=0;
			/* list all registered device drivers : */
			for(t=1,l=firstdriver;l;l=l->next,t++)
				tail+=sprintf(tail,(l->next)?"%2d %s\n":"%2d %s",
				    t,l->Version);
		}
	MUTEX_UNLOCK(lists);
	return list;
}
// Aligned allocation from a MYO arena.
//
// When __offload_myoLoadLibrary() succeeds the request is forwarded to
// myo_wrapper.SharedAlignedArenaMalloc().  Otherwise the memory comes from
// _mm_malloc with the alignment raised to at least sizeof(void*); `arena` is
// not used on that path.
extern "C" void* _Offload_shared_aligned_arena_malloc(
    MyoArena arena,
    size_t size,
    size_t align
)
{
    OFFLOAD_DEBUG_TRACE(3, "%s(%u, %lld, %lld)\n",
        __func__, arena, size, align);

    if (!__offload_myoLoadLibrary()) {
        // fallback path: plain aligned allocation
        const size_t min_align = sizeof(void*);
        return _mm_malloc(size, align < min_align ? min_align : align);
    }

    void *p = myo_wrapper.SharedAlignedArenaMalloc(arena, size, align);
    OFFLOAD_DEBUG_TRACE(3, "%s(%u, %lld, %lld)->%p\n",
        __func__, arena, size, align, p);
    return p;
}
/*
 * Benchmark driver: fills an N-element, 16-byte aligned float array with
 * ones, calls reduction_sum() NREPS times and prints the last result next to
 * the expected value N together with the mean wall-clock time per call.
 *
 * Returns 0 on success, 1 if the array cannot be allocated.
 */
int main()
{
    int i;
    float *a;
    /* initialised so the report below is well defined even for NREPS == 0 */
    double t, sum = 0.0;

    a = (float *)_mm_malloc(sizeof(*a) * N, 16);
    if (a == NULL) {
        fprintf(stderr, "No enough memory\n");
        return 1;
    }

    for (i = 0; i < N; i++)
        a[i] = 1.0f;

    t = hpctimer_getwtime();
    for (i = 0; i < NREPS; i++) {
        sum = reduction_sum(a, N);
        /* sum = reduction_sum_sse(a, N); */
    }
    t = (hpctimer_getwtime() - t) / NREPS;

    printf("Reduction sum: %.4f (real %.4f)\n", sum, (float)N);
    printf("Elapsed time: %.6f sec.\n", t);

    _mm_free(a);
    return 0;
}
void Query_thd::init(workload * h_wl, int thread_id) { uint64_t request_cnt; q_idx = 0; request_cnt = WARMUP / g_thread_cnt + MAX_TXN_PER_PART + 4; #if WORKLOAD == YCSB queries = (ycsb_query *) mem_allocator.alloc(sizeof(ycsb_query) * request_cnt, thread_id); srand48_r(thread_id + 1, &buffer); #elif WORKLOAD == TPCC queries = (tpcc_query *) _mm_malloc(sizeof(tpcc_query) * request_cnt, 64); #endif for (UInt32 qid = 0; qid < request_cnt; qid ++) { #if WORKLOAD == YCSB new(&queries[qid]) ycsb_query(); queries[qid].init(thread_id, h_wl, this); #elif WORKLOAD == TPCC new(&queries[qid]) tpcc_query(); queries[qid].init(thread_id, h_wl); #endif } }
bool ult_nn_lrn_fp_check_outputs( nn::data<int16_t, 3>* output, int16_t* output_ref, uint_least32_t num_feature_maps, uint_least32_t feature_map_width, uint_least32_t feature_map_height, uint_least32_t batch ) { // zxy -> xyz uint_least32_t output_size = feature_map_width * feature_map_height * num_feature_maps * sizeof(int16_t); int16_t* outputT = (int16_t*)_mm_malloc(output_size, 64); int16_t * outputOpt = (int16_t *)output->buffer; uint32_t OFMOutBlock = 8; for (size_t y = 0; y < feature_map_height; y++) { for (size_t x = 0; x < feature_map_width; x++) { for (size_t z = 0; z < num_feature_maps; z++) { outputT[z * feature_map_width * feature_map_height + y * feature_map_height + x] = outputOpt[z + x * num_feature_maps + y * num_feature_maps * feature_map_width]; } } } bool passed = true; for (uint_least32_t i = 0; i < (output_size / sizeof(int16_t)) && passed; i++) if ((outputT[i] < output_ref[i] - 3) || (outputT[i] > output_ref[i] + 3)) passed = false; _mm_free(outputT); return passed; }
void leibniz1(){ int nmic; mic_init(nmic); long n = 1l*1000*1000*800; long nbytes = n*8; double* v = (double *)_mm_malloc(nbytes, 64); leibniz_init(v, n); double sum; #pragma offload target(mic:0) \ mandatory \ in(v:length(n) align(64)) { hostmic_scale(v, n); hostmic_scale(v, n); hostmic_scale(v, n); sum = hostmic_sum(v, n); } printf(" leibniz1: sum = %f\n", sum); _mm_free(v); mic_exit(); }
// -1 -1 -1 -1 -1 // -1 1 1 1 -1 // -1 1 8 1 -1 // -1 1 1 1 -1 // -1 -1 -1 -1 -1 void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) { int32_t* integral = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) ); detail::integral_image( in, integral, w, h ); int16_t* out_ptr = out + 3 + 3*w; int16_t* out_end = out + w * h - 2 - 2*w; const int32_t* i00 = integral; const int32_t* i50 = integral + 5; const int32_t* i05 = integral + 5*w; const int32_t* i55 = integral + 5 + 5*w; const int32_t* i11 = integral + 1 + 1*w; const int32_t* i41 = integral + 4 + 1*w; const int32_t* i14 = integral + 1 + 4*w; const int32_t* i44 = integral + 4 + 4*w; const uint8_t* im22 = in + 3 + 3*w; for( ; out_ptr != out_end; out_ptr++, i00++, i50++, i05++, i55++, i11++, i41++, i14++, i44++, im22++ ) { int32_t result = 0; result = -( *i55 - *i50 - *i05 + *i00 ); result += 2*( *i44 - *i41 - *i14 + *i11 ); result += 7* *im22; *out_ptr = result; } _mm_free( integral ); }
/*
 * Fills p[0 .. sz) with the identity permutation 0, 1, 2, ... and then runs
 * p_sort(buf, p, p + sz, scratch) from a single thread inside an OpenMP
 * parallel region.  The scratch array holds sz elements, is 64-byte aligned
 * and is released before returning.
 */
void p_init(size_t sz, uint* buf, uint* p)
{
    uint* scratch = (uint*) _mm_malloc(sz * sizeof(uint), 64);

    /* identity permutation */
    uint idx = 0;
    while (idx < sz) {
        p[idx] = idx;
        ++idx;
    }

    uint* first = p;
    uint* last = p + sz;

    #pragma omp parallel
    {
        #pragma omp single
        {
            p_sort(buf, first, last, scratch);
        }
    }

    _mm_free(scratch);
}
static bool ult_nn_convolution_fixedpoint_comp_check_outputs( nn::data<int16_t, 3>* output, int16_t* output_ref, uint_least32_t num_output_feature_maps, uint_least32_t output_feature_map_width, uint_least32_t output_feature_map_height, uint_least32_t center_x, uint_least32_t center_y ) { // zxy -> xyz uint_least32_t output_size = (output_feature_map_width + 2 * center_x) * (output_feature_map_height + 2 * center_y) * num_output_feature_maps * sizeof(int16_t); int16_t* outputT = (int16_t*)_mm_malloc(output_size, 64); int16_t * outputOpt = (int16_t *)output->buffer; uint32_t OFMOutBlock = 16; for (size_t y = 0; y < output_feature_map_height; y++) { for (size_t x = 0; x < output_feature_map_width; x++) { for (size_t z = 0; z < num_output_feature_maps; z++) { outputT[z * output_feature_map_width * output_feature_map_height + y * output_feature_map_height + x] = outputOpt[z + x * num_output_feature_maps + y * num_output_feature_maps * output_feature_map_width]; } } } bool passed = true; for (uint_least32_t i = 0; i < (output_size / sizeof(int16_t)) && passed; i++) if (output_ref[i] != outputT[i]) passed = false; _mm_free(outputT); return passed; }
// Aligned allocation that never returns NULL.
//
// The alignment is raised to at least sizeof(void*) before _mm_malloc is
// called.  On failure the error is reported through LIBOFFLOAD_ERROR (with
// the adjusted alignment) and the process exits with status 1.
extern void *OFFLOAD_MALLOC(
    size_t size,
    size_t align
)
{
    OFFLOAD_DEBUG_TRACE(2, "%s(%lld, %lld)\n", __func__, size, align);

    // the adjusted value is what gets allocated with and what gets reported
    const size_t min_align = sizeof(void*);
    const size_t used_align = (align < min_align) ? min_align : align;

    void *const ptr = _mm_malloc(size, used_align);
    if (ptr != NULL) {
        OFFLOAD_DEBUG_TRACE(2, "%s returned %p\n", __func__, ptr);
        return ptr;
    }

    LIBOFFLOAD_ERROR(c_offload_malloc, size, used_align);
    exit(1);
}
void square_dgemm(const int M, const double *A, const double *B, double *C) { // Allocate the B packed memory if(M > B_PREALLOC_SIZE) { B_pack = _mm_malloc(A_BLOCK_LEN*M*sizeof(double), MEM_ALIGN); } else { B_pack = B_pack_prealloc; } // Calculate the number of accum blocks in the master loop const int num_acc_blocks = CALC_NUM_BLOCKS(M, A_BLOCK_LEN); // For each accumulation block (row of B, column of A) for(int acc_block = 0; acc_block < num_acc_blocks; ++acc_block) { const int acc_pos = acc_block * A_BLOCK_LEN; const int num_acc = CALC_CUR_BLOCK_WIDTH(acc_pos, A_BLOCK_LEN, M); // Get the current column of A and row of B const double* cur_A = A + acc_pos * M; const double* cur_B = B + acc_pos; // Do panel panel product gepp_blk_var1(M, cur_A, num_acc, cur_B, C); } // Free the dynamically sized blcok if(M > B_PREALLOC_SIZE) { _mm_free(B_pack); B_pack = 0; } }