Exemple #1
0
static BOOL WIN_Init(void)
{
	WAVEFORMATEX	wfe;
	WORD			samplesize;
	MMRESULT		mmr;
	int				n;

	samplesize=1;
	if (md_mode&DMODE_STEREO) samplesize<<=1;
	if (md_mode&DMODE_16BITS) samplesize<<=1;

	wfe.wFormatTag=WAVE_FORMAT_PCM;
	wfe.nChannels=md_mode&DMODE_STEREO?2:1;
	wfe.nSamplesPerSec=md_mixfreq;
	wfe.nAvgBytesPerSec=md_mixfreq*samplesize;
	wfe.nBlockAlign=samplesize;
	wfe.wBitsPerSample=md_mode&DMODE_16BITS?16:8;
	wfe.cbSize=sizeof(wfe);

	mmr=waveOutOpen(&hwaveout,WAVE_MAPPER,&wfe,(DWORD)WIN_CallBack,0,CALLBACK_FUNCTION);
	if (mmr!=MMSYSERR_NOERROR) {
		_mm_errno=WIN_GetError(mmr);
		return 1;
	}

	buffersize=md_mixfreq*samplesize*BUFFERSIZE/1000;

	for (n=0;n<NUMBUFFERS;n++) {
		buffer[n]=_mm_malloc(buffersize);
		header[n].lpData=buffer[n];
		header[n].dwBufferLength=buffersize;
		mmr=waveOutPrepareHeader(hwaveout,&header[n],sizeof(WAVEHDR));
		if (!buffer[n]||mmr!=MMSYSERR_NOERROR) {
			if (!buffer[n])
				_mm_errno=MMERR_OUT_OF_MEMORY;
			else
				_mm_errno=WIN_GetError(mmr);
			return 1;
		}
	}

	md_mode|=DMODE_SOFT_MUSIC|DMODE_SOFT_SNDFX;
	buffersout=nextbuffer=0;
	return VC_Init();
}
// Sums an array of floats; needed in replacement of Python sum().
//
// a            - pointer to the first of num_elements floats; it is not
//                dereferenced when num_elements is 0
// num_elements - number of elements to add up
//
// Returns the single-precision total, accumulated left to right, or 0.0f
// for an empty array. No memory is allocated.
float sum(float* a, uint_fast32_t num_elements)
{
	// The accumulator starts at zero and carries the running total from one
	// iteration to the next.
	float total = 0.0f;

	for (uint_fast32_t i = 0; i < num_elements; i++) {
		total += a[i];
	}
	return total;
}
Exemple #3
0
/*
 * Run dgemm() NRUNS times on a scratch copy of C and report the throughput.
 *
 * Every iteration first restores the scratch copy from C, so each run starts
 * from the same data; the copy itself is outside the timed region. After the
 * last run the scratch result is copied back into C.
 *
 * Returns 2*NRUNS*M*N*K / 1e6 divided by the accumulated mysecond() time.
 * Returns 0.0 without touching C if the scratch buffer cannot be allocated.
 */
double
time_dgemm (const int M, const int N, const unsigned K,
                const double alpha, const double *A, const int lda,
                const double *B, const int ldb,
                const double beta, double *C, const unsigned ldc)
{
	const int num_iterations = NRUNS;
	/* widen before multiplying so the byte count cannot wrap in 32 bits */
	const size_t c_bytes = (size_t) N * ldc * sizeof(double);
	double mflops, mflop_s;
	double cpu_time = 0;
	int i;

	double* Ca = (double*) _mm_malloc(c_bytes, 32);
	if (!Ca && c_bytes != 0)
		return 0.0;

	for (i = 0; i < num_iterations; ++i) 
	{
		memcpy(Ca, C, c_bytes);
		/* subtract the start time, add the stop time: accumulates elapsed seconds */
		cpu_time -= mysecond();
#ifdef PAPI
		PAPI_START;
#endif
		dgemm (M, N, K, alpha, A, lda, B, ldb, beta, Ca, ldc);
#ifdef PAPI
		PAPI_STOP;
		PAPI_PRINT;
#endif
		cpu_time += mysecond();
	}

	mflops  = 2.0 * num_iterations*M*N*K/1.0e6;
	mflop_s = mflops/cpu_time;

	memcpy(C, Ca, c_bytes);
#ifdef PAPI
	PAPI_FLUSH;
#endif
	_mm_free(Ca);	
	return mflop_s;
}
Exemple #4
0
/*
 * Scan nonces starting at work->data[19], hashing the 80-byte header with
 * lyra2z_hash() until a hash passes the target test, the nonce reaches
 * max_nonce, or work_restart[thr_id].restart becomes set.
 *
 * On exit work->data[19] holds the last nonce value and *hashes_done the
 * number of nonces counted. Returns 1 when a matching hash was found,
 * 0 otherwise.
 */
int scanhash_lyra2z(int thr_id, struct work *work, uint32_t max_nonce, uint64_t *hashes_done)
{
	/* scratch matrix handed to lyra2z_hash(): 16 * 16 * 96 bytes, 64-byte aligned */
	size_t matrix_bytes = (int64_t) ((int64_t) 16 * 16 * 96);
	uint64_t *matrix = _mm_malloc(matrix_bytes, 64);

	uint32_t _ALIGN(128) hash[8];
	uint32_t _ALIGN(128) endiandata[20];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;

	/* snapshot taken before the benchmark override below */
	const uint32_t Htarg = ptarget[7];
	const uint32_t first_nonce = pdata[19];
	uint32_t n = first_nonce;
	int found = 0;

	if (opt_benchmark)
		ptarget[7] = 0x0000ff;

	/* words 0..18 are fixed for the whole scan; word 19 is the nonce */
	for (int k = 0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);

	for (;;) {
		be32enc(&endiandata[19], n);
		lyra2z_hash(matrix, hash, endiandata);

		if (hash[7] <= Htarg && fulltest(hash, ptarget)) {
			work_set_target_ratio(work, hash);
			found = 1;
			break;
		}

		n++;
		if (n >= max_nonce || work_restart[thr_id].restart)
			break;
	}

	pdata[19] = n;
	if (found)
		*hashes_done = pdata[19] - first_nonce;
	else
		*hashes_done = pdata[19] - first_nonce + 1;

	_mm_free(matrix);
	return found;
}
Exemple #5
0
// Sets up the renderer: builds the scene, creates the thread bookkeeping
// objects and allocates the image buffers.
//   w, h, pnum - forwarded to the Scene constructor; pnum also becomes procnum
//   num, var   - forwarded to Scene::setScene()
// NOTE(review): only the beginning of this definition is visible here.
void RendererLF::init(int w, int h, int pnum, int num, int var)
{
	// Build the scene and select its configuration
	scene = new Scene(w, h, pnum);
	scene->setScene(num, var);

	// One barrier sized for procnum participants and an array of procnum
	// thread pointers (the threads themselves are not created here)
	procnum = pnum;
	bar = new boost::barrier(procnum);
	t = new boost::thread *[procnum];
	// scene height divided by procnum, rounded up
	// (presumably the number of rows handled per thread -- TODO confirm)
	inc = ceilf(float(scene->height) / (float)procnum);

	// One Colour per pixel, 16-byte aligned, cleared to (0, 0, 0)
	image = (Colour*)_mm_malloc(sizeof(Colour) * scene->width * scene->height, 16);
	for (int i = 0; i < scene->width * scene->height; i++)
		image[i].Set(0.0f, 0.0f, 0.0f);

	// Three bytes per pixel, 16-byte aligned
	final = (uint8_t*)_mm_malloc(sizeof(uint8_t) * scene->width * scene->height * 3, 16);
  /*! Read a texture from disk.
   *
   *  A file whose lower-cased extension is "ptx" is handed to
   *  loadPtexTexture(). Any other file is read with loadImage() and
   *  converted into a newly allocated RGBA8 texture with 4 bytes per texel
   *  and 64-byte aligned pixel storage.
   *
   *  \param fileName file to load
   *  \return the texture produced by loadPtexTexture() or the new RGBA8 texture */
  OBJScene::Texture *loadTexture(const FileName& fileName)
  {
    std::string ext = strlwr(fileName.ext());
    if (ext == "ptx" ) return loadPtexTexture(fileName);

    Ref<Image> img = loadImage(fileName);

    /* created only after the Ptex early return, so that path no longer leaks a Texture */
    OBJScene::Texture *texture = new OBJScene::Texture();

    texture->width         = img.ptr->width;
    texture->height        = img.ptr->height;    
    texture->format        = OBJScene::Texture::RGBA8;
    texture->bytesPerTexel = 4;
    texture->data          = _mm_malloc(sizeof(int)*texture->width*texture->height,64);
    /* a dimension that is a power of two gets mask = size-1, any other gets 0 */
    texture->width_mask    = isPowerOf2(texture->width) ? texture->width-1 : 0;
    texture->height_mask   = isPowerOf2(texture->height) ? texture->height-1 : 0;
    img.ptr->convertToRGBA8((unsigned char*)texture->data);
    return texture;
  }
Exemple #7
0
/*
 * Runs find_index_array_using_merge_sort() over
 * [input_array, input_array + array_size) with a temporary buffer of
 * array_size ints (64-byte aligned). The call is issued from an OpenMP
 * `single` construct inside a parallel region. Always returns 1.
 */
int
init_find_index_array(int array_size, int *input_array)
{
  int *scratch = (int*) _mm_malloc(array_size * sizeof(int), 64);
  int *first = input_array;
  int *last = input_array + array_size;

#pragma omp parallel
  {
#pragma omp single
    {
      find_index_array_using_merge_sort(first, last, scratch, true);
    }
  }

  _mm_free(scratch);
  return 1;
}
Exemple #8
0
// Allocates info.size bytes into info.memory.
//
// With `enabled` set, a large-page VirtualAlloc() is tried first; on success
// info.hugePages is set to info.pages. Otherwise, or when that attempt
// returns null, the memory comes from _mm_malloc() with 4096-byte alignment
// and info.hugePages stays 0.
void Mem::allocate(MemInfo &info, bool enabled)
{
    info.hugePages = 0;

    if (enabled) {
        void *large = VirtualAlloc(nullptr, info.size, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE);
        info.memory = static_cast<uint8_t*>(large);

        if (info.memory) {
            info.hugePages = info.pages;
            return;
        }
    }

    // regular path, also the fallback when the large-page request failed
    info.memory = static_cast<uint8_t*>(_mm_malloc(info.size, 4096));
}
// Allocates the eight buffers of a fixed-point convolution test: input,
// output, biases and kernel, each once for the reference (*_ref) and once for
// the implementation under test. Every buffer is 4096-byte aligned and is
// returned through the corresponding pointer reference.
//
// The output buffers include a border of center_x / center_y elements on
// each side. The non-reference kernel buffer is sized with sizeof(int32_t)
// per element, the reference one with sizeof(int16_t).
static void ult_nn_convolution_fixedpoint_comp_both_alloc(
    int16_t* &input,
    int16_t* &output,
    int32_t* &biases,
    int16_t* &kernel,
    int16_t* &input_ref,
    int16_t* &output_ref,
    int32_t* &biases_ref,
    int16_t* &kernel_ref,
    uint_least32_t num_output_feature_maps,
    uint_least32_t num_input_feature_maps,
    uint_least32_t output_width,
    uint_least32_t output_height,
    uint_least32_t input_width,
    uint_least32_t input_height,
    uint_least32_t kernel_width,
    uint_least32_t kernel_height,
    uint_least32_t center_x,
    uint_least32_t center_y
)
{
    // byte counts; input, output and bias sizes are shared by both buffer sets
    const uint_least32_t input_bytes = input_width * input_height * num_input_feature_maps * sizeof(int16_t);
    const uint_least32_t output_bytes = (output_width + 2 * center_x) * (output_height + 2 * center_y) * num_output_feature_maps * sizeof(int16_t);
    const uint_least32_t bias_bytes = num_output_feature_maps * sizeof(int32_t);
    const uint_least32_t kernel_ref_bytes = num_input_feature_maps * num_output_feature_maps * kernel_width * kernel_height * sizeof(int16_t);
    const uint_least32_t kernel_bytes = num_input_feature_maps * num_output_feature_maps * kernel_width * kernel_height * sizeof(int32_t);

    // reference buffers first
    input_ref = (int16_t*)_mm_malloc(input_bytes, 4096);
    output_ref = (int16_t*)_mm_malloc(output_bytes, 4096);
    biases_ref = (int32_t*)_mm_malloc(bias_bytes, 4096);
    kernel_ref = (int16_t*)_mm_malloc(kernel_ref_bytes, 4096);

    // then the buffers for the implementation under test
    input = (int16_t*)_mm_malloc(input_bytes, 4096);
    output = (int16_t*)_mm_malloc(output_bytes, 4096);
    biases = (int32_t*)_mm_malloc(bias_bytes, 4096);
    kernel = (int16_t*)_mm_malloc(kernel_bytes, 4096);
}
Exemple #10
0
/*
 * Load a sample into the first free slot of ipc->samples[].
 *
 * Returns the slot index on success. Returns -1 when no slot is free
 * (_mm_errno = MMERR_OUT_OF_HANDLES), when the sample buffer cannot be
 * allocated (_mm_errno = MMERR_SAMPLE_TOO_BIG) or when SL_Load() reports
 * failure. The `type` argument is not used.
 */
static SWORD NDS_HW_SampleLoad(struct SAMPLOAD* sload, int type)
{
	ASSERT(sload != NULL);

	SAMPLE *s = sload->sample;
	int handle;

	/* Find empty slot to put sample address in */
	for(handle = 0; handle < NDS_HW_MAXSAMPLES; handle++) {
		if(ipc->samples[handle] == 0) {
			break;
		}
	}

	/* compare against the same bound the search loop uses, so a full table
	   is always detected before the slot is written */
	if(handle == NDS_HW_MAXSAMPLES) {
		_mm_errno = MMERR_OUT_OF_HANDLES;
		return -1;
	}
	
	/* Reality check for loop settings */
	if (s->loopend > s->length)
		s->loopend = s->length;
	if (s->loopstart >= s->loopend)
		s->flags &= ~SF_LOOP;

	/* TODO difference between 8 and 16 bits? */
	SL_SampleSigned(sload);

	/* one byte per sample, two when SF_16BITS is set */
	ipc->samples[handle] = _mm_malloc(s->length * ((s->flags & SF_16BITS) ? 2 : 1));
	if(ipc->samples[handle] == NULL) {
		_mm_errno = MMERR_SAMPLE_TOO_BIG;
		return -1;
	}

	/* read sample into buffer */
	/* NOTE(review): on failure the buffer stays allocated and the slot stays
	   occupied -- confirm whether the caller releases it */
	if (SL_Load(ipc->samples[handle], sload, s->length))
		return -1;

	DC_FlushRange(ipc->samples[handle], s->length * ((s->flags & SF_16BITS) ? 2 : 1));

	return handle;
}
Exemple #11
0
/*
 * Build a newline-separated list of the version strings of all registered
 * loaders (no trailing newline). The list is walked under the `lists` mutex.
 *
 * Returns a buffer obtained from _mm_malloc(), or NULL when no loader is
 * registered or the allocation fails.
 */
MIKMODAPI CHAR* MikMod_InfoLoader(void)
{
	int len=0;
	MLOADER *l;
	CHAR *list=NULL;

	MUTEX_LOCK(lists);
	/* compute size of buffer */
	for(l=firstloader;l;l=l->next) len+=1+(l->next?1:0)+strlen(l->version);

	if(len)
		if((list=_mm_malloc(len*sizeof(CHAR)))) {
			CHAR *end=list;
			list[0]=0;
			/* list all registered module loaders; append at the running end
			   pointer instead of passing `list` to sprintf as both
			   destination and argument (overlap is undefined) */
			for(l=firstloader;l;l=l->next)
				end+=sprintf(end,(l->next)?"%s\n":"%s",l->version);
		}
	MUTEX_UNLOCK(lists);
	return list;
}
Exemple #12
0
/*
 * Create an MREADER backed by an SDL_RWops stream.
 *
 * The reader's Eof/Read/Get/Seek/Tell callbacks are set to the
 * _mm_RWopsReader_* functions and the end offset of the stream is recorded.
 * Returns NULL if the reader cannot be allocated.
 */
MREADER *_mm_new_rwops_reader(SDL_RWops * rw)
{
	MRWOPSREADER* r;
	int start;

	r=(MRWOPSREADER*)_mm_malloc(sizeof(MRWOPSREADER));
	if (!r)
		return (MREADER*)r;

	r->core.Eof =&_mm_RWopsReader_Eof;
	r->core.Read=&_mm_RWopsReader_Read;
	r->core.Get =&_mm_RWopsReader_Get;
	r->core.Seek=&_mm_RWopsReader_Seek;
	r->core.Tell=&_mm_RWopsReader_Tell;
	r->rw=rw;

	/* RWops has no explicit eof test: remember where we are, seek to the
	   end to learn its offset, then return to the saved position. This
	   needs a seekable RWop. */
	start = SDL_RWtell(rw);
	r->end = SDL_RWseek(rw, 0, SEEK_END);
	SDL_RWseek(rw, start, SEEK_SET);

	return (MREADER*)r;
}
Exemple #13
0
  /*! Allocates size bytes at an address aligned to align, using
   *  _mm_malloc(). The alignment is asserted to have at most one bit set. */
  void* alignedMalloc(size_t size, size_t align) 
  {
    assert((align & (align-1)) == 0);

    // FIXME: the TASKING_TBB path (scalable_aligned_malloc) is disabled for
    // now, as the TBB allocator itself seems to access some uninitialized
    // value when using valgrind.
    //
    // The USE_MADVISE path (2M alignment plus os_madvise for blocks of at
    // least 16*PAGE_SIZE_2M) is disabled as well.
    void* const ptr = _mm_malloc(size,align);
    return ptr;
  }
Exemple #14
0
/*
 * Read a module comment of `len` bytes from modreader into of.comment.
 *
 * Carriage returns are turned into newlines and the text is
 * NUL-terminated. A comment whose first byte is NUL is freed and
 * of.comment is reset to NULL.
 *
 * Returns 0 if the buffer cannot be allocated, 1 otherwise.
 */
BOOL ReadComment(UWORD len)
{
	if(len) {
		int i;

		if(!(of.comment=(CHAR*)_mm_malloc(len+1))) return 0;
		_mm_read_UBYTES(of.comment,len,modreader);
		
		/* translate IT linefeeds */
		for(i=0;i<len;i++)
			if(of.comment[i]=='\r') of.comment[i]='\n';

		of.comment[len]=0;	/* just in case */
	}
	/* with len == 0 nothing was allocated here, so of.comment may be NULL:
	   test the pointer before looking at its first byte */
	if(of.comment&&!of.comment[0]) {
		free(of.comment);
		of.comment=NULL;
	}
	return 1;
}
// Create a dotprod_crcf object holding _n coefficients.
//
// The coefficients are stored twice each, interleaved, in a 16-byte aligned
// array of 2*_n floats:
//   h = { _h[0], _h[0], _h[1], _h[1], ... _h[n-1], _h[n-1] }
dotprod_crcf dotprod_crcf_create(float * _h,
                                 unsigned int _n)
{
    dotprod_crcf q = (dotprod_crcf)malloc(sizeof(struct dotprod_crcf_s));
    q->n = _n;

    // aligned storage for the duplicated coefficients
    q->h = (float*) _mm_malloc( 2*q->n*sizeof(float), 16 );

    // write each coefficient into two consecutive slots
    float * dst = q->h;
    unsigned int k;
    for (k=0; k<q->n; k++) {
        *dst++ = _h[k];
        *dst++ = _h[k];
    }

    return q;
}
// Calls convolve3x3withPad_1elem() for every (opY, opX) position of the
// ioHeight x ioWidth grid, with the rows distributed by an OpenMP `for`.
//
// The scratch buffer of nOutputPlanes floats is allocated inside the
// parallel region, so each thread works on its own copy, and is released
// before the region ends.
void myConvKernel_simd()
{
    #pragma omp parallel
    {
        float *intermediate = (float*) _mm_malloc(sizeof(float) * nOutputPlanes, 512);
        assert(intermediate != NULL);
        
        #pragma omp for
        for (int opY = 0; opY < ioHeight; opY++)
        {
            for (int opX = 0; opX < ioWidth; opX++)
            {
                convolve3x3withPad_1elem(opY, opX, intermediate);
            }
        }
        
        _mm_free(intermediate);
    }
}
Exemple #17
0
// Thread entry point that populates the tables of one warehouse.
//
// `This` is the tpcc_wl instance. The thread takes the next id from
// wl->next_tid; warehouse ids are 1-based (wid = tid + 1). The thread with
// tid 0 additionally initialises the item table. Always returns NULL.
void * tpcc_wl::threadInitWarehouse(void * This) {
	tpcc_wl * wl = (tpcc_wl *) This;
	int tid = ATOM_FETCH_ADD(wl->next_tid, 1);
	uint32_t wid = tid + 1;
	// validate the slot index before it is used to write into tpcc_buffer
	assert((uint64_t)tid < g_num_wh);
	tpcc_buffer[tid] = (drand48_data *) _mm_malloc(sizeof(drand48_data), 64);
	// seed this thread's random state with its warehouse id
	srand48_r(wid, tpcc_buffer[tid]);
	
	if (tid == 0)
		wl->init_tab_item();
	wl->init_tab_wh( wid );
	wl->init_tab_dist( wid );
	wl->init_tab_stock( wid );
	for (uint64_t did = 1; did <= DIST_PER_WARE; did++) {
		wl->init_tab_cust(did, wid);
		wl->init_tab_order(did, wid);
		for (uint64_t cid = 1; cid <= g_cust_per_dist; cid++) 
			wl->init_tab_hist(cid, did, wid);
	}
	return NULL;
}
// Allocates and zeroes the sample storage of an SSE3 buffer.
//
// The storage holds maxFilterPartLength * maxFilterParts complex floats,
// aligned to LXC_SSE3_ALIGN, and is published through Buffer->buffer.
//
// Returns LXC_ERR_INVALID_INPUT for a null handle or a zero dimension,
// LXC_ERR_DYNAMIC_MEMORY (after resetting the handle's fields) when the
// allocation fails, and LXC_NO_ERR on success.
LXC_ERROR_CODE LXC_SSE3Buffer_create(LXC_BUFFER *Buffer)
{
	if(!Buffer || !Buffer->maxFilterPartLength || !Buffer->maxFilterParts)
	{
		return LXC_ERR_INVALID_INPUT;
	}

	const uint maxElements = Buffer->maxFilterPartLength*Buffer->maxFilterParts;

	// Aligned allocation, see
	// http://stackoverflow.com/questions/21328985/sse-reinterpret-cast-m128-instead-of-mm-load-ps
	//   float *C = _mm_malloc(size * sizeof(*C), 16);
	// or
	//   float *C = _aligned_malloc(size * sizeof(*C), 16);
#if defined(TARGET_WINDOWS)
	LXC_SSE3cpxFloat *p	= (LXC_SSE3cpxFloat*)_aligned_malloc(sizeof(LXC_SSE3cpxFloat)*maxElements, LXC_SSE3_ALIGN);
#elif defined(TARGET_LINUX)
	LXC_SSE3cpxFloat *p = (LXC_SSE3cpxFloat*)_mm_malloc(sizeof(LXC_SSE3cpxFloat)*maxElements, LXC_SSE3_ALIGN);
#endif

	if(p)
	{
		// clear both components of every element
		uint idx = 0;
		while(idx < maxElements)
		{
			p[idx][0] = 0.0f;
			p[idx][1] = 0.0f;
			idx++;
		}

		Buffer->buffer = (void*)p;
		return LXC_NO_ERR;
	}

	// allocation failed: reset the state of the buffer handle
	Buffer->maxFilterLength = 0;
	Buffer->maxFilterPartLength = 0;
	Buffer->maxFilterPartLength_NonZero = 0;
	Buffer->maxFilterParts = 0;
	Buffer->sampleFrequency = 0;
	Buffer->buffer = NULL;

	return LXC_ERR_DYNAMIC_MEMORY;
}
Exemple #19
0
void DummyDataNode::forwardPropagate()
{
#ifdef RETURNALL
  return;
#endif

  for(int i=0; i<tenTopData_.size(); i++)
  {
    int dtype = tenTopData_[i]->getDataType();
    long long int bytes = tenTopData_[i]->getBufferSize();

    if(dtype == DT_FLOAT)
    {
      float* top = (float*)(tenTopData_[i]->getBuffer());
      fillData(top, bytes/sizeof(float));
#ifdef DEBUG
      printf("Executing FP %s: Data %p\n",node_name_.c_str(), top);
#endif
    }
    else if(dtype == DT_BF16)
    {
      libxsmm_bfloat16* top = (libxsmm_bfloat16*)(tenTopData_[i]->getLPBuffer());
      if(top == NULL)
        top = (libxsmm_bfloat16*)_mm_malloc(bytes/sizeof(libxsmm_bfloat16), 64);
      tenTopData_[i]->setLPBuffer(top);
      float *bot = (float*)tenTopData_[i]->getBuffer();
      fillData(bot, top, bytes/sizeof(float));

#ifdef DEBUG
      printf("Executing FP %s: Data %p\n",node_name_.c_str(), top);
#endif
    }
    else if(dtype == DT_INT)
    {
      int* top = (int*)(tenTopData_[i]->getBuffer());
      for(long long int i=0; i<bytes/sizeof(int); i++)
        top[i] = rand()%1000;
    }
  }
}
Exemple #20
0
/*
 * Build a newline-separated, numbered list ("%2d <Version>", counting from
 * 1) of all registered drivers, without a trailing newline. The list is
 * walked under the `lists` mutex.
 *
 * Returns a buffer obtained from _mm_malloc(), or NULL when no driver is
 * registered or the allocation fails.
 */
MIKMODAPI CHAR* MikMod_InfoDriver(void)
{
	int t,len=0;
	MDRIVER *l;
	CHAR *list=NULL;

	MUTEX_LOCK(lists);
	/* compute size of buffer */
	for(l=firstdriver;l;l=l->next)
		len+=4+(l->next?1:0)+strlen(l->Version);

	if(len)
		if((list=_mm_malloc(len*sizeof(CHAR)))) {
			CHAR *end=list;
			list[0]=0;
			/* list all registered device drivers; append at the running end
			   pointer instead of passing `list` to sprintf as both
			   destination and argument (overlap is undefined) */
			for(t=1,l=firstdriver;l;l=l->next,t++)
				end+=sprintf(end,(l->next)?"%2d %s\n":"%2d %s",
				    t,l->Version);
		}
	MUTEX_UNLOCK(lists);
	return list;
}
Exemple #21
0
// Aligned allocation from a MYO arena.
//
// When __offload_myoLoadLibrary() succeeds the request goes to
// myo_wrapper.SharedAlignedArenaMalloc(). Otherwise the memory comes from
// _mm_malloc(), with the alignment raised to at least sizeof(void*).
extern "C" void* _Offload_shared_aligned_arena_malloc(
    MyoArena arena,
    size_t size,
    size_t align
)
{
    OFFLOAD_DEBUG_TRACE(3, "%s(%u, %lld, %lld)\n",
        __func__, arena, size, align);

    if (!__offload_myoLoadLibrary()) {
        // fallback path: plain aligned allocation
        const size_t min_align = sizeof(void*);
        if (align < min_align) {
            align = min_align;
        }
        return _mm_malloc(size, align);
    }

    void *p = myo_wrapper.SharedAlignedArenaMalloc(arena, size, align);
    OFFLOAD_DEBUG_TRACE(3, "%s(%u, %lld, %lld)->%p\n",
        __func__, arena, size, align, p);
    return p;
}
Exemple #22
0
/*
 * Benchmark driver: fills an array of N floats with 1.0, calls
 * reduction_sum() NREPS times and prints the last result next to the
 * expected value N, followed by the average time per call.
 *
 * Returns 0 on success, 1 if the array cannot be allocated.
 */
int main()
{
	int i;
	float *a;
	double t;
	double sum = 0.0;	/* defined even if the timing loop runs zero times */

	a = (float *)_mm_malloc(sizeof(*a) * N, 16);
	if (a == NULL) {
		fprintf(stderr, "Cannot allocate %d floats\n", (int)N);
		return 1;
	}
	for (i = 0; i < N; i++)
		a[i] = 1.0;

	t = hpctimer_getwtime();
	for (i = 0; i < NREPS; i++) {
		sum = reduction_sum(a, N);
		// sum = reduction_sum_sse(a, N);
	}
	t = (hpctimer_getwtime() - t) / NREPS;
	printf("Reduction sum: %.4f (real %.4f)\n", sum, (float)N);
	printf("Elapsed time: %.6f sec.\n", t);

	_mm_free(a);
	return 0;
}
Exemple #23
0
// Allocates and constructs this thread's query array.
//
// The array holds WARMUP / g_thread_cnt + MAX_TXN_PER_PART + 4 queries.
// For YCSB the storage comes from mem_allocator and the thread's random
// buffer is seeded with thread_id + 1; for TPCC the storage comes from
// _mm_malloc() (64-byte aligned). Each element is then constructed in place
// and initialised.
void 
Query_thd::init(workload * h_wl, int thread_id) {
	q_idx = 0;
	uint64_t request_cnt = WARMUP / g_thread_cnt + MAX_TXN_PER_PART + 4;

#if WORKLOAD == YCSB	
	queries = (ycsb_query *) 
		mem_allocator.alloc(sizeof(ycsb_query) * request_cnt, thread_id);
	srand48_r(thread_id + 1, &buffer);
#elif WORKLOAD == TPCC
	queries = (tpcc_query *) _mm_malloc(sizeof(tpcc_query) * request_cnt, 64);
#endif

	// the storage above is raw memory: construct each query with placement
	// new before calling init() on it
	UInt32 qid = 0;
	while (qid < request_cnt) {
#if WORKLOAD == YCSB	
		new(&queries[qid]) ycsb_query();
		queries[qid].init(thread_id, h_wl, this);
#elif WORKLOAD == TPCC
		new(&queries[qid]) tpcc_query();
		queries[qid].init(thread_id, h_wl);
#endif
		qid ++;
	}
}
// Compares an LRN output against a reference with a tolerance of +/-3.
//
// The output buffer is read with the feature-map index z varying fastest,
// then x, then y. It is first rearranged into a temporary buffer where x
// varies fastest, then y, then z, and that buffer is compared element by
// element with output_ref.
//
// Returns true when every element is within the tolerance, false on the
// first mismatch or when the temporary buffer cannot be allocated.
// `batch` is not used.
bool ult_nn_lrn_fp_check_outputs(
    nn::data<int16_t, 3>* output,
    int16_t* output_ref,
    uint_least32_t num_feature_maps,
    uint_least32_t feature_map_width,
    uint_least32_t feature_map_height,
    uint_least32_t batch
    )
{
    // zxy -> xyz
    uint_least32_t output_size = feature_map_width * feature_map_height * num_feature_maps * sizeof(int16_t);
    int16_t* outputT = (int16_t*)_mm_malloc(output_size, 64);
    if (!outputT && output_size != 0)
        return false;
    int16_t * outputOpt = (int16_t *)output->buffer;

    for (size_t y = 0; y < feature_map_height; y++)
    {
        for (size_t x = 0; x < feature_map_width; x++)
        {
            for (size_t z = 0; z < num_feature_maps; z++)
            {
                // a row of the destination holds feature_map_width elements,
                // so the row stride is the width (not the height)
                outputT[z * feature_map_width * feature_map_height + y * feature_map_width + x]
                    = outputOpt[z + x * num_feature_maps + y * num_feature_maps * feature_map_width];
            }
        }
    }

    bool passed = true;
    for (uint_least32_t i = 0; i < (output_size / sizeof(int16_t)) && passed; i++)
    {
        if ((outputT[i] <  output_ref[i] - 3) ||
            (outputT[i] >  output_ref[i] + 3))
        {
            passed = false;
        }
    }

    _mm_free(outputT);

    return passed;
}
Exemple #25
0
// Allocates 800 million doubles on the host (64-byte aligned), initialises
// them with leibniz_init(), then offloads to mic:0, where the array is
// scaled three times with hostmic_scale() and summed with hostmic_sum().
// Prints the sum and releases the array.
void leibniz1(){
	int nmic;
	mic_init(nmic);

	long n = 1l*1000*1000*800;
	long byte_count = n*8;	// 8 bytes per double
	double* v = (double *)_mm_malloc(byte_count, 64);
	leibniz_init(v, n);
	
	double sum; 
#pragma offload target(mic:0)					\
	mandatory						\
	in(v:length(n) align(64))	
	{
		for (int pass = 0; pass < 3; pass++)
			hostmic_scale(v, n);
		sum = hostmic_sum(v, n);
	}

	printf("    leibniz1:  sum = %f\n", sum);
	_mm_free(v);
	mic_exit();
}
Exemple #26
0
 // 5x5 blob filter with the kernel
 //   -1 -1 -1 -1 -1
 //   -1  1  1  1 -1
 //   -1  1  8  1 -1
 //   -1  1  1  1 -1
 //   -1 -1 -1 -1 -1
 // evaluated per pixel as
 //   -(5x5 box) + 2*(3x3 box) + 7*(one input pixel),
 // with both box terms read from a temporary integral image of `in`.
 // Writing starts at out + 3 + 3*w and stops at out + w*h - 2 - 2*w.
 void blob5x5( const uint8_t* in, int16_t* out, int w, int h ) {
   int32_t* sat = (int32_t*)( _mm_malloc( w*h*sizeof( int32_t ), 16 ) );
   detail::integral_image( in, sat, w, h );

   // four taps for the 5x5 box ...
   const int32_t* outer00 = sat;
   const int32_t* outer50 = sat + 5;
   const int32_t* outer05 = sat + 5*w;
   const int32_t* outer55 = sat + 5 + 5*w;
   // ... and four for the 3x3 box
   const int32_t* inner11 = sat + 1 + 1*w;
   const int32_t* inner41 = sat + 4 + 1*w;
   const int32_t* inner14 = sat + 1 + 4*w;
   const int32_t* inner44 = sat + 4 + 4*w;
   const uint8_t* pixel = in + 3 + 3*w;

   int16_t* dst = out + 3 + 3*w;
   int16_t* dst_end = out + w * h - 2 - 2*w;

   while( dst != dst_end ) {
     int32_t result = -( *outer55 - *outer50 - *outer05 + *outer00 );
     result += 2*( *inner44 - *inner41 - *inner14 + *inner11 );
     result += 7* *pixel;
     *dst = result;

     // all pointers advance together, one element per output pixel
     dst++;
     outer00++; outer50++; outer05++; outer55++;
     inner11++; inner41++; inner14++; inner44++;
     pixel++;
   }

   _mm_free( sat );
 }
Exemple #27
0
void
p_init(size_t sz, uint* buf, uint* p)
{
  uint* p_;
  
  p_ = (uint*) _mm_malloc(sz * sizeof(uint), 64);

  uint i;
  for(i = 0; i < sz; i++) p[i] = i;

  uint* l  = p;
  uint* h  = p + sz;

#pragma omp parallel
  {
#pragma omp single
    {
      p_sort(buf, l, h, p_);
    }
  }

  _mm_free(p_);
}
// Compares a convolution output against a reference for exact equality.
//
// The output buffer is read with the feature-map index z varying fastest,
// then x, then y. It is first rearranged into a temporary buffer where x
// varies fastest, then y, then z, and that buffer is compared element by
// element with output_ref.
//
// Returns true when every compared element matches, false on the first
// mismatch or when the temporary buffer cannot be allocated.
static bool ult_nn_convolution_fixedpoint_comp_check_outputs(
    nn::data<int16_t, 3>* output,
    int16_t* output_ref,
    uint_least32_t num_output_feature_maps,
    uint_least32_t output_feature_map_width,
    uint_least32_t output_feature_map_height,
    uint_least32_t center_x,
    uint_least32_t center_y
)
{
    // zxy -> xyz
    // NOTE(review): the size includes the center_x / center_y border, but the
    // rearrangement below fills only width * height * maps elements; with a
    // non-zero border the comparison also reads elements that were never
    // written -- confirm the intended handling.
    uint_least32_t output_size = (output_feature_map_width + 2 * center_x) * (output_feature_map_height + 2 * center_y) * num_output_feature_maps * sizeof(int16_t);
    int16_t* outputT = (int16_t*)_mm_malloc(output_size, 64);
    if (!outputT && output_size != 0)
        return false;
    int16_t * outputOpt = (int16_t *)output->buffer;

    for (size_t y = 0; y < output_feature_map_height; y++)
    {
        for (size_t x = 0; x < output_feature_map_width; x++)
        {
            for (size_t z = 0; z < num_output_feature_maps; z++)
            {
                // a row of the destination holds output_feature_map_width
                // elements, so the row stride is the width (not the height)
                outputT[z * output_feature_map_width * output_feature_map_height + y * output_feature_map_width + x]
                    = outputOpt[z + x * num_output_feature_maps + y * num_output_feature_maps * output_feature_map_width];
            }
        }
    }

    bool passed = true;
    for (uint_least32_t i = 0; i < (output_size / sizeof(int16_t)) && passed; i++)
        if (output_ref[i] != outputT[i])
            passed = false;

    _mm_free(outputT);

    return passed;
}
// Aligned allocation that never returns NULL.
//
// The alignment is raised to at least sizeof(void*) before the request is
// passed to _mm_malloc(). If the allocation fails, the error is reported
// through LIBOFFLOAD_ERROR and the process exits with status 1.
extern void *OFFLOAD_MALLOC(
    size_t size,
    size_t align
)
{
    int   err;

    OFFLOAD_DEBUG_TRACE(2, "%s(%lld, %lld)\n", __func__, size, align);

    // clamp the alignment in place: the error report below uses this value
    align = (align < sizeof(void*)) ? sizeof(void*) : align;

    void *ptr = _mm_malloc(size, align);
    if (!ptr) {
        LIBOFFLOAD_ERROR(c_offload_malloc, size, align);
        exit(1);
    }

    OFFLOAD_DEBUG_TRACE(2, "%s returned %p\n", __func__, ptr);

    return ptr;
}
/*
 * Square matrix multiply built from panel-panel products.
 *
 * The dimension M is split into blocks of A_BLOCK_LEN; for each block
 * gepp_blk_var1() is called with the matching slice of A (offset
 * acc_pos * M) and of B (offset acc_pos), accumulating into C.
 *
 * B_pack points at the preallocated buffer when M <= B_PREALLOC_SIZE;
 * otherwise a buffer of A_BLOCK_LEN * M doubles is allocated for the call
 * and freed again at the end.
 */
void square_dgemm(const int M, const double *A, const double *B, double *C) {

	/* decide once whether the preallocated packing buffer is large enough */
	const int needs_heap = (M > B_PREALLOC_SIZE);

	if(!needs_heap)
	{
		B_pack = B_pack_prealloc;
	}
	else
	{
		B_pack = _mm_malloc(A_BLOCK_LEN*M*sizeof(double), MEM_ALIGN);
	}

	/* number of accumulation blocks in the master loop */
	const int num_acc_blocks = CALC_NUM_BLOCKS(M, A_BLOCK_LEN);

	/* one panel-panel product per accumulation block */
	int blk = 0;
	while(blk < num_acc_blocks)
	{
		const int acc_pos = blk * A_BLOCK_LEN;
		const int num_acc = CALC_CUR_BLOCK_WIDTH(acc_pos, A_BLOCK_LEN, M);

		/* slices of A and B that belong to this block */
		const double* cur_A = A + acc_pos * M;
		const double* cur_B = B + acc_pos;

		gepp_blk_var1(M, cur_A, num_acc, cur_B, C);
		++blk;
	}

	/* release the dynamically sized buffer */
	if(needs_heap)
	{
		_mm_free(B_pack);
		B_pack = 0;
	}
}