/*
 * Benchmark driver for matmul_optimize().
 *
 * Allocates four 64-byte aligned WIDTH*HEIGHT matrices, loads the two inputs
 * and the reference product from TEST_FILENAME, times one call to
 * matmul_optimize() and prints the elapsed value returned by timestamp_us()
 * differences (followed by " incorrect" when compare_matrix() reports a
 * mismatch).
 *
 * Every exit path releases the buffers. The process exit code is always 0,
 * as before; failures are only reported on stdout.
 */
int main(int argc, char *argv[])
{
	double* matA = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
	double* matB = _mm_malloc((WIDTH*HEIGHT)*sizeof(double), 64);
	double* prod = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
	double* prod_ref = _mm_malloc(WIDTH*HEIGHT*sizeof(double), 64);
	int read_flag = 0;

	if (!matA || !matB || !prod || !prod_ref) {
		/* Never hand a NULL buffer to the reader or the kernel. */
		printf("Cannot allocate matrices\n");
	} else if ((read_flag = read_matrix(TEST_FILENAME, prod_ref, matA, matB)) != 0) {
		if (read_flag == 1)
			printf("Cannot open test file\n");
		else if (read_flag == 2)
			printf("Error while reading data from test file\n");
		else if (read_flag == 3)
			printf("Error while closing the test file\n");
	} else {
		uint64_t start = timestamp_us();
		matmul_optimize(prod, matA, matB); /* run the optimization functions. */
		uint64_t time = timestamp_us() - start;

		/* uint64_t is not guaranteed to be unsigned long; print it portably. */
		if (compare_matrix(prod, prod_ref)) {
			printf("%llu incorrect\n", (unsigned long long)time);
		} else {
			printf("%llu\n", (unsigned long long)time);
		}
	}

	/* Single cleanup point shared by the success and failure paths. */
	if (prod_ref)
		_mm_free(prod_ref);
	if (prod)
		_mm_free(prod);
	if (matB)
		_mm_free(matB);
	if (matA)
		_mm_free(matA);
	return 0;
}
예제 #2
0
/*
 * Micro-benchmark driver: fills a[] with 1.0, calls fun_def(a, b, N) NREPS
 * times and prints the mean time per call, computed from two
 * hpctimer_getwtime() readings.
 *
 * Returns 0 on success and 1 when either N-element buffer cannot be
 * allocated.
 */
int main(int argc, char **argv)
{
    int i;
    float *a, *b;
    double t;

    a = (float *)_mm_malloc(sizeof(float) * N, 16);
    b = (float *)_mm_malloc(sizeof(float) * N, 16);
    if (a == NULL || b == NULL) {
        /* The fill loop below would otherwise write through a NULL pointer. */
        fprintf(stderr, "Cannot allocate benchmark vectors\n");
        if (a != NULL)
            _mm_free(a);
        if (b != NULL)
            _mm_free(b);
        return 1;
    }

    for (i = 0; i < N; i++) {
        a[i] = 1.0f;
    }

    t = hpctimer_getwtime();
    for (i = 0; i < NREPS; i++) {
        fun_def(a, b, N);
        // fun_sse(a, b, N);
    }
    t = hpctimer_getwtime() - t;
    t = t / NREPS;

    //print_vec(b, N);

    printf("Elapsed time: %.6f sec.\n", t);

    _mm_free(a);
    _mm_free(b);

    return 0;
}
예제 #3
0
/*
 * Releases the four buffers this loader keeps in stxbuf, paraptr, poslookup
 * and mh by passing each one to _mm_free().
 *
 * NOTE(review): _mm_free is not defined in this view; whether it also resets
 * the pointer it is given should be confirmed before relying on these
 * variables being NULL afterwards.
 */
static void STX_Cleanup(void)
{
    _mm_free(stxbuf);
    _mm_free(paraptr);
    _mm_free(poslookup);
    _mm_free(mh);
}
/// Releases the four test buffers with _mm_free and resets every caller-side
/// pointer to null. Pointers that are already null are left alone, so the
/// function can be called repeatedly.
///
/// @param input       buffer to free; set to 0 on return
/// @param output      buffer to free; set to 0 on return
/// @param input_ref   buffer to free; set to 0 on return
/// @param output_ref  buffer to free; set to 0 on return
static void ult_nn_lrn_fp_both_dealloc(
    int16_t* &input,
    int16_t* &output,
    int16_t* &input_ref,
    int16_t* &output_ref)
{
    // Free one buffer when it is set, then clear the caller's pointer.
    const auto release = [](int16_t* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    release(input);
    release(output);
    release(input_ref);
    release(output_ref);
}
예제 #5
0
/*
 * Releases the five buffers this loader keeps in s3mbuf, paraptr, poslookup,
 * mh and origpositions by passing each one to _mm_free().
 *
 * NOTE(review): _mm_free is not defined in this view; whether it also resets
 * the pointer it is given should be confirmed before relying on these
 * variables being NULL afterwards.
 */
void S3M_Cleanup(void)
{
	_mm_free(s3mbuf);
	_mm_free(paraptr);
	_mm_free(poslookup);
	_mm_free(mh);
	_mm_free(origpositions);
}
예제 #6
0
/// Destructor: hands xy, z and e back through _mm_free, deletes each of the
/// numBxDFs objects stored in bxdfs, and finally deletes the bxdfs array.
BluePaintBSDF::~BluePaintBSDF()
{
	// Buffers released through _mm_free.
	_mm_free(xy);
	_mm_free(z);
	_mm_free(e);

	// Destroy every element first, then the array that held them.
	int idx = 0;
	while (idx < numBxDFs)
	{
		delete bxdfs[idx];
		++idx;
	}

	delete[] bxdfs;
}
예제 #7
0
파일: drv_nds_sw.c 프로젝트: sypherce/dslua
/*
 * Driver shutdown: sends NDS_SW_CMD_EXIT through MikMod9_SendCommand(),
 * calls VC_Exit(), then frees the IPC block and its sample buffer and
 * clears both pointers.
 *
 * ipc is NULL whenever NDS_SW_Init() failed to allocate it (or its buffer),
 * so the block is only dereferenced when it exists; this keeps a shutdown
 * after a failed init from reading through a NULL pointer.
 */
static void NDS_SW_Exit(void)
{
	MikMod9_SendCommand(NDS_SW_CMD_EXIT << 28);
	VC_Exit();

	if (ipc != NULL) {
		_mm_free(ipc->buffer);
		ipc->buffer = NULL;
		_mm_free(ipc);
		ipc = NULL;
	}
}
예제 #8
0
/*
 * Releases a matrix: every one of the m->n_rows row buffers in m->data,
 * then the row-pointer array, then the matrix_t structure itself.
 *
 * Passing a null m is a no-op, and a matrix whose row array was never
 * allocated (m->data is null) only has its structure released.
 */
void free_matrix(struct matrix_t* m) {
    msize_t n;

    if (!m)
        return;

    if (m->data) {
        for (n = 0; n < m->n_rows; n++) {
            _mm_free(m->data[n]);
        }
        _mm_free((void *)m->data);
    }

    _mm_free(m);
}
예제 #9
0
 /// Runs the two-pass 5x5 filter: detail::convolve_cols_5x5 fills two 16-bit
 /// w*h scratch images from `in`, which the two row passes then turn into
 /// `out_v` and `out_h`.
 ///
 /// @param in     source image, read by the column pass
 /// @param out_v  destination written by the 1-2-0-2-1 row pass
 /// @param out_h  destination written by the 1-4-6-4-1 row pass
 /// @param w      image width passed to every pass
 /// @param h      image height passed to every pass
 ///
 /// If either scratch image cannot be allocated the outputs are left
 /// untouched; the function has no way to report the failure.
 void sobel5x5( const uint8_t* in, uint8_t* out_v, uint8_t* out_h, int w, int h ) {
   // Widen before multiplying so that w*h is not evaluated in int.
   const size_t scratch_bytes = static_cast<size_t>( w ) * static_cast<size_t>( h ) * sizeof( int16_t );
   int16_t* temp_h = (int16_t*)( _mm_malloc( scratch_bytes, 16 ) );
   int16_t* temp_v = (int16_t*)( _mm_malloc( scratch_bytes, 16 ) );
   if( temp_h != 0 && temp_v != 0 ) {
     detail::convolve_cols_5x5( in, temp_v, temp_h, w, h );
     detail::convolve_12021_row_5x5_16bit( temp_v, out_v, w, h );
     detail::convolve_14641_row_5x5_16bit( temp_h, out_h, w, h );
   }
   if( temp_h != 0 ) _mm_free( temp_h );
   if( temp_v != 0 ) _mm_free( temp_v );
 }
/// Copies outputPlanes into _outputPlanes[0] via copyToCVMatF (using ioHeight
/// and ioWidth), then releases the five working buffers.
///
/// Each pointer is reset to NULL after it is freed so that the `!= NULL`
/// guards stay meaningful: without the reset a second call would free the
/// same blocks again.
///
/// @param _outputPlanes  destination list; element 0 receives the result
void copyOutResults(std::vector<cv::Mat> &_outputPlanes)
{
    copyToCVMatF(outputPlanes, _outputPlanes[0], ioHeight, ioWidth, ioWidth);

    if (weights != NULL)        { _mm_free(weights);        weights = NULL; }
    if (packed_weights != NULL) { _mm_free(packed_weights); packed_weights = NULL; }
    if (inputPlanes != NULL)    { _mm_free(inputPlanes);    inputPlanes = NULL; }
    if (outputPlanes != NULL)   { _mm_free(outputPlanes);   outputPlanes = NULL; }
    if (biases != NULL)         { _mm_free(biases);         biases = NULL; }
}
예제 #11
0
/// Destructor: frees the bitmap storage and the colour table if they are set,
/// clearing each pointer afterwards.
ieImageMem::~ieImageMem()
{
	if (pbBitmap)
	{
		// pbBitmap sits nBitmapOffs past the address that is handed to
		// _mm_free, so step back before freeing.
		pbBitmap -= nBitmapOffs;
		_mm_free(pbBitmap);
		pbBitmap = nullptr;
	}

	if (!pCLUT)
		return;

	_mm_free(pCLUT);
	pCLUT = nullptr;
}
예제 #12
0
/**
 * Main application: runs single::solve() once on a square grid and prints
 * the time reported by Timer.
 *
 * Expected arguments, in order: cg_max_iterations, cg_eps, mpiGridX,
 * mpiGridY, gridwidth. The grid edge length actually used is whatever
 * adaptMeshSize() returns for the last three values.
 *
 * @param argc number of cli arguments (must be 6)
 * @param argv values of cli arguments
 * @return 0 on success, -1 on wrong argument count or failed allocation
 */
int main(int argc, char* argv[])
{
	if (argc != 6)
	{
		std::cout << "cg_max_iterations" << std::endl;
		std::cout << "cg_eps" << std::endl;
		std::cout << "mpiGridX" << std::endl;
		std::cout << "mpiGridY" << std::endl;
		std::cout << "gridwidth" << std::endl;
		std::cout << std::endl;

		std::cout << "example:" << std::endl;
		std::cout << "./app 10 1e-5 2 5 128" << std::endl;

		return -1;
	}

	// input parameters
	size_t cg_max_iterations = atoi(argv[1]); 
	double cg_eps = atof(argv[2]);
	const int mpiGridX = atoi(argv[3]);
	const int mpiGridY = atoi(argv[4]);
	grid_points_1d = adaptMeshSize(mpiGridX, mpiGridY, atoi(argv[5]));

	std::printf("max_iter: %d, eps: %f, grid: (%d, %d), n: %d \n", 
		static_cast<int>(cg_max_iterations), 
		cg_eps, 
		mpiGridX, mpiGridY, 
		static_cast<int>(grid_points_1d));

	// both buffers hold one double per grid point
	const size_t grid_bytes = grid_points_1d*grid_points_1d*sizeof(double);
	double* gridS = (double*)_mm_malloc(grid_bytes, 64);
	double* bS = (double*)_mm_malloc(grid_bytes, 64);

	// init_grid/init_b write into the buffers, so bail out before they see a null pointer
	if (!gridS || !bS)
	{
		std::cerr << "Cannot allocate grid buffers" << std::endl;
		if (gridS) _mm_free(gridS);
		if (bS) _mm_free(bS);
		return -1;
	}

	// TEST single
	// initialize the grid and right-hand side
	init_grid(gridS);
	init_b(bS);
	
	// solve Poisson equation using CG method
	Timer tS;
	tS.start();
	single::solve(gridS, bS, cg_max_iterations, cg_eps);
	double timeS = tS.stop();
	
	std::cout << std::endl << "Needed time single: " << timeS << " s" << std::endl << std::endl;

	_mm_free(gridS);
	_mm_free(bS);

	return 0;
}
예제 #13
0
// Destructor: releases numneigh, neighbors, bincount and bins when they are set.
// numneigh and neighbors go through _mm_free when ALIGNMALLOC is defined and
// through free otherwise; bincount and bins always go through free.
Neighbor::~Neighbor()
{
  if(numneigh) {
#ifdef ALIGNMALLOC
    _mm_free(numneigh);
#else
    free(numneigh);
#endif
  }

  if(neighbors) {
#ifdef ALIGNMALLOC
    _mm_free(neighbors);
#else
    free(neighbors);
#endif
  }

  if(bincount) {
    free(bincount);
  }

  if(bins) {
    free(bins);
  }
}
예제 #14
0
/*
 * Program entry point: sets the data up with init(), runs the multiplication
 * through run_multiply(), then releases mat_a, vec_b, vec_c and vec_ref.
 * Always returns 0; the command-line arguments are not used.
 */
int main(int argc, char *argv[])
{
    /* Fill the matrices with some "random" data. */
    init();

    /* Do the actual work. */
    run_multiply();

    /* Hand every buffer back before exiting. */
    _mm_free(mat_a);
    _mm_free(vec_b);
    _mm_free(vec_c);
    _mm_free(vec_ref);

    return 0;
}
/// Releases the eight test buffers (input, output, biases, kernel and their
/// *_ref counterparts) with _mm_free and resets every caller-side pointer to
/// null. Pointers that are already null are skipped.
static void ult_nn_convolution_fixedpoint_comp_both_dealloc(
    int16_t* &input,
    int16_t* &output,
    int32_t* &biases,
    int16_t* &kernel,
    int16_t* &input_ref,
    int16_t* &output_ref,
    int32_t* &biases_ref,
    int16_t* &kernel_ref)
{
    // Free-and-clear for the 16-bit buffers.
    const auto release16 = [](int16_t* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    // Free-and-clear for the 32-bit bias buffers.
    const auto release32 = [](int32_t* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    release16(input);
    release16(output);
    release32(biases);
    release16(kernel);

    release16(input_ref);
    release16(output_ref);
    release32(biases_ref);
    release16(kernel_ref);
}
/// Releases the eight test buffers (input, output, biases, kernel and their
/// *_ref counterparts) with _mm_free and resets every caller-side pointer to
/// null. Pointers that are already null are skipped.
static void ult_nn_fc_both_dealloc(
    int16_t* &input,
    T_output_type* &output,
    int32_t* &biases,
    int16_t* &kernel,
    int16_t* &input_ref,
    T_output_type* &output_ref,
    int32_t* &biases_ref,
    int16_t* &kernel_ref)
{
    // Free-and-clear for the 16-bit input/kernel buffers.
    const auto release16 = [](int16_t* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    // Free-and-clear for the 32-bit bias buffers.
    const auto release32 = [](int32_t* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    // Free-and-clear for the output buffers.
    const auto release_out = [](T_output_type* &buffer)
    {
        if (buffer == 0)
            return;

        _mm_free(buffer);
        buffer = 0;
    };

    release16(input);
    release_out(output);
    release32(biases);
    release16(kernel);

    release16(input_ref);
    release_out(output_ref);
    release32(biases_ref);
    release16(kernel_ref);
}
예제 #17
0
// Assignment: takes over M's dimensions and matrix state.
//  - When this matrix is of type MXT_MEM, its buffer is released (if any),
//    a new nX*nY buffer is allocated and M's elements are copied with memcpy.
//  - Otherwise the matrix simply points at M's storage (no copy is made).
// Assigning a matrix to itself is a no-op; without that guard the MXT_MEM
// path would free the buffer and then copy from the freshly allocated one.
// NOTE(review): the allocation result is not checked before memcpy, as in
// the original; decide how an out-of-memory condition should be reported.
inline UMatrix2D<T>& UMatrix2D<T>::operator  = (UMatrix2D<T>& M)
{
#ifdef _SAFE_ACCESS_
    CheckLocker cl1(GetLocker());
    CheckLocker cl2(M.GetLocker());
#endif //_SAFE_ACCESS_
    if(this == &M)
        return *this;

    nX  = M.GetX();
    nY  = M.GetY();
    if(mt == MXT_MEM) {
        // Drop the old buffer; only the free is conditional on Ptr.
        if(Ptr !=  NULL) {
#ifdef __ICC
            _mm_free(Ptr);
#else
            free(Ptr);
#endif //__ICC
        }
        // A new buffer is always allocated, whatever Ptr was before.
#ifdef __ICC
        Ptr = (T*)(_mm_malloc(sizeof(T)*nX*nY,_ALIGN));
#else
        Ptr = (T*)(malloc(sizeof(T)*nX*nY));
#endif //__ICC
        memcpy(Ptr,M.GetMatrixPtr(),sizeof(T)*nX*nY);
    } else {
        Ptr = M.GetMatrixPtr();
    }

    ms = M.GetMatrixState();

    return *this;
}
예제 #18
0
/*
 * Does a single fwd+bwd fft of size n and checks it against python.
 * To test fftw replace fft_mkl with fft_fftw and set flag to MKL_ALIGN.
 *
 * n    : number of complex points (2*n doubles per array)
 * flag : MKL_NOALIGN starts the work array one double past the 64-byte
 *        aligned allocation; any other value uses the aligned start.
 *
 * Writes DBG/v.dat and DBG/vf.dat, runs test_fft.py on them, and prints the
 * relative error of fwd+bwd against a saved copy of the input. Returns
 * early with a message if the work space cannot be allocated.
 */
void test_mkl(int n, enum mkl_align_flag flag){
	// v and w are 2*n doubles each; the two spare doubles leave room for the one-double misalignment shift
	double *space = (double *)_mm_malloc((4*n+2)*sizeof(double), 64);
	if(space == NULL){
		std::cout<<"\ttest_mkl: cannot allocate work space"<<std::endl;
		return;
	}

	// Defaulting to the aligned start keeps v defined for unexpected flag values.
	double *v = space;
	switch(flag){
	case MKL_ALIGN:
		break;
	case MKL_NOALIGN:
		v = space+1;
		break;
	}
	double *w = v + 2*n;
	
	// w keeps a copy of the input so that the round trip can be compared
	for(int i=0; i < n; i++){
		w[2*i] = v[2*i] = rand()*1.0/RAND_MAX - 0.5;
		w[2*i+1] = v[2*i+1] = rand()*1.0/RAND_MAX - 0.5;
	}

	verify_dir("DBG/");
	array_out(v, 2, n, "DBG/v.dat");
	fft_mkl fft(n);
	fft.fwd(v);
	array_out(v, 2, n, "DBG/vf.dat");
	system("test_fft.py DBG/v.dat DBG/vf.dat");

	fft.bwd(v);
	array_diff(v, w, 2*n);
	double rerror = array_max(v, 2*n)/array_max(w, 2*n);
	std::cout<<"\n\tfwd+bwd error in complex mkl 1D fft"<<std::endl;
	std::cout<<"\tn = "<<n<<std::endl;
	std::cout<<"\trel error = "<<rerror<<std::endl;

	_mm_free(space);
}
예제 #19
0
파일: vmath.c 프로젝트: OpenEdition/bilbo
/* xvm_free:
 *   Release a vector obtained from xvm_new. Plain free() is used when
 *   XVM_ANSI is defined or SSE2 is not available; otherwise the vector is
 *   handed to _mm_free().
 */
void xvm_free(double x[]) {
#if defined(XVM_ANSI) || !defined(__SSE2__)
	free(x);
#else
	_mm_free(x);
#endif
}
예제 #20
0
파일: cache.c 프로젝트: rlugojr/rufus
/*
 * Returns a pointer to the data of sector n, reading it through fs->read()
 * on a cache miss and linking the new entry at the head of fs->sectors.
 *
 * Returns NULL when no memory is available even after libfat_flush(fs), or
 * when fs->read() does not deliver exactly LIBFAT_SECTOR_SIZE bytes.
 */
void *libfat_get_sector(struct libfat_filesystem *fs, libfat_sector_t n)
{
    struct libfat_sector *entry = fs->sectors;

    /* Cache lookup */
    while (entry) {
        if (entry->n == n)
            return entry->data;
        entry = entry->next;
    }

    /* Cache miss: allocate header + payload in one block */
    entry = _mm_malloc(sizeof(struct libfat_sector) + LIBFAT_SECTOR_SIZE, 16);
    if (entry == NULL) {
        /* Flush the cache and try once more before giving up */
        libfat_flush(fs);
        entry = _mm_malloc(sizeof(struct libfat_sector) + LIBFAT_SECTOR_SIZE, 16);
        if (entry == NULL)
            return NULL;
    }

    if (fs->read(fs->readptr, entry->data, LIBFAT_SECTOR_SIZE, n)
            != LIBFAT_SECTOR_SIZE) {
        /* I/O error: do not keep a half-filled entry around */
        _mm_free(entry);
        return NULL;
    }

    /* Push the new entry onto the front of the list */
    entry->n = n;
    entry->next = fs->sectors;
    fs->sectors = entry;

    return entry->data;
}
예제 #21
0
/*
 * Tears down the three pcl_*_mic matrix buffers. The whole body is compiled
 * only when __INTEL_OFFLOAD is defined, and it does nothing at run time
 * unless usemic is set.
 */
void deinit_pcl_dgemm (void)
{
#ifdef __INTEL_OFFLOAD
    /* Nothing to release when the coprocessor path is disabled. */
    if (!usemic)
        return;
    
    /* Offload clause naming all three buffers with the FREE modifier
     * (FREE is a macro defined outside this view). */
    #pragma offload target(mic : 0) \
        in(pcl_a_mic: length(max_pcl_matrix_size*max_pcl_matrix_size) FREE align(64)) \
        in(pcl_b_mic: length(max_pcl_matrix_size*max_pcl_matrix_size) FREE align(64)) \
        in(pcl_c_mic: length(max_pcl_matrix_size*max_pcl_matrix_size) FREE align(64))

    /* NOTE(review): an offload pragma normally binds to the statement that
     * follows it, which here is the first _mm_free call rather than an empty
     * block -- confirm that this is intended. */
    _mm_free (pcl_a_mic);
    _mm_free (pcl_b_mic);
    _mm_free (pcl_c_mic);
#endif
}
예제 #22
0
파일: smart.c 프로젝트: 10se1ucgo/rufus
/*
 * Issues ATA_IDENTIFY_DEVICE to hPhysical through each entry of pt[] in turn
 * until one reports SPT_SUCCESS, then logs whether the returned data
 * advertises SMART commands (dumping the buffer when it does).
 *
 * Returns FALSE only when the identify buffer cannot be allocated; TRUE
 * otherwise, including when every passthrough method failed.
 */
BOOL Identify(HANDLE hPhysical)
{
	ATA_PASSTHROUGH_CMD Command = {0};
	IDENTIFY_DEVICE_DATA* idd;
	int method, status;

	Command.AtaCmd = ATA_IDENTIFY_DEVICE;

	// Compilation stops here when the compiler does not pack the IDENTIFY struct properly
	COMPILE_TIME_ASSERT(sizeof(IDENTIFY_DEVICE_DATA) == 512);

	idd = (IDENTIFY_DEVICE_DATA*)_mm_malloc(sizeof(IDENTIFY_DEVICE_DATA), 0x10);
	if (!idd)
		return FALSE;

	method = 0;
	while (method < ARRAYSIZE(pt)) {
		status = pt[method].fn(hPhysical, &Command, idd, sizeof(IDENTIFY_DEVICE_DATA), SPT_TIMEOUT_VALUE);
		if (status != SPT_SUCCESS) {
			// This method did not work: report it and move on to the next one
			uprintf("No joy with: %s (%s)\n", pt[method].type, SptStrerr(status));
			method++;
			continue;
		}

		uprintf("Success using %s\n", pt[method].type);
		if (!idd->CommandSetSupport.SmartCommands) {
			uprintf("No SMART support\n");
		} else {
			DumpBufferHex(idd, sizeof(IDENTIFY_DEVICE_DATA));
			uprintf("SMART support detected!\n");
		}
		break;
	}

	// Running off the end of the table means no method succeeded
	if (method >= ARRAYSIZE(pt))
		uprintf("NO ATA FOR YOU!\n");

	_mm_free(idd);
	return TRUE;
}
예제 #23
0
/*
 * Offload demo: allocates 800 million doubles on the host, fills them with
 * leibniz_init(), then issues four offloads to mic:0 that allocate the
 * device copy, scale without copying, scale and copy out, and finally copy
 * in, sum and free the device copy. One extra hostmic_scale() runs on the
 * host between the last two offloads. Prints the byte count, the host
 * pointer and the final sum.
 *
 * Returns early (after mic_exit()) if the host buffer cannot be allocated.
 */
void leibniz3(){
	int nmic;
	mic_init(nmic);
	assrt(nmic > 0);
	long n = 1l*1000*1000*800;
	long nbytes = n*8;
	printf("            nbytes = %ld\n",nbytes);
	double* v = (double *)_mm_malloc(nbytes, 64);
	if(v == NULL){
		// a request this large can fail; leibniz_init would write through NULL
		printf("    leibniz3: cannot allocate host buffer\n");
		mic_exit();
		return;
	}
	leibniz_init(v, n);
	printf("    host pointer v = %p \n", v);

	// allocate on the device and copy in; keep the device copy alive
#pragma offload target(mic:0)					\
	in(v:length(n) align(64) alloc_if(1) free_if(0))
	{}

	// reuse the device copy without any transfer
#pragma offload target(mic:0) nocopy(v:length(n) alloc_if(0) free_if(0))
	hostmic_scale(v, n);

	// scale again on the device and copy the result back to the host
#pragma offload target(mic:0)					\
	out(v:length(n) align(64) alloc_if(0) free_if(0))
	hostmic_scale(v, n);
	
	// no pragma here, so this call is not offloaded
	hostmic_scale(v, n);
	
	// copy in, sum on the device, release the device copy
	double sum;
#pragma offload target(mic:0)					\
	in(v:length(n) align(64) alloc_if(0) free_if(1))
	sum = hostmic_sum(v, n);

	printf("               sum = %f\n", sum);
	_mm_free(v);
	mic_exit();
}
예제 #24
0
/*
 * Offload demo: allocates 800 million doubles on the host, fills them with
 * leibniz_init(), then starts one offload to mic:0 (tagged with signal(v))
 * that scales the array three times and sums it. The host blocks on the
 * matching offload_wait before printing the sum.
 *
 * Returns early (after mic_exit()) if the host buffer cannot be allocated.
 */
void leibniz2(){
	int nmic;
	mic_init(nmic);
	long n = 1l*1000*1000*800;
	long nbytes = n*8;
	double* v = (double *)_mm_malloc(nbytes, 64);
	if(v == NULL){
		// a request this large can fail; leibniz_init would write through NULL
		printf("    leibniz2: cannot allocate host buffer\n");
		mic_exit();
		return;
	}
	leibniz_init(v, n);
	
	double sum=-1;
#pragma offload target(mic:0)					\
	in(v:length(n) align(64))				\
	signal(v)
	{
		hostmic_scale(v, n);
		hostmic_scale(v, n);
		hostmic_scale(v, n);
		sum = hostmic_sum(v, n);
	}

	// wait for the offload tagged with v before reading sum
#pragma offload_wait target(mic:0)  wait(v)
	printf("    leibniz2:  sum = %f\n", sum);

	_mm_free(v);
	mic_exit();
}
예제 #25
0
/*
 * Driver shutdown: calls VC_Exit(), frees audiobuffer, deletes the pipeout
 * file writer, and closes pipefile.
 *
 * How pipefile is closed depends on the platform macros:
 *  - neither unix nor (__APPLE__ and __MACH__): pclose()/_pclose(), plus
 *    _fsetmode(stdout,"t") under __EMX__;
 *  - unix or macOS: fclose(), then waitpid() on pid, retrying on EINTR.
 */
static void pipe_Exit(void)
{
#if defined unix || (defined __APPLE__ && defined __MACH__)
	/* Only needed by the waitpid() branch below. */
	int pstat;
	pid_t pid2;
#endif

	VC_Exit();
	_mm_free(audiobuffer);
	if(pipeout) {
		_mm_delete_file_writer(pipeout);
		pipeout=NULL;
	}
	if(pipefile) {
#if !defined unix && (!defined __APPLE__ || !defined __MACH__)
#ifdef __WATCOMC__
		_pclose(pipefile);
#else
		pclose(pipefile);
#endif
#ifdef __EMX__
		_fsetmode(stdout,"t");
#endif
#else
		fclose(pipefile);
		/* Wait for the process identified by pid; restart the wait when a
		 * signal interrupts it. */
		do {
			pid2=waitpid(pid,&pstat,0);
		} while (pid2==-1 && errno==EINTR);
#endif
		pipefile=NULL;
	}
}
예제 #26
0
/*
 * Times chain_walk() over an array of 10^9 long ints filled with rand(),
 * then prints the cycles per access (TimeStamp tic/toc divided by the access
 * count), the value returned by countrepeats() and the value returned by
 * probNoR().
 *
 * Returns early with a message if the array cannot be allocated.
 */
void time_chain() {
    /*
     * 10^9 random entries
     */
    long int *list;
    long int n = 1000*1000*1000;
    list = (long int*)_mm_malloc(n*sizeof(long int), 64);
    if (list == NULL) {
        /* A request this large can fail; the fill loop would write through NULL. */
        printf("\t\t\t cannot allocate array of size 10^9\n");
        return;
    }
    for(long int i=0; i < n; i++)
        list[i] = rand();

    printf("\t\t\t chained access of array of size 10^9\n");
    printf("\t\t\t each entry is in [0,RAND_MAX]\n");
    printf("\t\t\t10^9/RAND_MAX = %f\n",1.0*n/RAND_MAX);

    int count = 6000;

    printf("\t\t\t number of accesses = %d\n", count);

    TimeStamp clk;
    clk.tic();
    /* The result is not printed; the call itself is what gets timed. */
    double xx = chain_walk(list, n, count);
    double cycles = clk.toc();
    (void)xx;

    printf("\tcycles per access       =  %f\n", cycles/count);
    int repeats = countrepeats(list, n, count);
    printf("\tnumber of repeats       =  %d\n", repeats);
    double prob = probNoR(n, count);
    printf("\ttheor prob of 0 repeats =  %f\n\n", prob);

    _mm_free(list);
}
예제 #27
0
파일: md5.c 프로젝트: clcarwin/mdrotor
/*
 * Tears down the engine's private SSE context: frees its two result
 * containers (res via result128_free, sres via ssresult_free) and then the
 * context block itself.
 */
static void release(struct EngineThread *eng)
{
	SSE_CTX *state = eng->priv;

	/* Members first, the owning block last. */
	result128_free(state->res);
	ssresult_free(state->sres);

	_mm_free(state);
}
예제 #28
0
파일: drv_nds_sw.c 프로젝트: sypherce/dslua
/*
 * Driver start-up: forces software mixing in mono, allocates the IPC block
 * and its BUFFERSIZE sample buffer, initialises the virtual mixer with
 * VC_Init(), fills in the IPC fields and sends NDS_SW_CMD_INIT together with
 * the IPC address through MikMod9_SendCommand().
 *
 * Returns 0 on success and 1 on failure. MikMod_errno is set to
 * MMERR_OUT_OF_MEMORY when an allocation fails.
 */
static BOOL NDS_SW_Init(void)
{
	/* Software music + sound effects, stereo switched off. */
	md_mode |= DMODE_SOFT_MUSIC | DMODE_SOFT_SNDFX;
	md_mode &= ~DMODE_STEREO;

	ipc = (NDS_SW_IPC*)_mm_malloc(sizeof(NDS_SW_IPC));
	if (!ipc) {
		MikMod_errno = MMERR_OUT_OF_MEMORY;
		return 1;
	}

	ipc->buffer = (SBYTE*)_mm_malloc(BUFFERSIZE);
	if (!ipc->buffer) {
		/* Do not keep an IPC block without a sample buffer. */
		_mm_free(ipc);
		ipc = NULL;
		MikMod_errno = MMERR_OUT_OF_MEMORY;
		return 1;
	}

	if (VC_Init())
		return 1;

	ipc->bufferSize = BUFFERSIZE;
	ipc->sampleRate = md_mixfreq;
	if (md_mode & DMODE_16BITS)
		ipc->format = 16;
	else
		ipc->format = 8;

	/* Command in the top four bits, IPC address in the remaining ones. */
	MikMod9_SendCommand(NDS_SW_CMD_INIT << 28 | (u32)ipc);

	return 0;
}
예제 #29
0
/// Releases a block according to the allocation mechanism recorded in
/// m_alloc_mech. A null arg_alloc_ptr is ignored.
///
/// @param arg_alloc_ptr   pointer to release
/// @param arg_alloc_size  size handed to munmap; the parameter is only named
///                        (and only used) when KOKKOS_IMPL_POSIX_MMAP_FLAGS
///                        is defined
void HostSpace::deallocate( void * const arg_alloc_ptr
    , const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
    arg_alloc_size
#endif
    ) const
{
  if ( arg_alloc_ptr ) {

    if ( m_alloc_mech == STD_MALLOC ) {
      // The pointer that is passed to free() is read from the void* slot
      // located immediately before arg_alloc_ptr.
      void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
      free( alloc_ptr );
    }

    // Each of the remaining mechanisms exists only when its macro is defined.
#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
      _mm_free( arg_alloc_ptr );
    }
#endif

#if defined( KOKKOS_ENABLE_POSIX_MEMALIGN )
    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
      free( arg_alloc_ptr );
    }
#endif

#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
    else if ( m_alloc_mech == POSIX_MMAP ) {
      munmap( arg_alloc_ptr , arg_alloc_size );
    }
#endif

  }
}
예제 #30
0
	/**
	 * Returns a block to the pool of reusable blocks, or releases it with
	 * _mm_free when the pool cannot take it.
	 *
	 * The pool (m_free_memory, HMM_MAX_FREE_OBJECTS entries) is kept ordered
	 * by capacity: the block is inserted in front of the first entry that is
	 * empty or has a larger capacity, and the entries from that position on
	 * are shifted up by one.
	 *
	 * @param ptr       block to recycle
	 * @param capacity  capacity recorded for the block
	 */
	void FreeMemory ( void * ptr, size_t capacity )
	{
		if( !m_memory_init ) InitPool ();
		else if (IsPoolFull())
		{
			// The block is gone after this call, so it must not be stored in
			// the pool as well (that would hand out freed memory later).
			_mm_free(ptr);
			return;
		}

		// find the best place (insertion sort)

		FreeMemoryHolder* it(m_free_memory);
		FreeMemoryHolder * const it_end(&(m_free_memory[HMM_MAX_FREE_OBJECTS]));
		do
		{
			if (it->m_ptr == nullptr || it->m_capacity > capacity)
			{
				break;
			}
		} while (++it != it_end);

		if (it == it_end)
		{
			// No usable slot: storing at it_end would write past the array.
			_mm_free(ptr);
			return;
		}

		// move other containers up by 1 index

		FreeMemoryHolder* it2(it_end);
		FreeMemoryHolder* it3(it2 - 2);
		while (--it2 != it)
		{
			it2->Copy(it3);
			it3->Zero();
			--it3;
		}
		
		it->m_ptr = ptr;
		it->m_capacity = capacity;
	}