Exemplo n.º 1
0
void* FMallocTBB::Malloc( SIZE_T Size, uint32 Alignment )
{
	IncrementTotalMallocCalls();

	MEM_TIME(MemTime -= FPlatformTime::Seconds());

	void* NewPtr = NULL;
	if( Alignment != DEFAULT_ALIGNMENT )
	{
		Alignment = FMath::Max(Size >= 16 ? (uint32)16 : (uint32)8, Alignment);
		NewPtr = scalable_aligned_malloc( Size, Alignment );
	}
	else
	{
		NewPtr = scalable_malloc( Size );
	}

	if( !NewPtr && Size )
	{
		OutOfMemory(Size, Alignment);
	}
#if UE_BUILD_DEBUG || UE_BUILD_DEVELOPMENT
	else if (Size)
	{
		FMemory::Memset(NewPtr, DEBUG_FILL_NEW, Size); 
	}
#endif
	MEM_TIME(MemTime += FPlatformTime::Seconds());
	return NewPtr;
}
Exemplo n.º 2
0
    void* alignedMalloc(size_t size, size_t align)
    {
      assert((align & (align-1)) == 0);
#if defined(TASKING_TBB)
      return scalable_aligned_malloc(size,align);
#else
#  ifdef _WIN32
      return _aligned_malloc(size, align);
#  else // __UNIX__
      return _mm_malloc(size, align);
#  endif
#endif
    }
Exemplo n.º 3
0
// regression test against incorrect work of msize/realloc
// for aligned objects
void TestAlignedMsize()
{
    const int NUM = 4;
    char *p[NUM];
    size_t objSizes[NUM];
    size_t allocSz[] = {4, 8, 512, 2*1024, 4*1024, 8*1024, 16*1024, 0};
    size_t align[] = {8, 512, 2*1024, 4*1024, 8*1024, 16*1024, 0};

    for (int a=0; align[a]; a++)
        for (int s=0; allocSz[s]; s++) {
            for (int i=0; i<NUM; i++) {
                p[i] = (char*)scalable_aligned_malloc(allocSz[s], align[a]);
                ASSERT(is_aligned(p[i], align[a]), NULL);
            }

            for (int i=0; i<NUM; i++) {
                objSizes[i] = scalable_msize(p[i]);
                ASSERT(objSizes[i] >= allocSz[s],
                       "allocated size must be not less than requested");
                memset(p[i], i, objSizes[i]);
            }
            for (int i=0; i<NUM; i++) {
                for (unsigned j=0; j<objSizes[i]; j++)
                    ASSERT(((char*)p[i])[j] == i, "Error: data broken");
            }

            for (int i=0; i<NUM; i++) {
                p[i] = (char*)scalable_aligned_realloc(p[i], 2*allocSz[s], align[a]);
                ASSERT(is_aligned(p[i], align[a]), NULL);
                memset((char*)p[i]+allocSz[s], i+1, allocSz[s]);
            }
            for (int i=0; i<NUM; i++) {
                for (unsigned j=0; j<allocSz[s]; j++)
                    ASSERT(((char*)p[i])[j] == i, "Error: data broken");
                for (size_t j=allocSz[s]; j<2*allocSz[s]; j++)
                    ASSERT(((char*)p[i])[j] == i+1, "Error: data broken");
            }
            for (int i=0; i<NUM; i++)
                scalable_free(p[i]);
        }
}
Exemplo n.º 4
0
void* safer_aligned_malloc( size_t size, size_t alignment )
{
    // workaround for "is power of 2 pow N" bug that accepts zeros
    return scalable_aligned_malloc( size, alignment>sizeof(size_t*)?alignment:sizeof(size_t*) );
}
Exemplo n.º 5
0
int main(int argc, char* argv[])
{
    double
	sTime, eTime;

    double sum_delta  = 0.0;
    double sum_ref    = 0.0;
    double max_delta  = 0.0;
    double sumReserve = 0.0;

    printf("Monte Carlo European Option Pricing Single Precision\n\n");
    printf("Compiler Version  = %d\n", __INTEL_COMPILER/100);
    printf("Release Update    = %d\n", __INTEL_COMPILER_UPDATE);
    printf("Build Time        = %s %s\n", __DATE__, __TIME__);
    printf("Path Length       = %d\n", RAND_N);
    printf("Number of Options = %d\n", OPT_N);
    printf("Block Size        = %d\n", RAND_BLOCK_LENGTH);
    printf("Worker Threads    = %d\n\n", NTHREADS);

    const int mem_size  = sizeof(float)*OPT_PER_THREAD;

#ifndef _OPENMP
    NTHREADS = 1;
#endif

    float *samples[MAX_THREADS];
    VSLStreamStatePtr Streams[MAX_THREADS];
    const int nblocks = RAND_N/RAND_BLOCK_LENGTH;
#pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref) reduction(+ : sumReserve) reduction(max : max_delta)
{
#ifdef _OPENMP
    int threadID = omp_get_thread_num();
#else
    int threadID = 0;
#endif
    unsigned int randseed = RANDSEED + threadID;
    srand(randseed);
    float *CallResultList     = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *CallConfidenceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *StockPriceList     = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *OptionStrikeList   = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    float *OptionYearsList    = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN);
    for(int i = 0; i < OPT_PER_THREAD; i++)
    {
        CallResultList[i]     = 0.0f;
        CallConfidenceList[i] = 0.0f;
        StockPriceList[i]     = RandFloat_T(5.0f, 50.0f, &randseed);
        OptionStrikeList[i]   = RandFloat_T(10.0f, 25.0f, &randseed);
        OptionYearsList[i]    = RandFloat_T(1.0f, 5.0f, &randseed);
    }
    samples[threadID] = (float *)scalable_aligned_malloc(RAND_BLOCK_LENGTH * sizeof(float), SIMDALIGN);
    vslNewStream(&(Streams[threadID]), VSL_BRNG_MT2203 + threadID, RANDSEED);

#pragma omp barrier
    if (threadID == 0)
    {
        printf("Starting options pricing...\n");
        sTime = second();
        start_cyc = _rdtsc();
    }

    for(int opt = 0; opt < OPT_PER_THREAD; opt++)
    {
        const float VBySqrtT = VLog2E * sqrtf(OptionYearsList[opt]);
	const float MuByT    = MuLog2E * OptionYearsList[opt];
        const float Y        = StockPriceList[opt];
        const float Z        = OptionStrikeList[opt];
		            
        float v0 = 0.0f;
        float v1 = 0.0f;
        for(int block = 0; block < nblocks; ++block)
        {
            float *rand = samples[threadID];
            vsRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, Streams[threadID], RAND_BLOCK_LENGTH, rand, MuByT, VBySqrtT); 
#pragma vector aligned
#pragma simd reduction(+:v0) reduction(+:v1)
#pragma unroll(4)
            for(int i=0; i < RAND_BLOCK_LENGTH; i++) 
            {
                float callValue  = Y * exp2f(rand[i]) - Z;
                callValue = (callValue > 0.0) ? callValue : 0.0;
                v0 += callValue;
                v1 += callValue * callValue;
            }
        }
        const float  exprt      = exp2f(RLog2E*OptionYearsList[opt]);
        CallResultList[opt]     = exprt * v0 * INV_RAND_N;
        const float  stdDev     = sqrtf((F_RAND_N * v1 - v0 * v0) * STDDEV_DENOM);
        CallConfidenceList[opt] = (float)(exprt * stdDev * CONFIDENCE_DENOM);
    } //end of opt 

#pragma omp barrier
    if (threadID == 0) {
        end_cyc = _rdtsc();
        eTime = second();
        printf("Parallel simulation completed in %f seconds.\n", eTime-sTime);
        printf("Validating the result...\n");
    }

    double delta = 0.0, ref = 0.0, L1norm = 0.0;
    int max_index = 0;
    double max_local  = 0.0;
    for(int i = 0; i < OPT_PER_THREAD; i++)
    {
        double callReference, putReference;
        BlackScholesBodyCPU(
            callReference,
            putReference,
            StockPriceList[i],
            OptionStrikeList[i], OptionYearsList[i],  RISKFREE, VOLATILITY );
        ref   = callReference;
        delta = fabs(callReference - CallResultList[i]);
        sum_delta += delta;
        sum_ref   += fabs(ref);
        if(delta > 1e-6)
             sumReserve += CallConfidenceList[i] / delta;
        max_local = delta>max_local? delta: max_local;
    }
    max_delta = max_local>max_delta? max_local: max_delta;
    vslDeleteStream(&(Streams[threadID]));
    scalable_aligned_free(samples[threadID]);
    scalable_aligned_free(CallResultList);
    scalable_aligned_free(CallConfidenceList);
    scalable_aligned_free(StockPriceList);
    scalable_aligned_free(OptionStrikeList);
    scalable_aligned_free(OptionYearsList);
}//end of parallel block

    sumReserve          /= (double)OPT_N;
    const double L1norm  = sum_delta / sum_ref;

    printf("L1_Norm          = %4.3E\n", L1norm);
    printf("Average RESERVE  = %4.3f\n", sumReserve);
    printf("Max Error        = %4.3E\n", max_delta);

    const unsigned long long cyc       = end_cyc - start_cyc;
    const double             optcyc    = (double)cyc/(double)OPT_N;

    printf("==========================================\n");
    printf("Total Cycles = %lld\n", cyc);
    printf("Cyc/opt      = %8.3f\n", optcyc);
    printf("Time Elapsed = %8.3f\n", eTime-sTime);
    printf("Options/sec  = %8.3f\n", OPT_N/(eTime-sTime));
    printf("==========================================\n");
    return 0;
}