void MainWindow::randomizeSigma_1() { double x1; VSLStreamStatePtr stream; vslNewStream( &stream, VSL_BRNG_MCG31, this->seed ); vdRngUniform( 0, stream, model->nI(), &model->Sigma[0], 0.0, 1.0 ); vslDeleteStream( &stream ); for (int i=0; i < model->nI(); ++i) { std::pair<int,int> ends = model->ends(i); int from = ends.first; int to = ends.second; std::pair<double,double> xy0 = model->xy(from); std::pair<double,double> xy1 = model->xy(to); if (xy0.first==0 && xy1.first==0 || xy0.first==0 && xy1.first==1 || xy0.first==1 && xy1.first==0 || xy0.first==1 && xy1.first==1 || xy0.first==model->xmax() && xy1.first==model->xmax() || xy0.first==model->xmax()-1 && xy1.first==model->xmax() || xy0.first==model->xmax() && xy1.first==model->xmax()-1 || xy0.first==model->xmax()-1 && xy1.first==model->xmax()-1 ) { model->Sigma[i]=this->sigmaU; } else {x1=model->Sigma[i]; if (x1 < this->fraction) model->Sigma[i] = CUTOFF_SIGMA; else model->Sigma[i] =1; } } }
// Re-seeds the global generator g_RNG: deletes its current VSL stream and
// creates a new one of type _VSL_UNI_METHOD_ seeded with `s`.
// The status codes returned by both VSL calls are ignored.
PNL_BEGIN void pnlSeed(int s) { vslDeleteStream(&g_RNG.m_vslStream); vslNewStream(&g_RNG.m_vslStream, _VSL_UNI_METHOD_, s); }
static void bernoulli_generate(int n, double p, int* r) { int seed = 17 + caffe_rng_rand() % 4096; #ifdef _OPENMP int nthr = omp_get_max_threads(); int threshold = nthr * caffe::cpu::OpenMpManager::getProcessorSpeedMHz() / 3; bool run_parallel = (Caffe::mode() != Caffe::GPU) && (omp_in_parallel() == 0) && (n >= threshold); if (!run_parallel) nthr = 1; # pragma omp parallel num_threads(nthr) { const int ithr = omp_get_thread_num(); const int avg_amount = (n + nthr - 1) / nthr; const int my_offset = ithr * avg_amount; const int my_amount = std::min(my_offset + avg_amount, n) - my_offset; #else { const int my_amount = n; const int my_offset = 0; #endif VSLStreamStatePtr stream; vslNewStream(&stream, VSL_BRNG_MCG31, seed); vslSkipAheadStream(stream, my_offset); viRngBernoulli(VSL_RNG_METHOD_BERNOULLI_ICDF, stream, my_amount, r + my_offset, p); vslDeleteStream(&stream); } }
// Re-seeds both random number generators held by the Caffe object returned by Get().
// The cuRAND generator is destroyed, recreated as CURAND_RNG_PSEUDO_DEFAULT and seeded with
// `seed`; then the VSL stream is deleted and recreated as VSL_BRNG_MT19937 with the same seed.
// Each library call is wrapped in CURAND_CHECK or VSL_CHECK.
void Caffe::set_random_seed(unsigned int seed) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, seed)); VSL_CHECK(vslDeleteStream(&Get().vsl_stream_)); VSL_CHECK(vslNewStream(&Get().vsl_stream_, VSL_BRNG_MT19937, seed)); }
/*
 * JNI binding for vslNewStream.
 *
 * Creates a VSL stream with basic generator `brng` and seed `seed`, stores the
 * resulting native handle in the Java object `jstream` via setStream(), and
 * returns the VSL status code.
 *
 * The handle starts out as a null pointer so that, if vslNewStream fails
 * without writing to it, the Java side receives a null handle rather than an
 * indeterminate one.
 */
JNIEXPORT jint JNICALL Java_edu_berkeley_bid_VSL_vslNewStream
(JNIEnv *env, jclass clazz, jobject jstream, jint brng, jint seed)
{
  VSLStreamStatePtr streamp = 0;
  int status = vslNewStream(&streamp, brng, seed);
  setStream(env, clazz, jstream, streamp);
  return (jint)status;
}
/*
 * Molecular-dynamics driver: places n particles uniformly in the domain with
 * rand(), gives them velocities scaled by ms, then runs mt steps of
 * Force / Solver / Output_energy, writing coordinates every dtxyz steps and
 * once more after the loop.
 *
 * Returns 0 on success, EXIT_FAILURE if the particle arrays cannot be
 * allocated. All arrays (and the MKL stream, when MKLRNG is defined) are
 * released before returning.
 */
int main()
{
  const int n = 500;      // Number of atoms, molecules
  const int mt = 20;      // Max time steps
  const int dtxyz = 100;  // Time interval to output xyz
  int i;
  int j;
  double *x;
  double *v;
  double *f;
  const double domain = 300;                // Domain size (a.u.)
  const double dt = 10;                     // Time interval (a.u.)
  const double ms = 0.0;                    // Max speed (a.u.)
  const double em = 1822.88839 * 28.0134;   // Effective mass of N2
  const double lje = 0.000313202;           // Lennard-Jones epsilon of N2
  const double ljs = 6.908841465;           // Lennard-Jones sigma of N2
#ifdef MKLRNG
  // The stream is created here but the initialization below still uses rand().
  VSLStreamStatePtr stream;
  vslNewStream(&stream, VSL_BRNG_MT19937, 5489);  // Initiation, type, seed
  //vslNewStream(&stream, VSL_BRNG_SFMT19937, 5489);  // Initiation, type, seed
#endif

  x = (double *) malloc(n * 3 * sizeof(double));
  v = (double *) malloc(n * 3 * sizeof(double));
  f = (double *) malloc(n * 3 * sizeof(double));
  if (x == NULL || v == NULL || f == NULL) {
    fprintf(stderr, "Error: failed to allocate particle arrays\n");
    free(x);
    free(v);
    free(f);
#ifdef MKLRNG
    vslDeleteStream(&stream);
#endif
    return EXIT_FAILURE;
  }

  // Initialization
  for (i=0; i<n; i++)
    for (j=0; j<3; j++)
      x[i*3+j] = domain * rand() / (RAND_MAX + 1.0);
  for (i=0; i<n; i++)
    for (j=0; j<3; j++)
      v[i*3+j] = ms * (rand() / (RAND_MAX + 1.0) - 0.5);

  // Dynamics
  printf("# Index dTime KinEng PotEng TotEng\n");
  for (i=0; i<mt; i++) {
    Force(n, lje, ljs, x, f);
    Solver(n, dt, em, x, v, f);
    Output_energy(i, n, dt, em, lje, ljs, x, v);
    if (i % dtxyz == 0) Output_xyz(i, n, x);
  }
  Output_xyz(i, n, x);

  // Release everything acquired above.
  free(x);
  free(v);
  free(f);
#ifdef MKLRNG
  vslDeleteStream(&stream);
#endif
  return 0;
}
// Default constructor: starts with mode_ = CPU, phase_ = TRAIN and all library handles NULL,
// then creates the cuBLAS handle, a cuRAND generator (CURAND_RNG_PSEUDO_DEFAULT, seed 1701ULL)
// and a VSL MT19937 stream (seed 1701). Each library call is wrapped in its *_CHECK macro.
Caffe::Caffe() : mode_(Caffe::CPU), phase_(Caffe::TRAIN), cublas_handle_(NULL), curand_generator_(NULL), vsl_stream_(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); //TODO: original caffe code has bug here! CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, 1701ULL)); VSL_CHECK(vslNewStream(&vsl_stream_, VSL_BRNG_MT19937, 1701)); }
/*
 * Allocates a vector of `size` doubles and fills it with Gaussian samples
 * (mean 1.0, sigma 3.0, Box-Muller method) from an MT19937 stream.
 *
 * size : number of elements to generate
 * i    : multiplier applied to time(0) to form the seed; note that i == 0
 *        therefore always yields seed 0
 *
 * Returns a buffer obtained from calloc that the caller must free(), or NULL
 * when the allocation fails (previously the NULL pointer was handed straight
 * to vdRngGaussian).
 */
double * get_vector(int size, int i)
{
  double *vec;
  VSLStreamStatePtr stream;

  vec = (double *)calloc(size, sizeof(double));
  if (vec == NULL)
    return NULL;

  vslNewStream( &stream, VSL_BRNG_MT19937, i*time(0) );
  vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, size, vec, 1.0, 3.0 );
  vslDeleteStream( &stream );

  return vec;
}
void GeneticAlgorithm::initializeRandomNumberGenerators(){ SYSTEMTIME t; GetLocalTime(&t); unsigned int max = _nPopulation - 1; vslNewStream( & stream, VSL_BRNG_SFMT19937, t.wMilliseconds ); ints1 = new int[_nPopulation]; ints2 = new int[_nPopulation]; ints3 = new int[_nPopulation]; shuffleIndex = new int[_nPopulation]; }
void MainWindow::randRcr() { int i_Rcr = model->index_of_Rcr(); elementCr = fabs((model->I[ i_Rcr ])); this->sigmaMin=model->Sigma[i_Rcr ]; VSLStreamStatePtr stream; vslNewStream( &stream, VSL_BRNG_MCG31, this->seed ); vdRngUniform( 0, stream, model->nI(), &model->Sigma[0], 0.0, 1.0 ); vslDeleteStream( &stream ); double x1=model->Sigma[i_Rcr]; this->randc=x1; this->rand=x1; }
void GeneticAlgorithm2::initializeRandomNumberGenerators(){ SYSTEMTIME t; GetLocalTime(&t); //_randomNumberGenerator = new boost::random::mt19937(t.wMilliseconds); //_randomNumberGenerator = new boost::random::mt19937(0); // _doubleDistribution = new boost::random::uniform_int_distribution<>(0, RAND_MAX); unsigned int max = _nPopulation - 1; // _integerDistribution = new boost::random::uniform_int_distribution<>(0,max); vslNewStream( & stream, VSL_BRNG_SFMT19937, t.wMilliseconds ); ints1 = new int[_nPopulation]; ints2 = new int[_nPopulation]; ints3 = new int[_nPopulation]; }
int main(){ unsigned int iter=200000000; int i,j; double x, y; double dUnderCurve=0.0; double pi=0.0; VSLStreamStatePtr stream; //You need one stream for each thread double end_time,start_time; start_time=clock(); #pragma omp parallel private(stream,x,y,i) reduction(+:dUnderCurve) { double r[BLOCK_SIZE*2]; //Careful!!! //you need a private copy of whole array for each thread vslNewStream( &stream, BRNG, (int)clock() ); #pragma omp for for(j=0;j<iter/BLOCK_SIZE;j++) { vdRngUniform( METHOD, stream, BLOCK_SIZE*2, r, 0.0, 1.0 ); //Create random numbers into array r for (i=0;i<BLOCK_SIZE;i++) { x=r[i]; //X Coordinate y=r[i+BLOCK_SIZE]; //Y Coordinate if (x*x + y*y <= 1.0) { //is distance from Origin under Curve dUnderCurve++; } } } vslDeleteStream( &stream ); } pi = dUnderCurve / (double) iter * 4 ; end_time=clock(); printf ("pi = %10.9f\n", pi); printf ("Seconds = %10.9f\n",(double)((end_time-start_time)/1000.0)); return 0; }
int main(int argc, char **argv) { long i; long Ncirc = 0; double pi, xy[2]; double r = 1.0; // radius of circle double r2 = r*r; int rank, size, manager = 0; MPI_Status status; long my_trials, temp; int j; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &size); MPI_Comm_rank(MPI_COMM_WORLD, &rank); VSLStreamStatePtr stream; my_trials = num_trials/size; if (num_trials%(long)size > (long)rank) my_trials++; vslNewStream(&stream, VSL_BRNG_MT2203+rank, 1); for (i = 0; i < my_trials; i++) { vdRngUniform(VSL_RNG_METHOD_UNIFORMBITS_STD, stream, 2, xy, 0.0, 1.0); if ((xy[0]*xy[0] + xy[1]*xy[1]) <= r2) Ncirc++; } if (rank == manager) { for (j = 1; j < size; j++) { MPI_Recv(&temp, 1, MPI_LONG, j, j, MPI_COMM_WORLD, &status); Ncirc += temp; } pi = 4.0 * ((double)Ncirc)/((double)num_trials); printf("\n \t Computing pi using MPI and MKL for random number generator: \n"); printf("\t For %ld trials, pi = %f\n", num_trials, pi); printf("\n"); } else { MPI_Send(&Ncirc, 1, MPI_LONG, manager, rank, MPI_COMM_WORLD); } MPI_Finalize(); return 0; }
void hard_mkl() { /*char *results_file = "hard_mkl.txt"; FILE *res; if((res=fopen(results_file, "w"))==NULL) { printf("Can't open file %s.\n", results_file); exit(1); }*/ for(int i = 10; i <= ARRAY_SIZE; i*=10) { VSLStreamStatePtr stream; vslNewStream( &stream, VSL_BRNG_MT19937, i*time(0) ); double *ar1, *ar2, *ar3, *ar4, *ar5, *ar6; ar1 = (double *)malloc(i*sizeof(double)); ar2 = (double *)malloc(i*sizeof(double)); ar3 = (double *)malloc(i*sizeof(double)); ar4 = (double *)malloc(i*sizeof(double)); ar5 = (double *)malloc(i*sizeof(double)); ar6 = (double *)malloc(i*sizeof(double)); vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, i, ar1, 1.0, 3.0 ); vdRngGaussian( VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, i, ar2, 1.0, 3.0 ); double start = omp_get_wtime(); for(int j = 0; j < EXPERIMENTS_NUM; j++) { vdCos (i, ar1, ar3); vdLn (i, ar1, ar4); vdPow (i, ar1, ar2, ar5); vdCosh(i, ar2, ar6); } double end = omp_get_wtime(); free(ar1); free(ar2); free(ar3); free(ar4); free(ar5); free(ar6); //fprintf(res, "%lf\n", end-start); printf("%lf, i=%d\n", end-start, i); vslDeleteStream( &stream ); } //fclose(res); }
int main(int argc, char* argv[]) { // construct lattice unsigned int rows = 1; unsigned int columns = 300; lattice_t* lattice = lattice_create(rows, columns, periodic, periodic, periodic, periodic); // initialise lattice positioning unsigned int const kNumStdDevs = 5; unsigned int const kStdDevsRepeatCount = 1; unsigned int const kRepeatCount = 1000; double stddevs[] = { 0.1, 0.2, 0.3, 0.4, 0.5 }; unsigned int const kTimeSetsNum = 3; unsigned int timeSets[] = { 200, 500, 1000 }; // initialise random number storage VSLStreamStatePtr stream; float randomNumbers[columns]; // initialise temporary node storage double xPosition; double yPosition = 0.0; coordinate_t coord; // initialise loop variables bool trackedLatticeLayout; char latticeLayoutFileName[50]; char latticeProfileFilename[100]; // initialise agent tracking information unsigned int numTrackedAgents = 0; coordinate_t* trackedPositions = NULL; int* trackedAgentIds = NULL; // set motility properties double motilityProbability = 1.0; double xShiftPreference = 0; double yShiftPreference = 0; bool agentExclusion = false; // generation random lattices for (int stdDevIndex = 0; stdDevIndex < kNumStdDevs; stdDevIndex++) { for (int boundRepeatCount = 0; boundRepeatCount < kStdDevsRepeatCount; boundRepeatCount++) { // generate any required random numbers (uniform dist) vslNewStream(&stream, BRNG, arc4random()); vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, columns, randomNumbers, 0.0f, stddevs[stdDevIndex]); // perturb and sort node locations for (int col = 0; col < columns; col++) { randomNumbers[col] += col; } qsort(randomNumbers, columns, sizeof(float), compare); // specify node locations for (int row = 0; row < rows; row++) { for (int col = 0; col < columns; col++) { coord.row = row; coord.column = col; xPosition = (double)randomNumbers[col]; lattice_specify_position(lattice, coord, xPosition, yPosition); } } // save node locations bool saveNodeLocations = true; if (saveNodeLocations) { 
sprintf(latticeLayoutFileName, "node_positions_%0.02f_%d_ghosts.txt", stddevs[stdDevIndex], boundRepeatCount); trackedLatticeLayout = lattice_parser_node_positions(lattice, rows, columns, latticeLayoutFileName, "output/"); if (!trackedLatticeLayout) { printf("Error: failed storing lattice layout information (case: %0.02f %d).\n", stddevs[stdDevIndex], boundRepeatCount); } } // perform simulations bool performSimulation = true; if (performSimulation) { // perform simulations for (int repeatCount = 0; repeatCount < kRepeatCount; repeatCount++) { // populate lattice int* agentId; int currentAgentId = 1; coordinate_t agentPos; for (int j = 130; j < 171; j++) { agentId = malloc(sizeof(int)); *agentId = currentAgentId++; agentPos.row = 0; agentPos.column = j; lattice_push_agent(lattice, agentPos, agentId); } // perform simulation for (int timeStep = 0; timeStep < timeSets[kTimeSetsNum-1]; timeStep++) { performMotilityEvents(lattice, rows, columns, motilityProbability, xShiftPreference, yShiftPreference, agentExclusion, trackedAgentIds, numTrackedAgents, trackedPositions); for (int j = 0; j < kTimeSetsNum; j++) { if (timeStep == timeSets[j]-1) { // store lattice profile sprintf(latticeProfileFilename, "lattice_profile_%0.02f_%d_%d_%d_ghosts.txt", stddevs[stdDevIndex], boundRepeatCount, repeatCount, timeStep+1); bool isTracked = lattice_occupancy_parser(lattice, rows, columns, latticeProfileFilename, "output/"); if (!isTracked) { printf("Error: failed to store lattice profile.\n"); } } } } // clear lattice and deallocate memory lattice_clear(lattice, rows, columns, true); } } } } // deallocate memory lattice_destroy(&lattice, rows, columns, true); return EXIT_SUCCESS; }
// Monte Carlo European call option pricing benchmark (single precision).
//
// Prints the build and run configuration, then, inside one OpenMP parallel region, each thread:
//   - allocates its own option arrays and fills them through RandFloat_T;
//   - creates its own VSL stream (VSL_BRNG_MT2203 + threadID, seed RANDSEED) and sample buffer;
//   - prices OPT_PER_THREAD options, drawing RAND_N/RAND_BLOCK_LENGTH blocks of Gaussian samples per option;
//   - compares every result with BlackScholesBodyCPU and feeds the error sums into the reductions;
//   - deletes its stream and frees its buffers.
// Thread 0 records second() and _rdtsc() at the two barriers that bracket the pricing loop.
// After the region the L1 norm, average reserve, maximum error, cycle counts and throughput are printed.
//
// NOTE(review): samples[] and Streams[] have MAX_THREADS entries but are indexed by threadID;
//               this assumes the team never exceeds MAX_THREADS — confirm.
// NOTE(review): start_cyc / end_cyc are not declared in this function; presumably file-scope variables.
// NOTE(review): `NTHREADS = 1;` under #ifndef _OPENMP requires NTHREADS to be an assignable variable — confirm.
int main(int argc, char* argv[]) { double sTime, eTime; double sum_delta = 0.0; double sum_ref = 0.0; double max_delta = 0.0; double sumReserve = 0.0; printf("Monte Carlo European Option Pricing Single Precision\n\n"); printf("Compiler Version = %d\n", __INTEL_COMPILER/100); printf("Release Update = %d\n", __INTEL_COMPILER_UPDATE); printf("Build Time = %s %s\n", __DATE__, __TIME__); printf("Path Length = %d\n", RAND_N); printf("Number of Options = %d\n", OPT_N); printf("Block Size = %d\n", RAND_BLOCK_LENGTH); printf("Worker Threads = %d\n\n", NTHREADS); const int mem_size = sizeof(float)*OPT_PER_THREAD; #ifndef _OPENMP NTHREADS = 1; #endif float *samples[MAX_THREADS]; VSLStreamStatePtr Streams[MAX_THREADS]; const int nblocks = RAND_N/RAND_BLOCK_LENGTH; #pragma omp parallel reduction(+ : sum_delta) reduction(+ : sum_ref) reduction(+ : sumReserve) reduction(max : max_delta) { #ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif unsigned int randseed = RANDSEED + threadID; srand(randseed); float *CallResultList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN); float *CallConfidenceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN); float *StockPriceList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN); float *OptionStrikeList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN); float *OptionYearsList = (float *)scalable_aligned_malloc(mem_size, SIMDALIGN); for(int i = 0; i < OPT_PER_THREAD; i++) { CallResultList[i] = 0.0f; CallConfidenceList[i] = 0.0f; StockPriceList[i] = RandFloat_T(5.0f, 50.0f, &randseed); OptionStrikeList[i] = RandFloat_T(10.0f, 25.0f, &randseed); OptionYearsList[i] = RandFloat_T(1.0f, 5.0f, &randseed); } samples[threadID] = (float *)scalable_aligned_malloc(RAND_BLOCK_LENGTH * sizeof(float), SIMDALIGN); vslNewStream(&(Streams[threadID]), VSL_BRNG_MT2203 + threadID, RANDSEED); #pragma omp barrier if (threadID == 0) { printf("Starting options pricing...\n"); sTime = second(); start_cyc = 
_rdtsc(); } for(int opt = 0; opt < OPT_PER_THREAD; opt++) { const float VBySqrtT = VLog2E * sqrtf(OptionYearsList[opt]); const float MuByT = MuLog2E * OptionYearsList[opt]; const float Y = StockPriceList[opt]; const float Z = OptionStrikeList[opt]; float v0 = 0.0f; float v1 = 0.0f; for(int block = 0; block < nblocks; ++block) { float *rand = samples[threadID]; vsRngGaussian (VSL_RNG_METHOD_GAUSSIAN_ICDF, Streams[threadID], RAND_BLOCK_LENGTH, rand, MuByT, VBySqrtT); #pragma vector aligned #pragma simd reduction(+:v0) reduction(+:v1) #pragma unroll(4) for(int i=0; i < RAND_BLOCK_LENGTH; i++) { float callValue = Y * exp2f(rand[i]) - Z; callValue = (callValue > 0.0) ? callValue : 0.0; v0 += callValue; v1 += callValue * callValue; } } const float exprt = exp2f(RLog2E*OptionYearsList[opt]); CallResultList[opt] = exprt * v0 * INV_RAND_N; const float stdDev = sqrtf((F_RAND_N * v1 - v0 * v0) * STDDEV_DENOM); CallConfidenceList[opt] = (float)(exprt * stdDev * CONFIDENCE_DENOM); } //end of opt #pragma omp barrier if (threadID == 0) { end_cyc = _rdtsc(); eTime = second(); printf("Parallel simulation completed in %f seconds.\n", eTime-sTime); printf("Validating the result...\n"); } double delta = 0.0, ref = 0.0, L1norm = 0.0; int max_index = 0; double max_local = 0.0; for(int i = 0; i < OPT_PER_THREAD; i++) { double callReference, putReference; BlackScholesBodyCPU( callReference, putReference, StockPriceList[i], OptionStrikeList[i], OptionYearsList[i], RISKFREE, VOLATILITY ); ref = callReference; delta = fabs(callReference - CallResultList[i]); sum_delta += delta; sum_ref += fabs(ref); if(delta > 1e-6) sumReserve += CallConfidenceList[i] / delta; max_local = delta>max_local? delta: max_local; } max_delta = max_local>max_delta? 
max_local: max_delta; vslDeleteStream(&(Streams[threadID])); scalable_aligned_free(samples[threadID]); scalable_aligned_free(CallResultList); scalable_aligned_free(CallConfidenceList); scalable_aligned_free(StockPriceList); scalable_aligned_free(OptionStrikeList); scalable_aligned_free(OptionYearsList); }//end of parallel block sumReserve /= (double)OPT_N; const double L1norm = sum_delta / sum_ref; printf("L1_Norm = %4.3E\n", L1norm); printf("Average RESERVE = %4.3f\n", sumReserve); printf("Max Error = %4.3E\n", max_delta); const unsigned long long cyc = end_cyc - start_cyc; const double optcyc = (double)cyc/(double)OPT_N; printf("==========================================\n"); printf("Total Cycles = %lld\n", cyc); printf("Cyc/opt = %8.3f\n", optcyc); printf("Time Elapsed = %8.3f\n", eTime-sTime); printf("Options/sec = %8.3f\n", OPT_N/(eTime-sTime)); printf("==========================================\n"); return 0; }
/*
 * Molecular-dynamics driver: initializes n particles, then runs mt steps of
 * Force / Solver / Output_energy, writing coordinates every dtxyz steps and
 * once more after the loop.
 *
 * With MKLRNG defined, positions are Gaussian (mean 0.5*domain, sigma
 * domain) and velocities Gaussian (mean 0, sigma 0.5*ms), drawn three at a
 * time from an MT19937 stream seeded with 5489. Otherwise rand() is used.
 *
 * Returns 0 on success, EXIT_FAILURE if the particle arrays cannot be
 * allocated. All arrays and the MKL stream are released before returning.
 */
int main()
{
  const int n = 500;      // Number of atoms, molecules
  const int mt = 100;     // Max time steps
  const int dtxyz = 100;  // Time interval to output xyz
  int i;
  int j;
  double *x;
  double *v;
  double *f;
  const double domain = 300;                // Domain size (a.u.)
  const double dt = 10;                     // Time interval (a.u.)
  const double ms = 0.00001;                // Max speed (a.u.)
  const double em = 1822.88839 * 28.0134;   // Effective mass of N2
  const double lje = 0.000313202;           // Lennard-Jones epsilon of N2
  const double ljs = 6.908841465;           // Lennard-Jones sigma of N2
#ifdef MKLRNG
  VSLStreamStatePtr stream;
  vslNewStream(&stream, VSL_BRNG_MT19937, 5489);  // Initiation, type, seed
  //vslNewStream(&stream, VSL_BRNG_SFMT19937, 5489);  // Initiation, type, seed
#endif

  x = (double *) malloc(n * 3 * sizeof(double));
  v = (double *) malloc(n * 3 * sizeof(double));
  f = (double *) malloc(n * 3 * sizeof(double));
  if (x == NULL || v == NULL || f == NULL) {
    // Out of memory: release whatever was obtained and stop.
    free(x);
    free(v);
    free(f);
#ifdef MKLRNG
    vslDeleteStream(&stream);
#endif
    return EXIT_FAILURE;
  }

  // Initialization
#ifdef MKLRNG
  for (i=0; i<n; i++) {
    int nRN = 3;
    double GRN[3];
    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, nRN, GRN, 0.5 * domain, domain);
    for (j=0; j<3; j++) x[i*3+j] = GRN[j];
    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2, stream, nRN, GRN, 0.0, 0.5 * ms);
    for (j=0; j<3; j++) v[i*3+j] = GRN[j];
  }
#else
  for (i=0; i<n; i++)
    for (j=0; j<3; j++)
      x[i*3+j] = domain * rand() / (RAND_MAX + 1.0);
  for (i=0; i<n; i++)
    for (j=0; j<3; j++)
      v[i*3+j] = ms * (rand() / (RAND_MAX + 1.0) - 0.5);
#endif

  // Dynamics
  for (i=0; i<mt; i++) {
    Force(n, lje, ljs, x, f);
    Solver(n, dt, em, x, v, f);
    Output_energy(i, n, dt, em, lje, ljs, x, v);
    if (i % dtxyz == 0) Output_xyz(i, n, x);
  }
  Output_xyz(i, n, x);

  // Release everything acquired above.
  free(x);
  free(v);
  free(f);
#ifdef MKLRNG
  vslDeleteStream(&stream);
#endif
  return 0;
}
// Creates the VSL stream stored in rng->m_stream, using generator kVSLBRNGMethod and the
// seed read from *seedValue. The status returned by vslNewStream is written to *info.
// All three pointers are dereferenced without a null check.
void rngInit(RngEngine* rng, RngSeedType* seedValue, RngErrorType* info) { *info = vslNewStream(&(rng->m_stream), kVSLBRNGMethod, *seedValue); }
// Constructor: zeroes m_vslStream, then creates a VSL stream of type _VSL_UNI_METHOD_
// with a fixed seed of 0. The status returned by vslNewStream is not checked.
CRNG() { m_vslStream = 0; vslNewStream(&m_vslStream, _VSL_UNI_METHOD_, 0 ); }
// Monte Carlo driver: runs M simulations and reports the fraction whose absolute error `err`
// falls below EPSILON (= X0 * 1e-2).
//
// Per simulation:
//   - Gaussian samples are drawn Ncache at a time into NRV from one MT2203 stream (seed 0);
//   - for each of the nCal = N/Ncache full chunks, an OpenMP region builds BM[] from prefix sums of NRV,
//     derives PX[] from BM[], accumulates a term into `err` through a reduction, and one thread
//     refills NRV for the next chunk;
//   - a second OpenMP region handles the left = N%Ncache remaining steps and then adds three closing
//     terms to `err` from `single nowait` sections;
//   - |err| is compared with EPSILON and `count` is incremented on success.
// Finally the elapsed time, the last err, count, M and count/M are printed and the stream is deleted.
//
// NRV, BM and PX are not declared here; presumably file-scope arrays of at least Ncache+1 elements — confirm.
// NOTE(review): errloc is a per-thread accumulator that is not reset between the three closing
//               `single nowait` sections; a thread that executes more than one of them appears to add its
//               earlier contribution to err a second time — confirm.
// NOTE(review): the inner prefix sum restarts at j = 1 for every i, which makes each chunk quadratic in
//               Ncache — presumably intentional for the SIMD reduction; confirm.
// NOTE(review): errcode from vslNewStream is stored but never inspected.
int main(int argc, char *argv[]) { unsigned long long count = 0; double EPSILON = X0*1.0E-2; double err; double PXend; const double dt = T/N; const double rootdt = sqrt((double)T/N); int nCal = N/Ncache; const int left = N%Ncache; VSLStreamStatePtr stream; int errcode = vslNewStream(&stream, VSL_BRNG_MT2203, 0);//seed=0 start_timer(); for (unsigned long long m = 0; m < M; ++m){ // one-time MC simulation err = 0.0; vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, Ncache, NRV, 0.0f, 1.0f);// leaves the rest of random numbers generated by the idle thread BM[0] = rootdt*NRV[0]; PX[0] = X0; for (int k = 0; k < nCal; ++k){ //rootdt:firstprivate??? #pragma omp parallel default(none) shared(NRV, BM, PX, stream, err, PXend, rootdt, dt, k) { double errloc = 0.0; double upbd, tmp; //GUIDED_CHUNK too large: load imbalance //GUIDED_CHUNK too small: scheduling overhead //#pragma omp for schedule(guided, GUIDED_CHUNK) #pragma omp for schedule(guided) //tunable for (int i = 1; i < Ncache; ++i){ //tmp = BM[0]; tmp = 0.0; #pragma simd reduction(+:tmp) vectorlengthfor(double) assert for (int j = 1; j <= i; ++j){ //tmp += rootdt*NRV[j]; tmp += NRV[j]; } //BM[i] = tmp; BM[i] = BM[0] + tmp*rootdt; //PX[i+1] = X0*exp(-0.5*SIGMA*SIGMA*(k*Ncache+i+1)*dt+SIGMA*tmp); PX[i+1] = X0*exp(-0.5*SIGMA*SIGMA*(k*Ncache+i+1)*dt+SIGMA*BM[i]); } #pragma omp single { PX[1] = X0*exp(-0.5*SIGMA*SIGMA*(k*Ncache+1)*dt+SIGMA*BM[0]); } //maybe vary the scheduling strategy? 
#pragma omp for reduction(+:err) nowait for (int i = 0; i < Ncache; ++i){ int j = k*Ncache+i; double Tj = j*(double)T/N; upbd = (log(PX[i]/K)+0.5*SIGMA*SIGMA*(T-Tj))/(SIGMA*sqrt(T-Tj)); //errloc -= 1/(sqrt(2*PI))*(PX[i+1]-PX[i])*vNormalIntegral(upbd); err += -1/(sqrt(2*PI))*(PX[i+1]-PX[i])*vNormalIntegral(upbd); } #pragma omp single { vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_BOXMULLER, stream, Ncache, NRV, 0.0f, 1.0f);// leaves the rest of random numbers generated by the idle thread }//single }//parallel BM[0] = BM[Ncache-1] + rootdt*NRV[0]; PX[0] = PX[Ncache]; }//for nCal PXend = PX[Ncache]; #pragma omp parallel default(none) shared(NRV, BM, PX, err, rootdt, dt, nCal, left, PXend) { double errloc = 0.0; double upbd, tmp; if(left!=0){ //GUIDED_CHUNK too large: load imbalance //GUIDED_CHUNK too small: scheduling overhead //#pragma omp for schedule(guided, GUIDED_CHUNK) #pragma omp for schedule(guided) //tunable for (int i = 1; i < left; ++i){ //tmp = BM[0]; tmp = 0.0; #pragma simd reduction(+:tmp) vectorlengthfor(double) assert for (int j = 1; j <= i; ++j){ //tmp += rootdt*NRV[j]; tmp += NRV[j]; } //BM[i] = tmp; BM[i] = BM[0] + tmp*rootdt; //PX[i+1] = X0*exp(-0.5*SIGMA*SIGMA*(nCal*Ncache+i+1)*dt+SIGMA*BM[i]); PX[i+1] = X0*exp(-0.5*SIGMA*SIGMA*(nCal*Ncache+i+1)*dt+SIGMA*BM[i]); } #pragma omp single { PX[1] = X0*exp(-0.5*SIGMA*SIGMA*(nCal*Ncache+1)*dt+SIGMA*BM[0]); PXend = PX[left]; } //maybe vary the scheduling strategy? 
#pragma omp for reduction(+:err) nowait for (int i = 0; i < left; ++i){ int j = nCal*Ncache+i; double Tj = j*(double)T/N; upbd = (log(PX[i]/K)+0.5*SIGMA*SIGMA*(T-Tj))/(SIGMA*sqrt(T-Tj)); err += -1/sqrt((2*PI))*(PX[i+1]-PX[i])*vNormalIntegral(upbd); } }//if #pragma omp single nowait { upbd = (log(X0/K) + 0.5*SIGMA*SIGMA*T)/(SIGMA*sqrt(T)); errloc -= X0/(sqrt(2*PI))*vNormalIntegral(upbd); #pragma omp atomic err += errloc; } #pragma omp single nowait { upbd = (log(X0/K) - 0.5*SIGMA*SIGMA*T)/(SIGMA*sqrt(T)); errloc += K/(sqrt(2*PI))*vNormalIntegral(upbd); #pragma omp atomic err += errloc; } #pragma omp single nowait { if(PXend > K) errloc += PXend - K; #pragma omp atomic err += errloc; } }//parallel err = fabs(err); if(err < EPSILON) count++; //printf("err=%.10lf\n",err); }//MC simulation printf ("time %g ms\n", stop_timer()); printf("err=%.20lf\n",err); printf("count=%llu, M=%llu\n", count, M); printf("%.5g\n", (double)count/(double)M); vslDeleteStream(&stream); return 0; }
int main(int argc, char** argv){ double* A; double* B; double* C; double alpha = 1.0; double beta = 0.0; int i; struct timeval t1,t2, t3, t4; const int SEED = 1; const int METHOD = 0; const int BRNG = VSL_BRNG_MCG31; VSLStreamStatePtr stream; int errcode; cublasStatus_t status; cublasHandle_t handle; double a=0.0, b= 1.0; // Uniform distribution between 0 and 1 errcode = vslNewStream(&stream, BRNG, SEED); int width = 100; if (argc > 1){ width = atoi(argv[1]); } /* Allocate memory for A, B, and C */ if (cudaMallocManaged(&A, width * width * sizeof(double)) != cudaSuccess){ fprintf(stderr, "!!!! device memory alocation error (allocate A)\n"); return EXIT_FAILURE; } if (cudaMallocManaged(&B, width * width * sizeof(double)) != cudaSuccess){ fprintf(stderr, "!!!! device memory alocation error (allocate B)\n"); return EXIT_FAILURE; } if (cudaMallocManaged(&C, width * width * sizeof(double)) != cudaSuccess){ fprintf(stderr, "!!!! device memory alocation error (allocate C)\n"); return EXIT_FAILURE; } /* Generate width * width random numbers between 0 and 1 to fill matrices A and B. */ errcode = vdRngUniform(METHOD, stream, width * width, A, a, b); CheckVslError(errcode); errcode = vdRngUniform(METHOD, stream, width * width, B, a, b); CheckVslError(errcode); /* Now prepare the call to CUBLAS */ status = cublasCreate(&handle); if (status != CUBLAS_STATUS_SUCCESS) { fprintf (stderr, "!!!! CUBLAS initialization error\n"); return EXIT_FAILURE; } gettimeofday(&t3, NULL); /* Perform calculation */ status = cublasDgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T, width, width, width, &alpha, A, width, B, width, &beta, C, width); if (status != CUBLAS_STATUS_SUCCESS){ fprintf(stderr, "!!!! kernel execution error.\n"); return EXIT_FAILURE; } cudaDeviceSynchronize(); gettimeofday(&t4, NULL); const double time = (double) (t4.tv_sec - t3.tv_sec) + 1e-6 * (t4.tv_usec - t3.tv_usec); const double Gflops = 2. 
* width * width * width / (double) time * 10e-9; printf("Call to cublasDGEMM took %lf\n", time); printf("Gflops: %lf\n", Gflops); cudaFree(A); cudaFree(B); cudaFree(C); status = cublasDestroy(handle); if (status != CUBLAS_STATUS_SUCCESS){ fprintf(stderr, "!!!! shutdown error\n"); return EXIT_FAILURE; } return 0; }
// Constructor: stores `did` in did_ and creates the VSL MT19937 stream vStream_ with a
// fixed seed of 1. The status returned by vslNewStream is passed to rand_check.
Random<CPU>::Random (const int did) : did_(did) { rand_check (vslNewStream (&vStream_, VSL_BRNG_MT19937, 1)); }
// Adaptive grid-based Monte Carlo integration over `dimensions` dimensions with OpenMP.
//
// limits  : 2*dimensions doubles, read as (lower, upper) pairs per dimension
// threads : number of OpenMP threads to use; one VSL MT2203 stream is created per thread
// params  : passed through unchanged to evaluate() and normValue()
// returns : a weighted combination of the per-iteration integral estimates from iteration 7 onwards
//
// Each of the 15 iterations samples points by choosing a random bin per dimension and a uniform position
// inside it, accumulates eval/prob and its square, and records per-bin sums in `heights`. One thread then
// turns the sums into the iteration's integral and variance estimate and rebuilds boxLimits from the
// heights before the next iteration. From iteration `switchIteration` on, `samples` is raised to samplesAfter.
//
// `dimensions` is not declared in this function; presumably a file-scope constant.
// NOTE(review): the streams array is released with _mm_free but vslDeleteStream is never called on the
//               individual streams — looks like a leak; confirm.
// NOTE(review): every thread loops i < seg, and `lower` / `upper` are computed but not used, so the total
//               number of points drawn can exceed `samples` when it is not a multiple of `threads` — confirm.
// NOTE(review): `samples` is written inside the parallel region without appearing in a data-sharing clause
//               although default(none) is specified; confirm this compiles and is race-free as intended.
// NOTE(review): the final combination starts at the literal 7 rather than switchIteration, while the
//               chi-square sum runs over all iterations — confirm both are intended.
// NOTE(review): the rescaling uses (value-1)/log(value), which is undefined for value <= 0 and for value == 1.
double integrateVegas(double * limits , int threads, double * params){ //Setting the number of threads omp_set_num_threads(threads); //How many iterations to perform int iterations =15; //Which iteration to start sampling more int switchIteration = 7; //How many points to sample in total int samples = 100000; //How many points to sample after grid set up int samplesAfter = 5000000; //How many intervals for each dimension int intervals = 10; //How many subIntervals int subIntervals = 1000; //Parameter alpha controls convergence rate double alpha = 0.5; int seed = 40847516; //double to store volume integrated over double volume = 1.0; for(int i=0; i<dimensions; i++){ volume*= (limits[(2*i)+1]-limits[2*i]); }; //Number of boxes int numBoxes = intervals; for(int i=1; i<dimensions; i++){ numBoxes *= intervals; } //CHANGE SEED WHEN YOU KNOW IT WORKS //Setting up one random number stream for each thread VSLStreamStatePtr * streams; streams = ( VSLStreamStatePtr * )_mm_malloc(sizeof(VSLStreamStatePtr)*threads,64); for(int i=0; i<threads; i++){ vslNewStream(&streams[i], VSL_BRNG_MT2203+i,seed); } //Arrays to store integral and uncertainty for each iteration double * integral = (double *)_mm_malloc(sizeof(double)*iterations,64); double * sigmas = (double *)_mm_malloc(sizeof(double)*iterations,64); for(int i=0; i<iterations; i++){ integral[i] = 0; sigmas[i] = 0; } //Points per each box int pointsPerBox = samples/numBoxes; //Array storing the box limits (stores x limits then y limits and so on) intervals+1 to store all limits double * boxLimits = (double *)_mm_malloc(sizeof(double)*(intervals+1)*dimensions,64); //Array to store average function values for each box double * heights = (double *)_mm_malloc(sizeof(double)*dimensions*intervals,64); //Array storing values of m double * mValues = (double *)_mm_malloc(sizeof(double)*intervals,64); //Array storing widths of sub boxes double * subWidths = (double *) _mm_malloc(sizeof(double)*intervals,64); //Getting initial limits for 
the boxes for(int i=0; i<dimensions; i++){ double boxWidth = (limits[(2*i)+1]-limits[2*i])/intervals; //0th iteration boxLimits[i*(intervals+1)] = limits[2*i]; for(int j=1; j<=intervals; j++){ int x = (i*(intervals+1))+j; boxLimits[x] = boxLimits[x-1]+boxWidth; } }; //Pointer to store random generated numbers double randomNums[dimensions]__attribute__((aligned(64))); int binNums[dimensions]__attribute__((aligned(64))); //Double to store p(x) denominator for monte carlo double prob; //Values to store integral and sigma for each thread so they can be reduced in OpenMp double integralTemp; double sigmaTemp; double heightsTemp[dimensions*intervals]__attribute__((aligned(64))); int threadNum; #pragma omp parallel default(none) private(sigmaTemp,integralTemp,binNums,randomNums,prob,threadNum,heightsTemp) shared(iterations,subIntervals,alpha,mValues,subWidths,streams,samples,boxLimits,intervals, integral, sigmas, heights, threads, volume, samplesAfter, switchIteration, params) { for(int iter=0; iter<iterations; iter++){ //Stepping up to more samples when grid calibrated if(iter==switchIteration){ samples = samplesAfter; } //Performing iterations for(int i=0; i<dimensions*intervals; i++){ heightsTemp[i] = 0; } integralTemp = 0; sigmaTemp = 0; //Getting chunk sizes for each thread threadNum = omp_get_thread_num(); int seg = ceil((double)samples/threads); int lower = seg*threadNum; int upper = seg*(threadNum+1); if(upper > samples){ upper = samples; }; //Spliting monte carlo up for(int i=0; i<seg; i++){ prob = 1; //Randomly choosing bins to sample from viRngUniform(VSL_RNG_METHOD_UNIFORM_STD,streams[threadNum],dimensions,binNums,0,intervals); vdRngUniform(VSL_RNG_METHOD_UNIFORM_STD,streams[threadNum],dimensions,randomNums,0,1); //Getting samples from bins for(int j=0; j<dimensions; j++){ int x = ((intervals+1)*j)+binNums[j]; randomNums[j] *= (boxLimits[x+1]-boxLimits[x]); randomNums[j] += boxLimits[x]; prob *= 1.0/(intervals*(boxLimits[x+1]-boxLimits[x])); } //Performing 
evaluation of function and adding it to the total integral double eval = evaluate(randomNums,params); integralTemp += eval/prob; sigmaTemp += (eval*eval)/(prob*prob); //Calculating the values of f for bin resising for(int j=0; j<dimensions; j++){ int x = binNums[j]+(j*intervals); //May need to initialize heights // #pragma omp atomic // printf("heightsTemp before=%f\n",heightsTemp[x]); heightsTemp[x] += eval; // printf("heightsTemp=%f x=%d eval=%f thread=%d\n",heightsTemp[x],x,eval,omp_get_thread_num()); } } #pragma omp critical { integral[iter] += integralTemp; sigmas[iter] += sigmaTemp; for(int k=0; k<dimensions*intervals; k++){ // printf("heightTemp[k]=%f k=%d\n",heightsTemp[k],k); heights[k] += heightsTemp[k]; } } #pragma omp barrier #pragma omp single { //Calculating the values of sigma and the integral integral[iter] /= samples; sigmas[iter] /= samples; sigmas[iter] -= (integral[iter]*integral[iter]); sigmas[iter] /= (samples-1); // printf("integral=%f\n",integral[iter]); //Readjusting the box widths based on the heights //Creating array to store values of m and their sum int totalM=0; //Doing for each dimension seperately for(int i=0; i<dimensions; i++){ double sum = 0; //Getting the sum of f*delta x for(int j=0; j<intervals; j++){ int x = (i*(intervals))+j ; //May be bug with these indicies sum += heights[x]*(boxLimits[x+1+i]-boxLimits[x+i]); } //Performing the rescaling for(int j=0; j<intervals; j++){ int x = (i*(intervals))+j; double value = heights[x]*(boxLimits[x+1+i]-boxLimits[x+i]); mValues[j] = ceil(subIntervals*pow((value-1)*(1.0/log(value)),alpha)); subWidths[j] = (boxLimits[x+1+i]-boxLimits[x+i])/mValues[j]; totalM += mValues[j]; } int mPerInterval = totalM/intervals; int mValueIterator = 0; //Adjusting the intervals going from 1 to less than intervals to keep the edges at the limits for(int j=1; j<intervals; j++){ double width = 0; for(int y=0; y<mPerInterval; y++){ width += subWidths[mValueIterator]; mValues[mValueIterator]--; 
if(mValues[mValueIterator]==0){ mValueIterator++; } } //NEED TO SET BOX LIMITS NOW int x = j+(i*(intervals+1)); boxLimits[x] = boxLimits[x-1]+width; } //Setting mvalues etc. (reseting memory allocated before the dimensions loop to 0) totalM = 0; for(int k=0; k<intervals; k++){ subWidths[k] = 0; mValues[k] = 0; } } //Setting heights to zero for next iteration for(int i=0; i<intervals*dimensions; i++ ){ heights[i] = 0; } } } } //All iterations done //Free stuff _mm_free(subWidths); _mm_free(mValues); _mm_free(boxLimits); _mm_free(streams); _mm_free(heights); //Calculating the final value of the integral double denom = 0; double numerator =0; for(int i=7; i<iterations; i++){ numerator += integral[i]*((integral[i]*integral[i])/(sigmas[i]*sigmas[i])); denom += ((integral[i]*integral[i])/(sigmas[i]*sigmas[i])); // printf("integral=%f sigma=%f\n",integral[i],sigmas[i]); } double output = numerator/denom; //Calculating value of x^2 to check if result can be trusted double chisq = 0; for(int i=0; i<iterations; i++){ chisq += (((integral[i]-output)*(integral[i]-output))/(sigmas[i]*sigmas[i])); } if(chisq>iterations){ printf("Chisq value is %f, it should be not much greater than %d (iterations-1) Integral:%f Analytical Value=%f\n",chisq,iterations-1,output,normValue(params)); } _mm_free(integral); _mm_free(sigmas); return output; }
int main() { SetThreads(); PrintInfo(); double Start = omp_get_wtime(); double * restrict ResultPrices; ResultPrices = malloc(sizeof(double) * HISTORY); #pragma offload target(mic) out(ResultPrices:length(HISTORY)) { SetMICThreads(); double * restrict Prices; double * restrict Epsilon; Prices = malloc(sizeof(double) * HISTORY); Epsilon = malloc(sizeof(double) * HISTORY); //Creating random stream VSLStreamStatePtr RndStream; vslNewStream(&RndStream, VSL_BRNG_SFMT19937, (int)time(NULL)); long double Buff; for (unsigned int iter = 0; iter < TE; iter++) { //Randomize volumes vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, RndStream, HISTORY, Epsilon, 0, 0.002); #pragma omp parallel for shared(Prices, ResultPrices) for (unsigned long long int i = 0; i < HISTORY; i++) { //Buff = i * i * powl(10, (-21.65) - i * 4.5 * powl(10, (-10.65)); //Prices[i] = (((i * i * powl(10, (-24.65))) - (i * 4.5 * powl(10, (-13.65))) + 1.095) + Epsilon[i]); Prices[i] = ( ( i * i * powl(10, (-24.65)) - i * 4.5 * powl(10, (-13.65)) + 1.095 ) + Epsilon[i]); ResultPrices[i] += Prices[i]; } } #pragma omp parallel for shared(ResultPrices) for (unsigned long long int j = 0; j < HISTORY; j++) { ResultPrices[j] = ResultPrices[j] / TE;; } free(Prices); free(Epsilon); Prices = NULL; Epsilon = NULL; } double End = omp_get_wtime(); printf("%lf\n", (End - Start)); FILE *FpResultHistory; //unsigned long long int Buff; FpResultHistory = fopen("res_history.txt", "wb"); if (FpResultHistory) { printf("//================================================================\n"); printf("|| Result history file status : open\n"); for (unsigned long long int i = 0; i < HISTORY; i++) { //Buff = (i); fprintf(FpResultHistory, "%llu %lf\n", (i * 10), ResultPrices[i]); //fprintf(fp_result, "%lf %lf %lf\n", ResultPrices[i], ResultVolumeUp[i], ResultVolumeDown[i]); } fclose(FpResultHistory); printf("|| Result history file status : close\n||\n"); printf("\\================================================================\n\n"); } 
free(ResultPrices); ResultPrices = NULL; return 0; }
/*
 * Creates the MT19937 stream `stream` from `seed` and registers
 * deinit_random_bit to run at program exit.
 *
 * Both calls used to be written inside assert(), so a build with NDEBUG
 * defined compiled them away: the stream was never created and the cleanup
 * handler never registered. The calls are now unconditional and only their
 * status codes are asserted.
 */
void init_random_bit (uint32_t seed)
{
  const int stream_status = vslNewStream (&stream, VSL_BRNG_MT19937, (unsigned int) seed);
  assert (0 == stream_status);

  const int atexit_status = atexit (deinit_random_bit);
  assert (0 == atexit_status);

  /* Silence unused-variable warnings when assert() expands to nothing. */
  (void) stream_status;
  (void) atexit_status;
}
// Re-seeds the generator: deletes the current stream vStream_ and creates a new MT19937
// stream seeded with `seed`. Both VSL status codes are passed to rand_check.
void Random<CPU>::set_seed (int seed) { rand_check (vslDeleteStream (&vStream_)); rand_check (vslNewStream (&vStream_, VSL_BRNG_MT19937, seed)); }
// Seeds the generator state __UTIL_sRNG: creates a VSL stream of type __UTIL_sRNG.brng with
// seed `s` and sets __UTIL_sRNG.ind to RNG_BLOCK_SIZE. The vslNewStream status is ignored.
// NOTE(review): presumably ind == RNG_BLOCK_SIZE marks the cached block as used up so that the next
//               draw refills it — confirm against the code that consumes __UTIL_sRNG.
// NOTE(review): a stream already stored in __UTIL_sRNG.stream is overwritten without vslDeleteStream;
//               confirm whether this function may be called more than once.
void mkl_srand(unsigned int s) { vslNewStream(&(__UTIL_sRNG.stream),__UTIL_sRNG.brng,s); __UTIL_sRNG.ind = RNG_BLOCK_SIZE; }