int main()
{
    int n;
    //char fn[256];
    printf("Input dimension (n): ");
    scanf("%d", &n);
    if (n <= 0)
        return -1;
    //printf("Input source file name (or 'func' to use function): ");
    //scanf("%255s", fn);

    double *a  = new double[n*n];
    double *ac = new double[n*n];
    double *ai = new double[n*n];

    printf("Filling...\n");
    /* if (fn[0] == 'f') */
    FillMatrix(a, n);
    /* else ReadMatrix(fn, a, n, n); */

    printf("Saving...\n");
    for (int i = 0; i < n*n; i++)
        ac[i] = a[i];

    printf("Inverting...\n");
    clock_t ts = clock();
    if (!S_Reflect(a, n, ai)) {
        printf("Bad matrix!\n");
        return -2;
    }
    clock_t te = clock();

    // printf("Result:\n"); PrintMatrix(ai, 0, n);
    printf("Calculating error...\n");
    printf("SINGLE-THREADED: E=%1.20lf, T=%1.4lf s.\n",
           CalcError(ac, ai, n), double(te-ts)/double(CLOCKS_PER_SEC));

    delete[] a;    // new[] pairs with delete[]
    delete[] ac;
    delete[] ai;
    return 0;
}
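/*
 * CalcError(ac, ai, n) is defined elsewhere in this program. A minimal sketch
 * of what it plausibly computes -- the max-norm residual of A*Ainv against the
 * identity, assuming row-major n-by-n storage. The name and signature come
 * from the call above; this body is an assumption, not the original code.
 */
#include <cmath>   // fabs

double CalcError(const double *a, const double *ai, int n)
{
    double max_err = 0.0;
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            double s = (i == j) ? -1.0 : 0.0;   // start with -I[i][j]
            for (int k = 0; k < n; k++)
                s += a[i*n + k] * ai[k*n + j];  // accumulate (A * Ainv)[i][j]
            if (fabs(s) > max_err)
                max_err = fabs(s);
        }
    }
    return max_err;
}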
int main()
{
    try {
        int n;
        //char fn[256];
        printf("Input dimension (n): ");
        scanf("%d", &n);
        if (n <= 0)
            throw -10;
        // printf("Input source file name (or 'func' to use function): ");
        // scanf("%255s", fn);

        real *a  = new real[n*n];
        real *ac = new real[n*n];
        real *q  = new real[n*n];
        real *ev = new real[n];    // eigenvalues

        //if (fn[0] == 'f' && fn[1] == 0)
        FillMatrix(a, n);
        //else ReadMatrix(fn, a, n, n);
        memcpy(ac, a, sizeof(real)*n*n);
        // PrintMatrix(a, 0, n);

        printf("Input accuracy: ");
        real DEPbMO;
        scanf("%Lf", &DEPbMO);

        printf("let's begin!..");
        clock_t ts = clock();
        if (!S_Reflect(DEPbMO, a, n, q, ev)) {
            printf("Bad matrix!\n");
            throw -100;
        }
        clock_t te = clock();

        printf("done.\n Eigenvalues of matrix are:\n");
        for (int i = 0; i < n; i++)
            printf("%1.10Lf ", ev[i]);
        printf("\n");
        printf("Elapsed time: %.3Lf\n", real(te-ts)/real(CLOCKS_PER_SEC));
        printf("Error: %1.15Lf\n", CalcError(ac, ev, n));

        delete[] a;    // new[] pairs with delete[]
        delete[] ac;
        delete[] q;
        delete[] ev;   // release the eigenvalue buffer as well
        throw 0;
    }
    catch (int err) {
        switch (err) {
        case 0:
            return 0;
        default:
            printf("Error in main program. err = %d\n", err);
            return err;
        }
    }
}
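/*
 * CalcError(ac, ev, n) here is external as well. Under the assumption that
 * S_Reflect solves a symmetric eigenproblem, one cheap consistency measure is
 * |trace(A) - sum of eigenvalues|, since the two must agree for symmetric
 * matrices. A hedged sketch of that check, not the original implementation:
 */
real CalcError(const real *a, const real *ev, int n)
{
    real trace = 0, sum = 0;
    for (int i = 0; i < n; i++) {
        trace += a[i*n + i];   // diagonal of the saved input matrix
        sum   += ev[i];        // computed eigenvalues
    }
    return (trace > sum) ? (trace - sum) : (sum - trace);   // |trace - sum|
}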
bool ribi::PlaneZ::IsInPlane(const Coordinat3D& coordinat) const noexcept
{
    try {
        const double error = CalcError(coordinat);
        const double max_error = CalcMaxError(coordinat);
        return error <= max_error;
    }
    catch (std::exception& e) {
        // TRACE("ERROR");
        // TRACE(e.what());
        assert(!"Should not get here");
        throw;
    }
}
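/*
 * CalcError() and CalcMaxError() are other members of PlaneZ and are not shown
 * here. As a hedged illustration only: for a plane written as z = a*x + b*y + c,
 * the residual being compared would look like the free function below. All
 * parameters are hypothetical; the real member takes a Coordinat3D.
 */
double PlaneZResidualSketch(double x, double y, double z,
                            double a, double b, double c)
{
    const double r = z - (a*x + b*y + c);   // signed offset along z from the plane
    return r < 0.0 ? -r : r;                // |residual|, checked against CalcMaxError
}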
void main()
{
    // CALL THE INITIALIZING FUNCTIONS
    initport();
    initpwm();

    while (1) {
        indicator();
        CalcError();

        if ((error == 0) && (s4 + s5 == 2)) {
            T1CON.TMR1ON = 0;
            motor_LF();                  // FWD AT FULL SPEED
            motor_RF();
            PWM1_CHANGE_DUTY(255);
            PWM2_CHANGE_DUTY(255);
            delay_ms(10);
        }

        if ((s1+s2+s3+s4+s5+s6+s7+s8) == 0)  // ROBOT HAS OVERSHOT THE LINE
        {
            T1CON.TMR1ON = 0;
            if (lastreading == 'r')          // CHECKS IF THE LAST SENSOR ACTIVATED WAS RIGHT
            {
                T1CON.TMR1ON = 0;
                motor_RB();                  // TURN RIGHT AT FULL SPEED
                motor_LF();
                PWM1_CHANGE_DUTY(255);
                PWM2_CHANGE_DUTY(255);
                delay_ms(10);
                //error = 0;
            }
            else if (lastreading == 'l')     // CHECKS IF THE LAST SENSOR ACTIVATED WAS LEFT
            {
                T1CON.TMR1ON = 0;
                motor_LB();                  // TURN LEFT AT FULL SPEED
                motor_RF();
                PWM1_CHANGE_DUTY(255);
                PWM2_CHANGE_DUTY(255);
                delay_ms(10);
                //error = 0;
            }
        }

        if (counter > 200) {
            T1CON.TMR1ON = 0;
            PORTC.F7 = 0;
            PORTC.F6 = 0;
            PORTC.F5 = 0;
            PORTC.F4 = 0;
            while (1);                       // halt
        }

        if ((s1+s2+s3+s4+s5+s6+s7) == 7 ||
            (s2+s3+s4+s5+s6+s7+s8) == 7 ||
            (s1+s2+s3+s4+s5+s6+s7+s8) == 8)  // TO STOP THE MOTOR AT THE END OF LINE
        {
            T1CON.TMR1ON = 1;                // enable timer1
            // delay_ms(3);
            // if ((s1+s2+s3+s4+s5+s6+s7+s8) == 0)
            /*
            {
                PORTC.F7 = 0;
                PORTC.F6 = 0;
                PORTC.F5 = 0;
                PORTC.F4 = 0;
            }
            */
        }
        else                                 // ROBOT ON THE LINE: PID CORRECTION
        {
            T1CON.TMR1ON = 0;
            PROPORTIONAL = error * kp;
            INTEGRAL += error;
            INTEGRAL *= ki;
            DERIVATIVE = (error - perror);
            correction = PROPORTIONAL + INTEGRAL + (DERIVATIVE * kd);

            rightpulse = basespeed + (correction / 2);
            leftpulse  = basespeed - (correction / 2);

            motor_RF();
            motor_LF();

            if (leftpulse > 255)             // LEFT CORRECTION EXCEEDS DUTY RANGE
                leftpulse = 255;
            if (rightpulse > 255)            // RIGHT CORRECTION EXCEEDS DUTY RANGE
                rightpulse = 255;
            if (leftpulse < 0)
                leftpulse = 0;
            if (rightpulse < 0)
                rightpulse = 0;

            PWM1_CHANGE_DUTY(rightpulse);
            PWM2_CHANGE_DUTY(leftpulse);
        }

        delay_ms(10);
    }
}
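/*
 * CalcError() is defined elsewhere; a minimal sketch of the usual approach for
 * an 8-sensor line follower -- a signed, weighted sum of the active sensors.
 * The globals (s1..s8, error, perror, lastreading) are taken from the code
 * above; the weights are an assumption, not the original tuning.
 */
void CalcError()
{
    perror = error;                                // remember previous error for the D term
    error  = (-7)*s1 + (-5)*s2 + (-3)*s3 + (-1)*s4
           +   1*s5  +   3*s6  +   5*s7  +   7*s8; // negative = line toward the left
    if (s1 || s2 || s3)
        lastreading = 'l';                         // line last seen under the left sensors
    else if (s6 || s7 || s8)
        lastreading = 'r';                         // line last seen under the right sensors
}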
////////////////////////////////////////////////////////////////////////////////
// Program Main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int Nx, Ny, Nz, max_iters;
    int blockX, blockY, blockZ;

    if (argc == 8) {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    }
    else {
        printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
        exit(1);
    }

    // Get the number of GPUs
    int number_of_devices;
    checkCuda(cudaGetDeviceCount(&number_of_devices));

    if (number_of_devices < 2) {
        printf("Less than two devices were found.\n");
        printf("Exiting...\n");
        return -1;
    }

    // Decompose along the Z-axis
    int _Nz = Nz/number_of_devices;

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Check if ECC is turned on
    ECCCheck(number_of_devices);

    // Set the number of OpenMP threads (one per device)
    omp_set_num_threads(number_of_devices);

    #pragma omp parallel
    {
        unsigned int nthreads = omp_get_num_threads();

        #pragma omp single
        {
            printf("Number of OpenMP threads: %d\n", nthreads);
        }
    }

    // CPU memory operations
    int dt_size = sizeof(_DOUBLE_);

    _DOUBLE_ *u_new, *u_old;

    u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate and generate arrays on the host
    size_t pitch_bytes;
    size_t pitch_gc_bytes;

    _DOUBLE_ *h_Unew, *h_Uold;
    _DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices];
    _DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices];
    _DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices];

    h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    init(h_Uold, h_Unew, h, Nx, Ny, Nz);

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        // Pinned host allocations, so the 2D copies below can use them directly
        checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
        checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

        init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // GPU memory operations
    _DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices];
    _DOUBLE_ *d_right_send_buffer[number_of_devices], *d_left_send_buffer[number_of_devices];
    _DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices];

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        CopyToConstantMemory(c0, c1);

        checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
        checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
        checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    }

    // Copy data from host to the device
    double HtD_timer = 0.;
    HtD_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
        checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    }

    HtD_timer += omp_get_wtime();

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    double compute_timer = 0.;
    compute_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        for (int iterations = 0; iterations < max_iters; iterations++) {
            // Compute inner nodes
            checkCuda(cudaSetDevice(tid));
            ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz);

            // Copy right boundary data to host
            if (tid == 0) {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
                checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }
            // Copy left boundary data to host
            if (tid == 1) {
                checkCuda(cudaSetDevice(tid));
                CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
                checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault));
            }

            #pragma omp barrier

            // Copy right boundary data to device 1
            if (tid == 1) {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1);
            }
            // Copy left boundary data to device 0
            if (tid == 0) {
                checkCuda(cudaSetDevice(tid));
                checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault));
                CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0);
            }

            // Swap pointers on the host
            #pragma omp barrier
            checkCuda(cudaSetDevice(tid));
            checkCuda(cudaDeviceSynchronize());
            swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]);
        }
    }

    compute_timer += omp_get_wtime();

    // Copy data from device to host
    double DtH_timer = 0;
    DtH_timer -= omp_get_wtime();

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost));
    }

    DtH_timer += omp_get_wtime();

    // Merge sub-domains into one big domain
    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();
        merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid);
    }

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
#endif

    float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
    PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

    _DOUBLE_ t = max_iters * dt;
    CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);

#if defined(DEBUG) || defined(_DEBUG)
    //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz);
#endif

    #pragma omp parallel
    {
        unsigned int tid = omp_get_thread_num();

        checkCuda(cudaSetDevice(tid));
        checkCuda(cudaFree(d_s_Unews[tid]));
        checkCuda(cudaFree(d_s_Uolds[tid]));
        checkCuda(cudaFree(d_right_send_buffer[tid]));
        checkCuda(cudaFree(d_left_send_buffer[tid]));
        checkCuda(cudaFree(d_right_receive_buffer[tid]));
        checkCuda(cudaFree(d_left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(h_s_Unews[tid]));
        checkCuda(cudaFreeHost(h_s_Uolds[tid]));
        checkCuda(cudaFreeHost(left_send_buffer[tid]));
        checkCuda(cudaFreeHost(right_send_buffer[tid]));
        checkCuda(cudaFreeHost(left_receive_buffer[tid]));
        checkCuda(cudaFreeHost(right_receive_buffer[tid]));
        checkCuda(cudaDeviceReset());
    }

    free(h_Uold);
    free(h_Unew);
    free(u_old);
    free(u_new);

    return 0;
}
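/*
 * swap(type, a, b), used at the end of each iteration above (and in the MPI
 * driver below), is a macro from elsewhere in this codebase. A minimal sketch
 * of the conventional definition, inferred from the (type, a, b) call sites;
 * the body is an assumption:
 */
#define swap(type, a, b)  do { type tmp = (a); (a) = (b); (b) = tmp; } while (0)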
void cScenarioArmEval::UpdateTrackError()
{
    double err = CalcError();
    mAvgErr = cMathUtil::AddAverage(mAvgErr, mErrSampleCount, err, 1);
    ++mErrSampleCount;
}
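/*
 * cMathUtil::AddAverage() is defined elsewhere; from the call above it appears
 * to fold one more sample into a running mean. A hedged standalone sketch of
 * that weighted incremental mean (not the library's actual code):
 */
static double AddAverageSketch(double avg, int count, double val, int val_count)
{
    // combine an average over `count` samples with `val_count` new samples
    return (avg * count + val * val_count) / (count + val_count);
}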
void REMEZ_CreateFilter(double h[], int numtaps, int numband, double bands[],
                        const double des[], const double weight[], int type)
{
    double *Grid, *W, *D, *E;
    int i, iter, gridsize, r, *Ext;
    double *taps, c;
    double *x, *y, *ad;
    int symmetry;

    if (type == REMEZ_BANDPASS)
        symmetry = POSITIVE;
    else
        symmetry = NEGATIVE;

    r = numtaps / 2;                   /* number of extrema */
    if ((numtaps % 2) && (symmetry == POSITIVE))
        r++;

    /* Predict dense grid size in advance for memory allocation;
     * .5 is so we round up, not truncate */
    gridsize = 0;
    for (i = 0; i < numband; i++) {
        gridsize += (int) (2 * r * GRIDDENSITY * (bands[2 * i + 1] - bands[2 * i]) + .5);
    }
    if (symmetry == NEGATIVE) {
        gridsize--;
    }

    /* Dynamically allocate memory for arrays with proper sizes */
    Grid = (double *) Util_malloc(gridsize * sizeof(double));
    D = (double *) Util_malloc(gridsize * sizeof(double));
    W = (double *) Util_malloc(gridsize * sizeof(double));
    E = (double *) Util_malloc(gridsize * sizeof(double));
    Ext = (int *) Util_malloc((r + 1) * sizeof(int));
    taps = (double *) Util_malloc((r + 1) * sizeof(double));
    x = (double *) Util_malloc((r + 1) * sizeof(double));
    y = (double *) Util_malloc((r + 1) * sizeof(double));
    ad = (double *) Util_malloc((r + 1) * sizeof(double));

    /* Create dense frequency grid */
    CreateDenseGrid(r, numtaps, numband, bands, des, weight, &gridsize, Grid, D, W, symmetry);
    InitialGuess(r, Ext, gridsize);

    /* For Differentiator: (fix grid) */
    if (type == REMEZ_DIFFERENTIATOR) {
        for (i = 0; i < gridsize; i++) {
            /* D[i] = D[i] * Grid[i]; */
            if (D[i] > 0.0001)
                W[i] = W[i] / Grid[i];
        }
    }

    /* For odd or Negative symmetry filters, alter the
     * D[] and W[] according to Parks-McClellan */
    if (symmetry == POSITIVE) {
        if (numtaps % 2 == 0) {
            for (i = 0; i < gridsize; i++) {
                c = cos(Pi * Grid[i]);
                D[i] /= c;
                W[i] *= c;
            }
        }
    }
    else {
        if (numtaps % 2) {
            for (i = 0; i < gridsize; i++) {
                c = sin(Pi2 * Grid[i]);
                D[i] /= c;
                W[i] *= c;
            }
        }
        else {
            for (i = 0; i < gridsize; i++) {
                c = sin(Pi * Grid[i]);
                D[i] /= c;
                W[i] *= c;
            }
        }
    }

    /* Perform the Remez Exchange algorithm */
    for (iter = 0; iter < MAXITERATIONS; iter++) {
        CalcParms(r, Ext, Grid, D, W, ad, x, y);
        CalcError(r, ad, x, y, gridsize, Grid, D, W, E);
        Search(r, Ext, gridsize, E);
        if (isDone(r, Ext, E))
            break;
    }
#ifndef ASAP
    if (iter == MAXITERATIONS) {
        Log_print("remez(): reached maximum iteration count. Results may be bad.");
    }
#endif

    CalcParms(r, Ext, Grid, D, W, ad, x, y);

    /* Find the 'taps' of the filter for use with Frequency
     * Sampling. If odd or Negative symmetry, fix the taps
     * according to Parks-McClellan */
    for (i = 0; i <= numtaps / 2; i++) {
        if (symmetry == POSITIVE) {
            if (numtaps % 2)
                c = 1;
            else
                c = cos(Pi * (double) i / numtaps);
        }
        else {
            if (numtaps % 2)
                c = sin(Pi2 * (double) i / numtaps);
            else
                c = sin(Pi * (double) i / numtaps);
        }
        taps[i] = ComputeA((double) i / numtaps, r, ad, x, y) * c;
    }

    /* Frequency sampling design with calculated taps */
    FreqSample(numtaps, taps, h, symmetry);

    /* Delete allocated memory */
    free(Grid);
    free(W);
    free(D);
    free(E);
    free(Ext);
    free(taps);
    free(x);
    free(y);
    free(ad);
}
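/*
 * A hedged usage sketch: a 31-tap lowpass with passband 0..0.1 and stopband
 * 0.15..0.5. This assumes band edges are normalized frequencies in [0, 0.5]
 * and that des[]/weight[] carry one entry per band, as in the Janovetz
 * implementation this routine resembles -- verify against CreateDenseGrid
 * before relying on it.
 */
void example_lowpass(void)
{
    double h[31];
    double bands[4]  = { 0.0, 0.1, 0.15, 0.5 };   /* two bands: pass, stop */
    double des[2]    = { 1.0, 0.0 };              /* desired gain per band */
    double weight[2] = { 1.0, 10.0 };             /* weight the stopband harder */

    REMEZ_CreateFilter(h, 31, 2, bands, des, weight, REMEZ_BANDPASS);
}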
void NeuralNet::Process()
{
    FeedForward();
    CalcError();
    BackPropogate();
}
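/*
 * Process() runs one training step: forward pass, error computation, then
 * backpropagation. CalcError() presumably compares the output layer against
 * the training targets; a standalone sketch of the conventional squared-error
 * measure follows (the vector arguments are hypothetical, not class members):
 */
#include <vector>

static double SquaredErrorSketch(const std::vector<double>& outputs,
                                 const std::vector<double>& targets)
{
    double err = 0.0;
    for (std::vector<double>::size_type i = 0; i < outputs.size(); ++i) {
        const double d = targets[i] - outputs[i];   // per-output residual
        err += 0.5 * d * d;                         // accumulate squared error
    }
    return err;
}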
////////////////////////////////////////////////////////////////
// Called by CSoundOut worker thread to get new samples from queue.
// This routine is called from a worker thread so must be careful.
// STEREO version
////////////////////////////////////////////////////////////////
void CSoundOut::GetOutQueue(int numsamples, TYPESTEREO16* pData)
{
    int i;
    bool underflow = false;

    m_Mutex.lock();
    if (m_Startup) {
        // if no data in queue yet, just stuff in silence until something is put in queue
        for (i = 0; i < numsamples; i++) {
            pData[i].re = 0;
            pData[i].im = 0;
        }
        if (m_OutQLevel > OUTQSIZE/2) {
            m_Startup = false;
            m_RateUpdateCount = -5*SOUNDCARD_RATE;  // delay first error update to let settle
            m_PpmError = 0;
            m_AveOutQLevel = m_OutQLevel;
            m_UpdateToggle = true;
        }
        else {
            m_Mutex.unlock();
            return;
        }
    }

    for (i = 0; i < numsamples; i++) {
        if (m_OutQHead != m_OutQTail) {
            pData[i] = m_OutQueueStereo[m_OutQTail++];
            m_OutQTail &= (OUTQSIZE-1);
            m_OutQLevel--;
        }
        else {  // queue went empty: back up queue ptr and reuse previous data in queue
            m_OutQTail -= (OUTQSIZE/4);
            m_OutQTail &= (OUTQSIZE-1);
            pData[i] = m_OutQueueStereo[m_OutQTail];
            m_OutQLevel += (OUTQSIZE/4);
            underflow = true;
        }
    }

    if (m_BlockingMode) {
        // if in blocking mode just return
        m_Mutex.unlock();
        return;
    }

    // calculate average queue fill level
    m_AveOutQLevel = (1.0 - FILTERQLEVEL_ALPHA)*m_AveOutQLevel + FILTERQLEVEL_ALPHA*m_OutQLevel;

    if (underflow) {
        qDebug() << "Snd Underflow";
        m_AveOutQLevel = m_OutQLevel;
    }

    // see if it is time to update the rate error calculation routine
    m_RateUpdateCount += numsamples;
    if (m_RateUpdateCount >= SOUNDCARD_RATE) {  // every second
        CalcError();
        m_RateUpdateCount = 0;
    }
    m_Mutex.unlock();
}
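/*
 * CalcError() (invoked once per second above) lives elsewhere in CSoundOut;
 * the idea is to turn queue-level drift into a soundcard clock-rate error. A
 * standalone sketch under that assumption -- the names and scaling here are
 * illustrative, not the original tuning:
 */
static double QueueDriftToPpmSketch(double aveQLevel, double targetLevel, double sampleRate)
{
    // samples of drift accumulated over one second, expressed in parts per million
    return (aveQLevel - targetLevel) * 1.0e6 / sampleRate;
}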
///////////////////////
// Main program entry
///////////////////////
int main(int argc, char** argv)
{
    unsigned int max_iters, Nx, Ny, Nz, blockX, blockY, blockZ;
    int rank, numberOfProcesses;

    if (argc == 8) {
        Nx = atoi(argv[1]);
        Ny = atoi(argv[2]);
        Nz = atoi(argv[3]);
        max_iters = atoi(argv[4]);
        blockX = atoi(argv[5]);
        blockY = atoi(argv[6]);
        blockZ = atoi(argv[7]);
    }
    else {
        printf("Usage: %s nx ny nz i block_x block_y block_z\n", argv[0]);
        exit(1);
    }

    InitializeMPI(&argc, &argv, &rank, &numberOfProcesses);
    AssignDevices(rank);
    ECCCheck(rank);

    // Define constants
    const _DOUBLE_ L = 1.0;
    const _DOUBLE_ h = L/(Nx+1);
    const _DOUBLE_ dt = h*h/6.0;
    const _DOUBLE_ beta = dt/(h*h);
    const _DOUBLE_ c0 = beta;
    const _DOUBLE_ c1 = (1-6*beta);

    // Copy constants to Constant Memory on the GPUs
    CopyToConstantMemory(c0, c1);

    // Decompose along the z-axis
    const int _Nz = Nz/numberOfProcesses;
    const int dt_size = sizeof(_DOUBLE_);

    // Host memory allocations
    _DOUBLE_ *u_new, *u_old;
    _DOUBLE_ *h_Uold;

    u_new = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    u_old = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));

    if (rank == 0) {
        h_Uold = (_DOUBLE_*)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2));
    }

    init(u_old, u_new, h, Nx, Ny, Nz);

    // Allocate host subdomains (pinned, so they can feed the 2D copies below)
    _DOUBLE_ *h_s_Uolds, *h_s_Unews, *h_s_rbuf[numberOfProcesses];
    _DOUBLE_ *left_send_buffer, *left_receive_buffer;
    _DOUBLE_ *right_send_buffer, *right_receive_buffer;

    if (rank == 0) {
        // rank 0 receives one subdomain per process during the final gather
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaHostAlloc((void**)&h_s_rbuf[i], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
        }
    }

    checkCuda(cudaHostAlloc((void**)&h_s_Unews, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&h_s_Uolds, dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_send_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&right_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));
    checkCuda(cudaHostAlloc((void**)&left_receive_buffer, dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable));

    init_subdomain(h_s_Uolds, u_old, Nx, Ny, _Nz, rank);

    // GPU stream operations
    cudaStream_t compute_stream;
    cudaStream_t data_stream;

    checkCuda(cudaStreamCreate(&compute_stream));
    checkCuda(cudaStreamCreate(&data_stream));

    // GPU Memory Operations
    size_t pitch_bytes, pitch_gc_bytes;

    _DOUBLE_ *d_s_Unews, *d_s_Uolds;
    _DOUBLE_ *d_right_send_buffer, *d_left_send_buffer;
    _DOUBLE_ *d_right_receive_buffer, *d_left_receive_buffer;

    checkCuda(cudaMallocPitch((void**)&d_s_Uolds, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_s_Unews, &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2)));
    checkCuda(cudaMallocPitch((void**)&d_left_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_send_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));
    checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer, &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH)));

    // Copy subdomains from host to device and get walltime
    double HtD_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(d_s_Uolds, pitch_bytes, h_s_Uolds, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));
    checkCuda(cudaMemcpy2D(d_s_Unews, pitch_bytes, h_s_Unews, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    HtD_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    unsigned int ghost_width = 1;

    int pitch = pitch_bytes/dt_size;
    int gc_pitch = pitch_gc_bytes/dt_size;

    // GPU kernel launch parameters
    dim3 threads_per_block(blockX, blockY, blockZ);
    unsigned int blocksInX = getBlock(Nx, blockX);
    unsigned int blocksInY = getBlock(Ny, blockY);
    unsigned int blocksInZ = getBlock(_Nz-2, k_loop);
    dim3 thread_blocks(blocksInX, blocksInY, blocksInZ);
    dim3 thread_blocks_halo(blocksInX, blocksInY);

    //MPI_Status status;
    MPI_Status status[numberOfProcesses];
    MPI_Request gather_send_request[numberOfProcesses];
    MPI_Request right_send_request[numberOfProcesses], left_send_request[numberOfProcesses];
    MPI_Request right_receive_request[numberOfProcesses], left_receive_request[numberOfProcesses];

    double compute_timer = 0.;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    for (unsigned int iterations = 0; iterations < max_iters; iterations++) {
        // Compute right boundary data on device 0
        if (rank == 0) {
            int kstart = (_Nz+1)-ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);

            checkCuda(cudaMemcpy2DAsync(right_send_buffer, dt_size*(Nx+2), d_right_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &right_send_request[rank]));
        }
        // Compute left boundary data on device 1
        else {
            int kstart = 1;
            int kstop = 1+ghost_width;

            ComputeInnerPointsAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
            CopyBoundaryRegionToGhostCellAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_send_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);

            checkCuda(cudaMemcpy2DAsync(left_send_buffer, dt_size*(Nx+2), d_left_send_buffer, pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault, data_stream));
            checkCuda(cudaStreamSynchronize(data_stream));

            MPI_CHECK(MPI_Isend(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &left_send_request[rank]));
        }

        // Compute inner nodes for device 0
        if (rank == 0) {
            int kstart = 1;
            int kstop = (_Nz+1)-ghost_width;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }
        // Compute inner nodes for device 1
        else {
            int kstart = 1+ghost_width;
            int kstop = _Nz+1;

            ComputeInnerPointsAsync(thread_blocks, threads_per_block, compute_stream, d_s_Unews, d_s_Uolds, pitch, Nx, Ny, _Nz, kstart, kstop);
        }

        // Receive halo data from the neighboring device
        if (rank == 0) {
            MPI_CHECK(MPI_Irecv(left_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 1, 1, MPI_COMM_WORLD, &right_receive_request[rank]));
        }
        else {
            MPI_CHECK(MPI_Irecv(right_send_buffer, (Nx+2)*(Ny+2)*(_GC_DEPTH), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &left_receive_request[rank]));
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_right_receive_buffer, pitch_gc_bytes, left_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_right_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 0);
        }
        else {
            MPI_CHECK(MPI_Wait(&left_receive_request[rank], &status[rank]));

            checkCuda(cudaMemcpy2DAsync(d_left_receive_buffer, pitch_gc_bytes, right_send_buffer, dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault, data_stream));
            CopyGhostCellToBoundaryRegionAsync(thread_blocks_halo, threads_per_block, data_stream, d_s_Unews, d_left_receive_buffer, Nx, Ny, _Nz, pitch, gc_pitch, 1);
        }

        if (rank == 0) {
            MPI_CHECK(MPI_Wait(&right_send_request[rank], MPI_STATUS_IGNORE));
        }
        else {
            MPI_CHECK(MPI_Wait(&left_send_request[rank], MPI_STATUS_IGNORE));
        }

        // Swap pointers on the host
        checkCuda(cudaDeviceSynchronize());
        swap(_DOUBLE_*, d_s_Unews, d_s_Uolds);
    }

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    compute_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Copy data from device to host
    double DtH_timer = 0;

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer -= MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    checkCuda(cudaMemcpy2D(h_s_Uolds, dt_size*(Nx+2), d_s_Uolds, pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDefault));

    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));
    DtH_timer += MPI_Wtime();
    MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD));

    // Gather results from subdomains
    MPI_CHECK(MPI_Isend(h_s_Uolds, (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &gather_send_request[rank]));

    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            MPI_CHECK(MPI_Recv(h_s_rbuf[i], (Nx+2)*(Ny+2)*(_Nz+2), MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &status[rank]));
            merge_domains(h_s_rbuf[i], h_Uold, Nx, Ny, _Nz, i);
        }
    }

    // Complete the gather send before its buffer is freed
    MPI_CHECK(MPI_Wait(&gather_send_request[rank], MPI_STATUS_IGNORE));

    // Calculate on host
#if defined(DEBUG) || defined(_DEBUG)
    if (rank == 0) {
        cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz);
    }
#endif

    if (rank == 0) {
        float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz);
        PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx);

        _DOUBLE_ t = max_iters * dt;
        CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz);
    }

    Finalize();

    // Free device memory
    checkCuda(cudaFree(d_s_Unews));
    checkCuda(cudaFree(d_s_Uolds));
    checkCuda(cudaFree(d_right_send_buffer));
    checkCuda(cudaFree(d_left_send_buffer));
    checkCuda(cudaFree(d_right_receive_buffer));
    checkCuda(cudaFree(d_left_receive_buffer));

    // Free host memory
    checkCuda(cudaFreeHost(h_s_Unews));
    checkCuda(cudaFreeHost(h_s_Uolds));

    if (rank == 0) {
        for (int i = 0; i < numberOfProcesses; i++) {
            checkCuda(cudaFreeHost(h_s_rbuf[i]));
        }
        free(h_Uold);
    }

    checkCuda(cudaFreeHost(left_send_buffer));
    checkCuda(cudaFreeHost(left_receive_buffer));
    checkCuda(cudaFreeHost(right_send_buffer));
    checkCuda(cudaFreeHost(right_receive_buffer));

    checkCuda(cudaDeviceReset());

    free(u_old);
    free(u_new);

    return 0;
}
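/*
 * A hedged run sketch: this driver hardcodes ranks 0 and 1 as halo-exchange
 * partners, so it expects exactly two MPI processes (one GPU each). The
 * arguments follow the usage string above; the binary name and the specific
 * numbers are placeholders, not values from the source.
 *
 *   mpirun -np 2 ./heat3d_mpi 254 254 254 1000 16 16 4
 */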