/*
 * Estimate the GFLOP/s of a device.
 *
 * di:        device description to estimate for
 * useDouble: forwarded to the vendor specific estimators
 *
 * Non-GPU devices have no estimate; a message is printed and 1.0 is
 * returned. GPUs are dispatched to the Nvidia or AMD estimator; any
 * other GPU vendor is reported and given a fixed 100.0.
 */
cl_double mwDeviceEstimateGFLOPs(const DevInfo* di, cl_bool useDouble)
{
    if (di->devType != CL_DEVICE_TYPE_GPU)
    {
        mw_printf("Missing flops estimate for device type %s\n", showCLDeviceType(di->devType));
        return 1.0;
    }

    if (mwIsNvidiaGPUDevice(di))
    {
        return mwCUDAEstimateGFLOPs(di, useDouble);
    }

    if (mwIsAMDGPUDevice(di))
    {
        return mwAMDEstimateGFLOPs(di, useDouble);
    }

    /* A GPU, but not from a vendor we know how to estimate */
    mw_printf("Unhandled GPU vendor '%s' (0x%x)\n", di->vendor, di->vendorID);
    return 100.0;
}
/*
 * Look up the device requested by clr, fetch its device information
 * into ci->di, and then create the context and command queue.
 *
 * The device information is printed only when mwIsFirstRun() is true:
 * the long form if clr->verbose is set, the short form otherwise.
 *
 * Returns CL_SUCCESS or the error from the first step that failed.
 */
cl_int mwSetupCL(CLInfo* ci, const CLRequest* clr)
{
    cl_int status = mwGetCLInfo(ci, clr);

    if (status != CL_SUCCESS)
    {
        mw_printf("Failed to get information about device\n");
        return status;
    }

    status = mwGetDevInfo(&ci->di, ci->dev);
    if (status != CL_SUCCESS)
    {
        mw_printf("Failed to get device info\n");
        return status;
    }

    if (mwIsFirstRun())
    {
        if (!clr->verbose)
        {
            mwPrintDevInfoShort(&ci->di);
        }
        else
        {
            mwPrintDevInfo(&ci->di);
        }
    }

    return mwCreateCtxQueue(ci, CL_FALSE, clr->enableProfiling);
}
/*
 * Record that tree incest happened and warn about it once.
 *
 * The st->treeIncest flag is set on the first call, so later calls
 * return without printing. Nothing is printed when ctx->quietErrors
 * is set. The wording depends on ctx->allowIncest.
 */
void nbReportTreeIncest(const NBodyCtx* ctx, NBodyState* st)
{
    double percentDone;

    if (st->treeIncest)
    {
        return; /* already reported, don't repeat the warning */
    }

    st->treeIncest = TRUE;

    if (ctx->quietErrors)
    {
        return; /* avoid massive printing from tests which cause incest */
    }

    percentDone = 100.0 * (real) st->step / (real) ctx->nStep;

    if (!ctx->allowIncest)
    {
        mw_printf("tree-incest detected (fatal) at step %u / %u (%f%%)\n",
                  st->step, ctx->nStep, percentDone);
    }
    else
    {
        mw_printf("[tree-incest detected at step %u / %u (%f%%)]\n",
                  st->step, ctx->nStep, percentDone);
    }
}
/*
 * Load the parameters for a run and apply any forwarded arguments.
 *
 * setupSeparation() is tried first; if it returns NULL the older
 * readParameters() reader is tried on the same sf->ap_file.
 *
 * Returns the integral areas, or NULL on failure. When
 * setParameters() fails, the integral areas and the streams are freed
 * before returning NULL.
 */
static IntegralArea* prepareParameters(const SeparationFlags* sf,
                                       AstronomyParameters* ap,
                                       BackgroundParameters* bgp,
                                       Streams* streams)
{
    IntegralArea* ias = setupSeparation(ap, bgp, streams, sf);

    if (ias == NULL)
    {
        /* The new style file didn't work, fall back to the old one */
        mw_printf("Error reading astronomy parameters from file '%s'\n"
                  " Trying old parameters file\n",
                  sf->ap_file);

        ias = readParameters(sf->ap_file, ap, bgp, streams);
        if (ias == NULL)
        {
            mw_printf("Failed to read parameters file\n");
            return NULL;
        }
    }

    if (!sf->numArgs)
    {
        return ias; /* nothing to override */
    }

    if (setParameters(ap, bgp, streams, sf->numArgs, sf->nForwardedArgs))
    {
        mwFreeA(ias);
        freeStreams(streams);
        return NULL;
    }

    return ias;
}
int equalBody(const Body* a, const Body* b) { if (Mass(a) != Mass(b)) { mw_printf("mass differ\n"); return FALSE; } if (Type(a) != Type(b)) { mw_printf("type ndiffer\n"); return FALSE; } if (!equalVector(&Pos(a), &Pos(b))) { mw_printf("pos differ\n"); return FALSE; } if (!equalVector(&Vel(a), &Vel(b))) { mw_printf("VElocity differ\n"); return FALSE; } return TRUE; }
/*
 * Try to claim the shared scene for this process.
 *
 * A compare-and-swap tries to replace 0 in scene->attachedLock with
 * our PID; the value it hands back is treated as the PID of the
 * current owner (0 meaning there was none).
 *
 * Returns 0 if access was obtained, 1 if a live process owns it.
 */
static int nbglGetExclusiveSceneAccess(scene_t* scene)
{
    const int myPID = (int) getpid();
    const int ownerPID = OPA_cas_int(&scene->attachedLock, 0, myPID);

    if (ownerPID == 0)
    {
        /* Nobody held it */
        OPA_store_int(&scene->attachedPID, 0);
        return 0;
    }

    if (!mwProcessIsAlive(ownerPID))
    {
        mw_printf("Simulation shared segment owned by dead process %d, stealing it\n", ownerPID);

        /* Process is dead, steal the lock */
        nbglReleaseSceneLocks(scene);
        return 0;
    }

    mw_printf("Could not get exclusive access to simulation shared segment "
              "(Owned by process %d)\n",
              ownerPID);
    return 1;
}
static void nbPrintVersion(int boincTag, int verbose) { char versionStr[2048]; snprintf(versionStr, sizeof(versionStr), "%s %u.%02u %s %s %s %s %s, %s", NBODY_PROJECT_NAME, NBODY_VERSION_MAJOR, NBODY_VERSION_MINOR, MILKYWAY_SYSTEM_NAME, ARCH_STRING, PRECSTRING, DENORMAL_STRING, NBODY_EXTRAVER, NBODY_EXTRALIB); if (boincTag) { mw_printf("<search_application> %s </search_application>\n", versionStr); } else { mw_printf("%s %s\n", versionStr, BOINC_APPLICATION ? "BOINC" : ""); } if (verbose) { mw_printf("Commit %s\n", MILKYWAY_GIT_COMMIT_ID); } }
/*
 * Estimate the GFLOP/s of an AMD GPU from its compute unit count,
 * ALUs per compute unit and clock frequency.
 *
 * di:        device description (maxCompUnits, aluPerCU, clockFreq,
 *            doubleFrac are read)
 * useDouble: select the double precision estimate instead of single
 *
 * Returns the estimate rounded down to a whole GFLOP/s, never less
 * than 100.0.
 */
cl_double mwAMDEstimateGFLOPs(const DevInfo* di, cl_bool useDouble)
{
    cl_ulong flops, flopsFloat, flopsDouble;
    cl_double gflops;

    /* Counts 2 operations per ALU per cycle; clockFreq is scaled by 10^6 */
    flopsFloat = 2 * (di->maxCompUnits * di->aluPerCU) * (cl_ulong) di->clockFreq * 1000000;

    /* An unknown device may leave doubleFrac as 0; don't divide by it.
     * A 0 estimate is then replaced by the default below. */
    flopsDouble = (di->doubleFrac != 0) ? flopsFloat / di->doubleFrac : 0;

    mw_printf("Estimated AMD GPU GFLOP/s: %.0f SP GFLOP/s, %.0f DP GFLOP/s\n",
              1.0e-9 * (cl_double) flopsFloat,
              1.0e-9 * (cl_double) flopsDouble);

    flops = useDouble ? flopsDouble : flopsFloat;
    gflops = floor(1.0e-9 * (cl_double) flops);

    /* At different times the AMD drivers have reported 0 as the clock
     * speed, so try to catch that. We could test the GPU and figure
     * out what the FLOPs should be to get a better estimate. */
    if (gflops <= 100.0)
    {
        mw_printf("Warning: Bizarrely low flops (%.0f). Defaulting to %.0f\n", gflops, 100.0);
        gflops = 100.0;
    }

    return gflops;
}
/*
 * Choose the number of OpenMP threads to use.
 *
 * numThreads: requested thread count; a value <= 0 means "not given",
 *             in which case BOINC's CPU count is used if it is positive.
 *
 * Returns 1 if OpenMP reports a nonsensical processor count, otherwise 0.
 * Without OpenMP this does nothing and returns 0.
 */
static int nbSetNumThreads(int numThreads)
{
#ifdef _OPENMP
    int nProc = omp_get_num_procs();
    int nBoinc = mwGetBoincNumCPU();

    if (nProc <= 0) /* It's happened before... */
    {
        mw_printf("Number of processors %d is crazy\n", nProc);
        return 1;
    }

    /* If command line argument not given, and BOINC gives us a value use that */
    if (numThreads <= 0 && nBoinc > 0)
    {
        numThreads = nBoinc;
    }

    /* omp_set_num_threads() requires a positive count, so a negative
     * request with no BOINC value leaves the OpenMP default alone. */
    if (numThreads > 0)
    {
        omp_set_num_threads(numThreads);
        mw_printf("Using OpenMP %d max threads on a system with %d processors\n",
                  omp_get_max_threads(),
                  nProc);
    }
#else
    (void) numThreads; /* unused without OpenMP */
#endif

    return 0;
}
/*
 * Resume from a checkpoint if there is a usable one, otherwise do the
 * setup for a new run from the input file.
 *
 * A new run is started when nbf->ignoreCheckpoint is set or the
 * resolved checkpoint does not exist. For a custom Lua potential the
 * per-thread potential evaluation states are opened afterwards in
 * either case.
 *
 * Returns NBODY_SUCCESS or the status describing what failed.
 */
static NBodyStatus nbResumeOrNewRun(NBodyCtx* ctx, NBodyState* st, const NBodyFlags* nbf)
{
    int startNewRun;

    if (nbResolveCheckpoint(st, nbf->checkpointFileName))
    {
        mw_printf("Failed to resolve checkpoint\n");
        return NBODY_ERROR;
    }

    startNewRun = nbf->ignoreCheckpoint || !nbResolvedCheckpointExists(st);

    if (!startNewRun)
    {
        /* Resume from checkpoint */
        if (nbf->inputFile && !BOINC_APPLICATION)
        {
            mw_printf("Warning: input file '%s' unused\n", nbf->inputFile);
        }

        if (nbReadCheckpoint(ctx, st))
        {
            mw_report("Failed to read checkpoint\n");
            return NBODY_CHECKPOINT_ERROR;
        }

        mw_report("Resumed from checkpoint '%s'\n", nbf->checkpointFileName);
    }
    else
    {
        /* No checkpoint to use, so we need an input file */
        if (!nbf->inputFile)
        {
            mw_printf("No input file and no checkpoint\n");
            return NBODY_USER_ERROR;
        }

        if (nbSetup(ctx, st, nbf))
        {
            mw_printf("Failed to read input parameters file\n");
            return NBODY_PARAM_FILE_ERROR;
        }
    }

    /* We're using a custom potential, so we'll reevaluate the
     * script. We must do this once per thread. */
    if (   ctx->potentialType == EXTERNAL_POTENTIAL_CUSTOM_LUA
        && nbOpenPotentialEvalStatePerThread(st, nbf))
    {
        return NBODY_PARAM_FILE_ERROR;
    }

    return NBODY_SUCCESS;
}
static int worker(const SeparationFlags* sf) { AstronomyParameters ap; BackgroundParameters bgp = EMPTY_BACKGROUND_PARAMETERS; Streams streams = EMPTY_STREAMS; IntegralArea* ias = NULL; StreamConstants* sc = NULL; SeparationResults* results = NULL; int rc; CLRequest clr; memset(&ap, 0, sizeof(ap)); memset(&clr, 0, sizeof(clr)); setCLReqFlags(&clr, sf); ias = prepareParameters(sf, &ap, &bgp, &streams); if (!ias) return 1; rc = setAstronomyParameters(&ap, &bgp); if (rc) { mwFreeA(ias); freeStreams(&streams); return 1; } setExpStreamWeights(&ap, &streams); sc = getStreamConstants(&ap, &streams); if (!sc) { mw_printf("Failed to get stream constants\n"); mwFreeA(ias); freeStreams(&streams); return 1; } results = newSeparationResults(ap.number_streams); rc = evaluate(results, &ap, ias, &streams, sc, sf->star_points_file, &clr, sf->do_separation, sf->ignoreCheckpoint, sf->separation_outfile); if (rc) mw_printf("Failed to calculate likelihood\n"); printSeparationResults(results, ap.number_streams); mwFreeA(ias); mwFreeA(sc); freeStreams(&streams); freeSeparationResults(results); return rc; }
static cl_int runIntegral(CLInfo* ci, SeparationCLMem* cm, RunSizes* runSizes, EvaluationState* es, const CLRequest* clr, const AstronomyParameters* ap, const IntegralArea* ia) { cl_int err = CL_SUCCESS; double t1, t2, dt; double tAcc = 0.0; for (; es->nu_step < ia->nu_steps; es->nu_step++) { if (clr->enableCheckpointing && timeToCheckpointGPU(es, ia)) { err = checkpointCL(ci, cm, ia, es); if (err != CL_SUCCESS) break; } t1 = mwGetTimeMilli(); err = runNuStep(ci, ia, runSizes, es->nu_step); if (err != CL_SUCCESS) { mwPerrorCL(err, "Failed to run nu step"); return err; } t2 = mwGetTimeMilli(); dt = t2 - t1; tAcc += dt; reportProgress(ap, ia, es, es->nu_step + 1, dt); } es->nu_step = 0; mw_printf("Integration time: %f s. Average time per iteration = %f ms\n", tAcc / 1000.0, tAcc / (double) ia->nu_steps); if (err == CL_SUCCESS) { err = readKernelResults(ci, cm, es, ia); if (err != CL_SUCCESS) mw_printf("Failed to read final kernel results\n"); /* Add final episode to running totals */ addTmpCheckpointSums(es); } return err; }
/*
 * Fill in any flags which were not given on the command line, using
 * BOINC's suggestions and the project preferences.
 *
 * This is a bit convoluted: boinc_init has to happen before the
 * preferences can be read, but it must wait until the arguments have
 * been read in case output redirection is to be disabled. The command
 * line still has to win over the project prefs and over the device
 * named by app_init_data, so only unset flags are touched here.
 */
static void setFlagsFromPreferences(SeparationFlags* flags,
                                    const SeparationPrefs* prefs,
                                    const char* progName)
{
    if (flags->useDevNumber < 0)
    {
        /* Take BOINC's suggestion from app_init_data. We might not
         * get one, in which case just use the first device. */
        flags->useDevNumber = mwGetBoincOpenCLDeviceIndex();
        if (flags->useDevNumber < 0)
        {
            flags->useDevNumber = 0;
        }
    }

    if (flags->preferredPlatformVendor == NULL)
    {
        const char* vendor = mwGetBoincOpenCLPlatformVendor();

        if (vendor == NULL)
        {
            /* BOINC didn't tell us, so guess based on the binary name */
            vendor = mwGuessPreferredPlatform(progName);
            if (vendor != NULL)
            {
                mw_printf("Guessing preferred OpenCL vendor '%s'\n", vendor);
            }
        }
        else
        {
            mw_printf("BOINC GPU type suggests using OpenCL vendor '%s'\n", vendor);
        }

        if (vendor != NULL)
        {
            flags->preferredPlatformVendor = strdup(vendor);
        }
        else
        {
            flags->preferredPlatformVendor = NULL;
        }
    }

    if (flags->targetFrequency <= 0.0)
    {
        flags->targetFrequency = prefs->gpuTargetFrequency;
    }

    if (flags->nonResponsive < 0)
    {
        flags->nonResponsive = prefs->gpuNonResponsive;
    }

    /* For GPU versions, default to the preferred GPU process priority if not set */
    if (   flags->processPriority == MW_PRIORITY_INVALID
        && SEPARATION_OPENCL
        && !flags->forceNoOpenCL)
    {
        flags->processPriority = prefs->gpuProcessPriority;
    }
}
/*
 * Read an integral area from the Lua table at stack index `table`.
 *
 * The step counts arrive as Lua numbers and are converted to unsigned
 * integers. The area is rejected if any step count is 0 or if the
 * product nu * mu * r would overflow 64 bits.
 *
 * Returns 0 and stores the area in *iaOut on success, 1 on error.
 */
static int readIntegralArea(lua_State* luaSt, IntegralArea* iaOut, int table)
{
    uint64_t r, mu, nu;

    /* These are static because the static argument table below stores
     * their addresses. */
    static IntegralArea ia;
    static real nuStepsf, muStepsf, rStepsf;
    static const MWNamedArg iaArgTable[] =
        {
            { "nu_min",   LUA_TNUMBER, NULL, TRUE, &ia.nu_min },
            { "nu_max",   LUA_TNUMBER, NULL, TRUE, &ia.nu_max },
            { "nu_steps", LUA_TNUMBER, NULL, TRUE, &nuStepsf  },
            { "mu_min",   LUA_TNUMBER, NULL, TRUE, &ia.mu_min },
            { "mu_max",   LUA_TNUMBER, NULL, TRUE, &ia.mu_max },
            { "mu_steps", LUA_TNUMBER, NULL, TRUE, &muStepsf  },
            { "r_min",    LUA_TNUMBER, NULL, TRUE, &ia.r_min  },
            { "r_max",    LUA_TNUMBER, NULL, TRUE, &ia.r_max  },
            { "r_steps",  LUA_TNUMBER, NULL, TRUE, &rStepsf   },
            END_MW_NAMED_ARG
        };

    handleNamedArgumentTable(luaSt, iaArgTable, table);

    ia.nu_steps = (unsigned int) nuStepsf;
    ia.mu_steps = (unsigned int) muStepsf;
    ia.r_steps = (unsigned int) rStepsf;

    /* Widen so the overflow check below can be done in 64 bits */
    r = (uint64_t) ia.r_steps;
    mu = (uint64_t) ia.mu_steps;
    nu = (uint64_t) ia.nu_steps;

    if (nu == 0 || mu == 0 || r == 0)
    {
        /* Print the unsigned int fields: passing the uint64_t copies
         * to %u is a format / argument type mismatch. */
        mw_printf("Integral size { %u, %u, %u } cannot be 0\n",
                  ia.nu_steps, ia.mu_steps, ia.r_steps);
        return 1;
    }

    if ((r > UINT64_MAX / mu) || ((r * mu) > UINT64_MAX / nu))
    {
        mw_printf("Integral size { %u, %u, %u } will overflow progress calculation\n",
                  ia.nu_steps, ia.mu_steps, ia.r_steps);
        return 1;
    }

    calcIntegralStepSizes(&ia);

    *iaOut = ia;

    return 0;
}
/*
 * Iterate on the current solution in state until it is optimal.
 *
 * Nothing needs to be done when ssize or dsize is 1. Otherwise up to
 * MAX_ITERATIONS - 1 rounds are run; the loop stops early when
 * emdFindBasicVariables() returns a negative result or when the
 * smallest delta is no less than -eps.
 *
 * Returns 1 if the iteration fails to converge, 0 otherwise (including
 * when the iteration limit is reached).
 */
static int emdIterateSolution(EMDState* state)
{
    int result;
    float min_delta;
    float eps = EMD_EPS * state->max_cost;

    /* if ssize = 1 or dsize = 1 then we are done, else ... */
    if (state->ssize > 1 && state->dsize > 1)
    {
        int itr;

        for (itr = 1; itr < MAX_ITERATIONS; itr++)
        {
            /* find basic variables */
            result = emdFindBasicVariables(state->cost, state->is_x,
                                           state->u, state->v,
                                           state->ssize, state->dsize);
            if (result < 0)
            {
                break;
            }

            /* check for optimality */
            min_delta = emdIsOptimal(state->cost, state->is_x,
                                     state->u, state->v,
                                     state->ssize, state->dsize,
                                     state->enter_x);

            if (min_delta == EMD_INF)
            {
                mw_printf("Iteration didn't converge\n");
                return 1;
            }

            /* if no negative deltamin, we found the optimal solution */
            if (min_delta >= -eps)
            {
                break;
            }

            /* improve solution */
            if (!emdNewSolution(state))
            {
                mw_printf("Iteration didn't converge\n");
                return 1;
            }
        }
    }

    return 0;
}
/*
 * Push a copy of the object at p onto the Lua stack as a userdata
 * carrying the metatable registered under typeName.
 *
 * typeSize is the number of bytes allocated and copied.
 *
 * Returns 1 on success, 0 if the userdata could not be created.
 */
int pushType(lua_State* luaSt, const char* typeName, size_t typeSize, void* p)
{
    void* udata = lua_newuserdata(luaSt, typeSize);

    if (udata == NULL)
    {
        mw_printf("Creating userdata '%s' failed\n", typeName);
        return 0;
    }

    /* This must be true for dSFMT intrinsics stuff to work */
    assert((uintptr_t) udata % MW_LUA_ALIGN == 0);

    /* Attach the metatable for this type to the new userdata */
    luaL_getmetatable(luaSt, typeName);
    lua_setmetatable(luaSt, -2);

  #if 0
    /* Give this object a new function environment; for installing
     * arbitrary lua functions into a type */
    lua_newtable(luaSt);
    lua_setfenv(luaSt, -2);
  #endif

    memcpy(udata, p, typeSize);

    return 1;
}
StreamConstants* getStreamConstants(const AstronomyParameters* ap, const Streams* streams) { int i; StreamConstants* sc; real stream_sigma; real sigma_sq2; sc = (StreamConstants*) mwMallocA(streams->number_streams * sizeof(StreamConstants)); for (i = 0; i < streams->number_streams; ++i) { stream_sigma = streams->parameters[i].sigma; if (stream_sigma == 0.0) { mw_printf("stream sigma 0.0 is invalid\n"); mwFreeA(sc); return NULL; } sc[i].large_sigma = (stream_sigma > SIGMA_LIMIT || stream_sigma < -SIGMA_LIMIT); sigma_sq2 = 2.0 * sqr(stream_sigma); sc[i].sigma_sq2_inv = 1.0 / sigma_sq2; sc[i].a = streamA(&streams->parameters[i]); sc[i].c = streamC(ap, ap->wedge, streams->parameters[i].mu, streams->parameters[i].r); } return sc; }
/*Functions for reading parameters from files */ real* fread_double_array(FILE* file, const char* array_name, unsigned int* sizeOut) { unsigned int i, size; int rc; real* arr; fscanf(file, array_name); fscanf(file, "[%u]: ", &size); arr = (real*) mwMalloc(sizeof(real) * size); for (i = 0; i < size; i++) { rc = fscanf(file, READ_DOUBLE_ARRAY_READ_STR, &arr[i]); if (rc != 1) { mw_printf("Error reading into %s\n", array_name); free(arr); return NULL; } if (i < size - 1) fscanf(file, ", "); } fscanf(file, "\n"); if (sizeOut) *sizeOut = size; return arr; }
/*
 * Fill *prefsOut with the separation preferences.
 *
 * Every field starts at its compiled-in default. When running under
 * BOINC, the project preferences are then read over the defaults,
 * unless the app init data cannot be read. Command line arguments are
 * applied elsewhere and override these; they also work without BOINC.
 */
static void separationReadPreferences(SeparationPrefs* prefsOut)
{
    /* Static because the static table below stores addresses of its fields */
    static SeparationPrefs prefs;
    static MWProjectPrefs sepPrefs[] =
        {
            { "gpu_target_frequency", MW_PREF_DOUBLE, FALSE, &prefs.gpuTargetFrequency   },
            { "gpu_wait_factor",      MW_PREF_DOUBLE, FALSE, &prefs.gpuWaitFactor        },
            { "gpu_non_responsive",   MW_PREF_BOOL,   FALSE, &prefs.gpuNonResponsive     },
            { "gpu_process_priority", MW_PREF_INT,    FALSE, &prefs.gpuProcessPriority   },
            { "no_gpu_checkpoint",    MW_PREF_BOOL,   FALSE, &prefs.gpuDisableCheckpoint },
            END_MW_PROJECT_PREFS
        };

    /* Start from the defaults */
    prefs.gpuTargetFrequency   = DEFAULT_TARGET_FREQUENCY;
    prefs.gpuWaitFactor        = DEFAULT_WAIT_FACTOR;
    prefs.gpuNonResponsive     = DEFAULT_NON_RESPONSIVE;
    prefs.gpuProcessPriority   = DEFAULT_GPU_PRIORITY;
    prefs.gpuDisableCheckpoint = DEFAULT_DISABLE_GPU_CHECKPOINTING;

    if (BOINC_APPLICATION)
    {
        if (!mwGetAppInitData())
        {
            mwReadProjectPrefs(sepPrefs, mwGetProjectPrefs());
        }
        else
        {
            mw_printf("Error reading app init data. Project preferences will not be used\n");
        }
    }

    *prefsOut = prefs;
}
/*
 * Run every chunk of the integral kernel for one nu step.
 *
 * The kernel arguments for nu_step are set first. The chunks are then
 * run inside a critical section, advancing the offset by
 * runSizes->global[0] after each one, until all runSizes->nChunk
 * chunks have run or one of them fails.
 *
 * Returns CL_SUCCESS or the first error.
 */
static cl_int runNuStep(CLInfo* ci, const IntegralArea* ia, const RunSizes* runSizes, cl_uint nu_step)
{
    size_t offset[1] = { 0 };
    cl_uint chunk = 0;
    cl_int err = setNuKernelArgs(ia, nu_step);

    if (err != CL_SUCCESS)
    {
        mw_printf("Failed to set nu kernel argument\n");
        return err;
    }

    mw_begin_critical_section();

    while (chunk < runSizes->nChunk && err == CL_SUCCESS)
    {
        err = runIntegralKernel(ci, runSizes, offset);
        checkQuitRequest(); /* Kernel has finished by now */

        offset[0] += runSizes->global[0];
        ++chunk;
    }

    mw_end_critical_section();

    return err;
}
/*
 * Print a short summary of a device: name, vendor, type, driver and
 * version strings, compute capability, compute unit count, clock
 * frequency, memory sizes and the double precision extensions.
 */
void mwPrintDevInfoShort(const DevInfo* di)
{
    mw_printf(
        "Device '%s' (%s:0x%x) (%s)\n"
        "Driver version: %s\n"
        "Version: %s\n"
        "Compute capability: %u.%u\n"
        "Max compute units: %u\n"
        "Clock frequency: %u Mhz\n"
        "Global mem size: " LLU "\n"
        "Local mem size: " LLU "\n"
        "Max const buf size: " LLU "\n"
        "Double extension: %s\n",

        /* Device line */
        di->devName,
        di->vendor,
        di->vendorID,
        showCLDeviceType(di->devType),

        /* Driver and OpenCL version strings */
        di->driver,
        di->version,

        /* Compute capability, major.minor */
        di->computeCapabilityMajor,
        di->computeCapabilityMinor,

        di->maxCompUnits,
        di->clockFreq,

        /* Memory sizes */
        di->memSize,
        di->localMemSize,
        di->maxConstBufSize,

        showMWDoubleExts(di->doubleExts));
}
/*
 * Read an integer array written as "<array_name>[<size>]: v0, v1, ..."
 * from file.
 *
 * array_name is handed to fscanf() as the format, so its text is
 * matched against the input; it must therefore not contain any '%'
 * conversion.
 *
 * Returns a newly allocated array, with the element count stored in
 * *sizeOut when sizeOut is not NULL. Returns NULL if the size or any
 * element cannot be read.
 */
int* fread_int_array(FILE *file, const char *array_name, unsigned int* sizeOut)
{
    unsigned int i, size;
    int* arr;

    fscanf(file, array_name);

    /* Without this check a malformed header would leave size
     * uninitialized and it would then be used for the allocation. */
    if (fscanf(file, "[%u]: ", &size) != 1)
    {
        mw_printf("Error reading size of %s\n", array_name);
        return NULL;
    }

    arr = (int*) mwMalloc(sizeof(int) * size);

    for (i = 0; i < size; i++)
    {
        if (fscanf(file, "%d", &arr[i]) != 1)
        {
            mw_printf("Error reading into %s\n", array_name);
            free(arr);
            return NULL;
        }

        /* Skip the separator after every element except the last */
        if (i < size - 1)
        {
            fscanf(file, ", ");
        }
    }

    fscanf(file, "\n");

    if (sizeOut)
    {
        *sizeOut = size;
    }

    return arr;
}
/* The transactional stuff is only available on Vista and later */ static void initW32TransactionalFunctions() { HMODULE ktm32Lib; HMODULE kernel32Lib; transactionFuncsInit = TRUE; kernel32Lib = LoadLibrary("Kernel32.dll"); if (!kernel32Lib) { mwPerrorW32("Could not load Kernel32.dll"); return; } ktm32Lib = LoadLibrary("KtmW32.dll"); if (!ktm32Lib) { mwPerrorW32("Could not load Ktm32.dll"); return; } __CreateTransaction = GetProcAddress(ktm32Lib, "CreateTransaction"); __CommitTransaction = GetProcAddress(ktm32Lib, "CommitTransaction"); __MoveFileTransacted = GetProcAddress(kernel32Lib, "MoveFileTransactedA"); transactionFuncsOK = (__CreateTransaction && __MoveFileTransacted && __CommitTransaction); if (!transactionFuncsOK) { mw_printf("Failed to get transaction functions\n"); } }
/*
 * Run the simulation from st->step up to ctx->nStep.
 *
 * Status codes from the individual stages are OR'd together; the run
 * stops as soon as the accumulated status is fatal. A final checkpoint
 * is written afterwards when running under BOINC or when
 * ctx->checkpointT is non-negative.
 */
NBodyStatus nbRunSystemPlain(const NBodyCtx* ctx, NBodyState* st)
{
    NBodyStatus rc = NBODY_SUCCESS;

    /* Calculate accelerations for 1st step this episode */
    rc |= nbGravMap(ctx, st);
    if (nbStatusIsFatal(rc))
    {
        return rc;
    }

    while (st->step < ctx->nStep)
    {
        nbAddTracePoint(ctx, st);
        nbUpdateDisplayedBodies(ctx, st);

        /* advance N-body system */
        rc |= nbStepSystemPlain(ctx, st);
        if (nbStatusIsFatal(rc))
        {
            return rc;
        }

        rc |= nbCheckpoint(ctx, st);
        if (nbStatusIsFatal(rc))
        {
            return rc;
        }

        nbReportProgress(ctx, st);
    }

    if (!(BOINC_APPLICATION || ctx->checkpointT >= 0))
    {
        return rc; /* no final checkpoint wanted */
    }

    mw_report("Making final checkpoint\n");
    if (nbWriteCheckpoint(ctx, st))
    {
        mw_printf("Failed to write final checkpoint\n");
        return NBODY_CHECKPOINT_ERROR;
    }

    return rc;
}
/*
 * Pick device number clr->devNum out of the nDev devices in devs,
 * store it in ci->dev and look up its type into ci->devType.
 *
 * Returns MW_CL_ERROR if the requested index is out of range,
 * otherwise the result of the device type query.
 */
cl_int mwSelectDevice(CLInfo* ci, const cl_device_id* devs, const CLRequest* clr, const cl_uint nDev)
{
    cl_int err;

    if (clr->devNum >= nDev)
    {
        mw_printf("Requested device is out of range of number found devices\n");
        return MW_CL_ERROR;
    }

    ci->dev = devs[clr->devNum];

    err = mwGetDeviceType(ci->dev, &ci->devType);
    if (err != CL_SUCCESS)
    {
        mw_printf("Failed to find type of device %u\n", clr->devNum);
    }

    return err;
}
/*
 * Error callback for the CL context: prints the error text. The
 * remaining arguments are not used.
 */
static void CL_CALLBACK contextCallback(const char* errInfo,
                                        const void* privateInfo,
                                        size_t cb,
                                        void* userData)
{
    (void) privateInfo;
    (void) cb;
    (void) userData;

    mw_printf("CL context error: %s\n", errInfo);
}
/*
 * Prepare an NBodyState for running with OpenCL: allocate the CL
 * bookkeeping structures, set up the device, choose work sizes, load
 * the kernels, create the buffers and copy the bodies over.
 *
 * The bodies must already be set in st. The custom Lua potential is
 * not supported with OpenCL.
 *
 * Returns NBODY_SUCCESS or a status naming the stage that failed.
 */
NBodyStatus nbInitNBodyStateCL(NBodyState* st, const NBodyCtx* ctx, const CLRequest* clr)
{
    cl_int err;

    /* Bodies must be set before trying to use this */
    if (!st->bodytab)
    {
        return NBODY_CONSISTENCY_ERROR;
    }

    if (ctx->potentialType == EXTERNAL_POTENTIAL_CUSTOM_LUA)
    {
        mw_printf("Cannot use Lua potential with OpenCL\n");
        return NBODY_UNSUPPORTED;
    }

    st->usesQuad  = ctx->useQuad;
    st->usesExact = (ctx->criterion == Exact);
    st->usesCL    = TRUE;

    st->ci        = mwCalloc(1, sizeof(CLInfo));
    st->nbb       = mwCalloc(1, sizeof(NBodyBuffers));
    st->workSizes = mwCalloc(1, sizeof(NBodyWorkSizes));
    st->kernels   = mwCalloc(1, sizeof(NBodyKernels));

    err = mwSetupCL(st->ci, clr);
    if (err != CL_SUCCESS)
    {
        return NBODY_CL_ERROR;
    }

    if (!nbCheckDevCapabilities(&st->ci->di, ctx, st->nbody))
    {
        return NBODY_CAPABILITY_ERROR;
    }

    if (   nbSetThreadCounts(st->workSizes, &st->ci->di, ctx)
        || nbSetWorkSizes(st->workSizes, &st->ci->di))
    {
        return NBODY_ERROR;
    }

    st->effNBody = nbFindEffectiveNBody(st->workSizes, st->usesExact, st->nbody);

    if (nbLoadKernels(ctx, st))
    {
        return NBODY_CL_ERROR;
    }

    /* Each of these stops at the first failure, in this order */
    if (   (err = nbCreateBuffers(ctx, st)) != CL_SUCCESS
        || (err = nbSetInitialTreeStatus(st)) != CL_SUCCESS
        || (err = nbSetAllKernelArguments(st)) != CL_SUCCESS
        || (err = nbMarshalBodies(st, CL_TRUE)) != CL_SUCCESS)
    {
        return NBODY_CL_ERROR;
    }

    return NBODY_SUCCESS;
}
/*
 * Turn off denormal handling in the SSE control register by setting
 * the 0x8000 and 0x0040 bits of MXCSR (flush-to-zero and
 * denormals-are-zero in Intel's documentation).
 *
 * Returns the previous MXCSR value.
 */
int mwDisableDenormalsSSE(void)
{
    const int previous = _mm_getcsr();

    _mm_setcsr(previous | 0x8040);
    mw_printf("Disabled denormals\n");

    return previous;
}
int resolveCheckpoint(void) { int rc; rc = mw_resolve_filename(CHECKPOINT_FILE, resolvedCheckpointPath, sizeof(resolvedCheckpointPath)); if (rc) mw_printf("Error resolving checkpoint file '%s': %d\n", CHECKPOINT_FILE, rc); return rc; }
static void emdPrintFlowMatrix(const float* flow, int size1, int size2) { int i, j; const int flowStep = 1; if (!flow) { mw_printf("Empty flow\n"); return; } for (i = 0; i < size1; ++i) { for (j = 0; j < size2; ++j) { mw_printf("Flow[ % d][ % d] = % f\n", i, j, flow[flowStep * j + i]); } } }