int main(int argc, char **argv) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); std::cout << " === Possible cmd line options: -pregenerate, -cache === " << std::endl; /* set default camera */ g_camera.from = Vec3fa(1.5f,1.5f,-1.5f); g_camera.to = Vec3fa(0.0f,0.0f,0.0f); /*! Parse command line options. */ parseCommandLine(new ParseStream(new CommandLineStream(argc, argv)), FileName()); /*! Set the thread count in the Embree configuration string. */ if (g_numThreads) g_rtcore += ",threads=" + std::to_string((long long)g_numThreads); g_rtcore += g_subdiv_mode; /*! Initialize Embree state. */ init(g_rtcore.c_str()); /* render to disk */ if (outFilename.str() != "") renderToFile(outFilename); /* interactive mode */ if (g_interactive) { initWindowState(argc,argv,tutorialName, g_width, g_height, g_fullscreen); enterWindowRunLoop(); } return 0; }
void Mixer::fifoWriter::run() { // set denormal protection for this thread #ifdef __SSE3__ /* DAZ flag */ _MM_SET_DENORMALS_ZERO_MODE( _MM_DENORMALS_ZERO_ON ); #endif #ifdef __SSE__ /* FTZ flag */ _MM_SET_FLUSH_ZERO_MODE( _MM_FLUSH_ZERO_ON ); #endif #if 0 #ifdef LMMS_BUILD_LINUX #ifdef LMMS_HAVE_SCHED_H cpu_set_t mask; CPU_ZERO( &mask ); CPU_SET( 0, &mask ); sched_setaffinity( 0, sizeof( mask ), &mask ); #endif #endif #endif const fpp_t frames = m_mixer->framesPerPeriod(); while( m_writing ) { surroundSampleFrame * buffer = new surroundSampleFrame[frames]; const surroundSampleFrame * b = m_mixer->renderNextBuffer(); memcpy( buffer, b, frames * sizeof( surroundSampleFrame ) ); m_fifo->write( buffer ); } m_fifo->write( NULL ); }
static uintptr_t recordThread_(void *recordData_) { recordData_t *recordData = (recordData_t *)recordData_; int retval = 0; const int bufsize = 4096; MYFLT buf[bufsize]; _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); while (recordData->running) { pthread_mutex_lock(&recordData->mutex); pthread_cond_wait(&recordData->condvar, &recordData->mutex); int sampsread; do { sampsread = csoundReadCircularBuffer(NULL, recordData->cbuf, buf, bufsize); #ifdef USE_DOUBLE sf_write_double((SNDFILE *) recordData->sfile, buf, sampsread); #else sf_write_float((SNDFILE *) recordData->sfile, buf, sampsread); #endif } while(sampsread != 0); pthread_mutex_unlock(&recordData->mutex); } return (uintptr_t) ((unsigned int) retval); }
/* main function in embree namespace */ int main(int argc, char** argv) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* set default camera */ g_camera.from = Vec3fa(2.5f,2.5f,2.5f); g_camera.to = Vec3fa(0.0f,0.0f,0.0f); /* create stream for parsing */ Ref<ParseStream> stream = new ParseStream(new CommandLineStream(argc, argv)); /* parse command line */ parseCommandLine(stream, FileName()); if (g_numThreads) g_rtcore += ",threads=" + toString(g_numThreads); /* initialize ray tracing core */ init(g_rtcore.c_str()); /* render to disk */ if (outFilename.str() != "") { renderToFile(outFilename); return 0; } /* initialize GLUT */ initWindowState(argc,argv,tutorialName, g_width, g_height, g_fullscreen); /* enter the GLUT run loop */ enterWindowRunLoop(); return 0; }
void TimeLagFilterCore::DTCalcThread::run(){ //Disable denormalized floats _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); //Start float b, c; while(!threadShouldExit()){ if(!core.paramsChanged) wait(-1); if(threadShouldExit()) return; if(core.sampleSwapDT){ //Don't recalculate if waiting for sample to finish wait(1); //Try again soon }else{ //Copy write to calc { const ScopedWriteLock writeLock(core.ctLock); memcpy(core.ct_calc, core.ct_write, core.num_filters * sizeof (CTParams)); core.paramsChanged = false; } //Calculate filter coefficients const ScopedWriteLock writeLock(core.dtLock); for(int i=0; i<core.num_filters; ++i){ filtercalculations(core.reduced_fs, core.ct_calc[i].center, core.ct_calc[i].bw, &b, &c); core.dt_calc[i].b = b; core.dt_calc[i].c = c; } core.sampleSwapDT = true; } } }
/* main function in embree namespace */ int main(int argc, char** argv) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* initialize ray tracing core and force bvh4.triangle4v hierarchy for triangles */ rtcInit("tri_accel=bvh4.triangle4v"); /* set error handler */ rtcSetErrorFunction(error_handler); /* create scene */ g_scene = rtcNewScene(RTC_SCENE_STATIC,RTC_INTERSECT1); addCube(g_scene,Vec3fa(-1,0,0)); addCube(g_scene,Vec3fa(1,0,0)); addCube(g_scene,Vec3fa(0,0,-1)); addCube(g_scene,Vec3fa(0,0,1)); addHair(g_scene); addGroundPlane(g_scene); rtcCommit (g_scene); /* print triangle BVH */ print_bvh(g_scene); /* cleanup */ rtcDeleteScene (g_scene); rtcExit(); return 0; }
/* main function in embree namespace */ int main(int argc, char** argv) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* create stream for parsing */ Ref<ParseStream> stream = new ParseStream(new CommandLineStream(argc, argv)); /* parse command line */ parseCommandLine(stream, FileName()); /* load default scene if none specified */ if (filename.ext() == "") { FileName file = FileName::executableFolder() + FileName("models/cornell_box.ecs"); parseCommandLine(new ParseStream(new LineCommentFilter(file, "#")), file.path()); } /* configure number of threads */ if (g_numThreads) g_rtcore += ",threads=" + std::to_string((long long)g_numThreads); if (g_numBenchmarkFrames) g_rtcore += ",benchmark=1"; g_rtcore += g_subdiv_mode; /* load scene */ if (strlwr(filename.ext()) == std::string("obj")) { g_scene->add(loadOBJ(filename,g_subdiv_mode != "")); } else if (strlwr(filename.ext()) == std::string("xml")) { g_scene->add(loadXML(filename,one)); } else if (filename.ext() != "") THROW_RUNTIME_ERROR("invalid scene type: "+strlwr(filename.ext())); /* initialize ray tracing core */ init(g_rtcore.c_str()); /* send model */ g_obj_scene.add(g_scene.dynamicCast<SceneGraph::Node>(),g_instancing_mode); g_scene = nullptr; set_scene(&g_obj_scene); /* benchmark mode */ if (g_numBenchmarkFrames) renderBenchmark(outFilename); /* render to disk */ if (outFilename.str() != "") renderToFile(outFilename); /* interactive mode */ if (g_interactive) { initWindowState(argc,argv,tutorialName, g_width, g_height, g_fullscreen); enterWindowRunLoop(g_anim_mode); } return 0; }
static uintptr_t csoundPerformanceThread_(void *userData) { CsPerfThread_PerformScore p(userData); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // perform the score int retval = p.Perform(); // return positive value if stopped or end of score, and negative on error return (uintptr_t) ((unsigned int) retval); }
/*
 * Program entry point: sets the FPU flags and hands control to
 * ispcEntry(). Always returns 0.
 */
int main(int argc, char* argv[])
{
  /* FTZ and DAZ flags in the MXCSR control and status register, set for best performance */
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

  ispcEntry();

  return 0;
}
/*
 * Switches the flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags on
 * or off together.
 *
 * Returns true when built with USE_SSE3 (the flags were written), false
 * otherwise (nothing was changed).
 */
bool Context::setFlushDenormal(bool on) {
#ifdef USE_SSE3
  if (on) {
    // FTZ first, then DAZ
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  } else {
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
  }
  return true;
#else
  return false;
#endif
}
int main(int argc, char* argv[]) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* create new Embree device */ RTCDevice device = rtcNewDevice("verbose=1"); /* ddelete device again */ rtcDeleteDevice(device); return 0; }
void initMain(int argc, char** argv) { installLayerStackTracer(); std::string line; for (int i = 0; i < argc; ++i) { line += argv[i]; line += ' '; } #ifndef GFLAGS_GFLAGS_H_ namespace gflags = google; #endif gflags::ParseCommandLineFlags(&argc, &argv, true); initializeLogging(argc, argv); LOG(INFO) << "commandline: " << line; CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1]; installProfilerSwitch(); #ifdef __SSE__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); #endif #ifdef __SSE3__ _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); #endif if (FLAGS_seed == 0) { unsigned int t = time(NULL); srand(t); ThreadLocalRand::initSeed(t); LOG(INFO) << "random number seed=" << t; } else { srand(FLAGS_seed); ThreadLocalRand::initSeed(FLAGS_seed); } if (FLAGS_use_gpu) { // This is the initialization of the CUDA environment, // need before runInitFunctions. // TODO(hedaoyuan) Can be considered in the runInitFunctions, // but to ensure that it is the first to initialize. hl_start(); hl_init(FLAGS_gpu_id); } version::printVersion(); checkCPUFeature().check(); runInitFunctions(); }
void _initialize_cpu_thread () { debug_on_thread_spawn (); #ifndef XRCORE_STATIC // fpu & sse FPU::m24r (); #endif // XRCORE_STATIC if (CPU::ID.feature&_CPU_FEATURE_SSE) { //_mm_setcsr ( _mm_getcsr() | (_MM_FLUSH_ZERO_ON+_MM_DENORMALS_ZERO_ON) ); _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON); if (_denormals_are_zero_supported) { __try { _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON); } __except(EXCEPTION_EXECUTE_HANDLER) { _denormals_are_zero_supported = FALSE; } } }
void TimeLagFilterCore::DlyCalcThread::run(){ //Disable denormalized floats _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); //Start float phasesum, groupsum; float a, b, center, bw, ctrsquared; float phase, group; float w, wsquared, twow, dw; int f, nw; while(!threadShouldExit()){ if(!core.paramsChangedDelay) wait(-1); if(threadShouldExit()) return; { const ScopedReadLock readLock(core.ctLock); //Calculate delays dw = core.getMaxCtr() / ResponseGraph::NUM_RESP_W; w = 0.0f; for(nw = 0; nw < ResponseGraph::NUM_RESP_W; ++nw){ phasesum = 0.0f; groupsum = 0.0f; wsquared = w * w; twow = 2.0f * w; for(f=0; f<core.num_filters; ++f){ center = core.ct_write[f].center; bw = core.ct_write[f].bw; ctrsquared = center * center; a = ctrsquared - wsquared; b = twow * bw * center; phase = -2.0f * atan2(b, a); group = -4.0f * bw * center * (ctrsquared - (core.getMaxBW() * wsquared)) / (a*a + b*b); phasesum += phase; groupsum += group; } core.phasedelay[nw] = phasesum; core.groupdelay[nw] = groupsum; w += dw; } core.paramsChangedDelay = false; } } }
void MixerWorkerThread::run() { // set denormal protection for this thread #ifdef __SSE3__ /* DAZ flag */ _MM_SET_DENORMALS_ZERO_MODE( _MM_DENORMALS_ZERO_ON ); #endif #ifdef __SSE__ /* FTZ flag */ _MM_SET_FLUSH_ZERO_MODE( _MM_FLUSH_ZERO_ON ); #endif QMutex m; while( m_quit == false ) { m.lock(); queueReadyWaitCond->wait( &m ); globalJobQueue.run(); m.unlock(); } }
void RayEngine::embreeInit() { cout << "Starting Embree..." << endl; // Init library Embree.device = rtcNewDevice(NULL); _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // Generate texture glGenTextures(1, &Embree.texture); glBindTexture(GL_TEXTURE_2D, Embree.texture); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); glBindTexture(GL_TEXTURE_2D, 0); // Init scenes userData = this; for (uint i = 0; i < scenes.size(); i++) scenes[i]->embreeInit(Embree.device); }
/* exported for Rembedded.h */
/*
 * Configures the floating point unit.
 *
 * start != 0: startup settings (FreeBSD: fpsetmask(0); IEEE control word
 *             where NEED___SETFPUCW is defined; FTZ and DAZ switched off
 *             when built with the Intel compiler newer than version 800
 *             on x86).
 * start == 0: settings for shutdown (FreeBSD: fpsetmask(~0); default
 *             control word where NEED___SETFPUCW is defined).
 */
void fpu_setup(Rboolean start)
{
    if (!start) {
#ifdef __FreeBSD__
	fpsetmask(~0);
#endif

#ifdef NEED___SETFPUCW
	__setfpucw(_FPU_DEFAULT);
#endif
	return;
    }

#ifdef __FreeBSD__
    fpsetmask(0);
#endif

#ifdef NEED___SETFPUCW
    __setfpucw(_FPU_IEEE);
#endif

#if (defined(__i386) || defined(__x86_64)) && defined(__INTEL_COMPILER) && __INTEL_COMPILER > 800
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF);
#endif
}
/* main function in embree namespace */ int main(int argc, char** argv) { /* for best performance set FTZ and DAZ flags in MXCSR control and status register */ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); /* create stream for parsing */ Ref<ParseStream> stream = new ParseStream(new CommandLineStream(argc, argv)); /* parse command line */ parseCommandLine(stream, FileName()); /* load default scene if none specified */ if (filename.ext() == "") { FileName file = FileName::executableFolder() + FileName("models/cornell_box.ecs"); parseCommandLine(new ParseStream(new LineCommentFilter(file, "#")), file.path()); } /* configure number of threads */ if (g_numThreads) g_rtcore += ",threads=" + std::to_string((long long)g_numThreads); if (g_numBenchmarkFrames) g_rtcore += ",benchmark=1"; g_rtcore += g_subdiv_mode; /* load scene */ if (strlwr(filename.ext()) == std::string("obj")) { if (g_subdiv_mode != "") { std::cout << "enabling subdiv mode" << std::endl; loadOBJ(filename,one,g_obj_scene,true); } else loadOBJ(filename,one,g_obj_scene); } else if (strlwr(filename.ext()) == std::string("xml")) loadXML(filename,one,g_obj_scene); else if (filename.ext() != "") THROW_RUNTIME_ERROR("invalid scene type: "+strlwr(filename.ext())); /* load keyframes */ if (keyframeList.str() != "") loadKeyFrameAnimation(keyframeList); /* initialize ray tracing core */ init(g_rtcore.c_str()); /* set shader mode */ switch (g_shader) { case SHADER_EYELIGHT: key_pressed(GLUT_KEY_F2); break; case SHADER_UV : key_pressed(GLUT_KEY_F4); break; case SHADER_NG : key_pressed(GLUT_KEY_F5); break; case SHADER_GEOMID : key_pressed(GLUT_KEY_F6); break; case SHADER_GEOMID_PRIMID: key_pressed(GLUT_KEY_F7); break; }; /* convert triangle meshes to subdiv meshes */ if (g_only_subdivs) g_obj_scene.convert_to_subdiv(); /* send model */ set_scene(&g_obj_scene); /* send keyframes */ if (g_keyframes.size()) set_scene_keyframes(&*g_keyframes.begin(),g_keyframes.size()); /* 
benchmark mode */ if (g_numBenchmarkFrames) renderBenchmark(outFilename); /* render to disk */ if (outFilename.str() != "") renderToFile(outFilename); /* interactive mode */ if (g_interactive) { initWindowState(argc,argv,tutorialName, g_width, g_height, g_fullscreen); enterWindowRunLoop(g_anim_mode); } return 0; }
/*
 * Constructs the tutorial application.
 *
 * tutorialName: stored in the tutorialName member.
 * features:     forwarded to the Application base class; when
 *               FEATURE_STREAM is set the "mode" option is registered too.
 *
 * The constructor initializes all members to their defaults, records this
 * object as the single supported instance, sets the FTZ/DAZ flags and
 * registers the command line options together with their help texts.
 */
TutorialApplication::TutorialApplication (const std::string& tutorialName, int features)
  : Application(features),
    tutorialName(tutorialName),
    shader(SHADER_DEFAULT),
    width(512),
    height(512),
    pixels(nullptr),
    outputImageFilename(""),
    skipBenchmarkFrames(0),
    numBenchmarkFrames(0),
    numBenchmarkRepetitions(1),
    interactive(true),
    fullscreen(false),
    window_width(512),
    window_height(512),
    windowID(0),
    time0(getSeconds()),
    debug_int0(0),
    debug_int1(0),
    mouseMode(0),
    clickX(0),
    clickY(0),
    speed(1.0f),
    moveDelta(zero),
    command_line_camera(false),
    print_frame_rate(false),
    avg_render_time(64,1.0),
    avg_frame_time(64,1.0),
    avg_mrayps(64,1.0),
    print_camera(false),
    debug0(0),
    debug1(0),
    debug2(0),
    debug3(0),
    iflags_coherent(RTC_INTERSECT_COHERENT),
    iflags_incoherent(RTC_INTERSECT_INCOHERENT)
{
  /* only a single instance of this class is supported */
  assert(instance == nullptr);
  instance = this;

  /* for best performance set FTZ and DAZ flags in MXCSR control and status register */
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

  /* "-c": parse further options from a command file, relative to the current path */
  registerOption("c", [this] (Ref<ParseStream> cin, const FileName& path) {
      FileName file = path + cin->getFileName();
      parseCommandLine(new ParseStream(new LineCommentFilter(file, "#")), file.path());
    }, "-c <filename>: parses command line option from <filename>");

  /* "-o": output image file; also switches interactive mode off */
  registerOption("o", [this] (Ref<ParseStream> cin, const FileName& path) {
      outputImageFilename = cin->getFileName();
      interactive = false;
    }, "-o <filename>: output image filename");

  /* camera settings; each of them marks the camera as set from the command line */
  registerOption("vp", [this] (Ref<ParseStream> cin, const FileName& path) {
      camera.from = cin->getVec3fa();
      command_line_camera = true;
    }, "--vp <float> <float> <float>: camera position");

  registerOption("vi", [this] (Ref<ParseStream> cin, const FileName& path) {
      camera.to = cin->getVec3fa();
      command_line_camera = true;
    }, "--vi <float> <float> <float>: camera lookat position");

  /* the direction is applied relative to the camera position known at parse time */
  registerOption("vd", [this] (Ref<ParseStream> cin, const FileName& path) {
      camera.to = camera.from + cin->getVec3fa();
      command_line_camera = true;
    }, "--vd <float> <float> <float>: camera direction vector");

  registerOption("vu", [this] (Ref<ParseStream> cin, const FileName& path) {
      camera.up = cin->getVec3fa();
      command_line_camera = true;
    }, "--vu <float> <float> <float>: camera up vector");

  registerOption("fov", [this] (Ref<ParseStream> cin, const FileName& path) {
      camera.fov = cin->getFloat();
      command_line_camera = true;
    }, "--fov <float>: vertical field of view");

  /* framebuffer settings */
  registerOption("size", [this] (Ref<ParseStream> cin, const FileName& path) {
      width = cin->getInt();
      height = cin->getInt();
    }, "--size <width> <height>: sets image size");

  registerOption("fullscreen", [this] (Ref<ParseStream> cin, const FileName& path) {
      fullscreen = true;
    }, "--fullscreen: starts in fullscreen mode");

  /* benchmark settings: the repetition count is optional and only read when
     the next token is non-empty and does not start with '-' */
  registerOption("benchmark", [this] (Ref<ParseStream> cin, const FileName& path) {
      skipBenchmarkFrames = cin->getInt();
      numBenchmarkFrames = cin->getInt();
      if (cin->peek() != "" && cin->peek()[0] != '-')
        numBenchmarkRepetitions = cin->getInt();
      interactive = false;
      rtcore += ",benchmark=1,start_threads=1";
    }, "--benchmark <N> <M> <R>: enabled benchmark mode, builds scene, skips N frames, renders M frames, and repeats this R times");

  /* "nodisplay": non-interactive run with 0 skipped and 2048 benchmark frames */
  registerOption("nodisplay", [this] (Ref<ParseStream> cin, const FileName& path) {
      skipBenchmarkFrames = 0;
      numBenchmarkFrames = 2048;
      interactive = false;
    }, "--nodisplay: enabled benchmark mode, continously renders frames");

  /* console output switches */
  registerOption("print-frame-rate", [this] (Ref<ParseStream> cin, const FileName& path) {
      print_frame_rate = true;
    }, "--print-frame-rate: prints framerate for each frame on console");

  registerOption("print-camera", [this] (Ref<ParseStream> cin, const FileName& path) {
      print_camera = true;
    }, "--print-camera: prints camera for each frame on console");

  /* internal debugging values */
  registerOption("debug0", [this] (Ref<ParseStream> cin, const FileName& path) {
      debug0 = cin->getInt();
    }, "--debug0: sets internal debugging value");

  registerOption("debug1", [this] (Ref<ParseStream> cin, const FileName& path) {
      debug1 = cin->getInt();
    }, "--debug1: sets internal debugging value");

  registerOption("debug2", [this] (Ref<ParseStream> cin, const FileName& path) {
      debug2 = cin->getInt();
    }, "--debug2: sets internal debugging value");

  registerOption("debug3", [this] (Ref<ParseStream> cin, const FileName& path) {
      debug3 = cin->getInt();
    }, "--debug3: sets internal debugging value");

  /* startup shader selection; "cycles" additionally reads a float into scale,
     and an unknown name throws std::runtime_error */
  registerOption("shader", [this] (Ref<ParseStream> cin, const FileName& path) {
      std::string mode = cin->getString();
      if (mode == "default" ) shader = SHADER_DEFAULT;
      else if (mode == "eyelight") shader = SHADER_EYELIGHT;
      else if (mode == "occlusion") shader = SHADER_OCCLUSION;
      else if (mode == "uv" ) shader = SHADER_UV;
      else if (mode == "texcoords") shader = SHADER_TEXCOORDS;
      else if (mode == "texcoords-grid") shader = SHADER_TEXCOORDS_GRID;
      else if (mode == "Ng" ) shader = SHADER_NG;
      else if (mode == "cycles" ) {
        shader = SHADER_CYCLES;
        scale = cin->getFloat();
      }
      else if (mode == "geomID" ) shader = SHADER_GEOMID;
      else if (mode == "primID" ) shader = SHADER_GEOMID_PRIMID;
      else if (mode == "ao" ) shader = SHADER_AMBIENT_OCCLUSION;
      else throw std::runtime_error("invalid shader:" +mode);
    },
    "--shader <string>: sets shader to use at startup\n"
    " default: default tutorial shader\n"
    " eyelight: eyelight shading\n"
    " occlusion: occlusion shading\n"
    " uv: uv debug shader\n"
    " texcoords: texture coordinate debug shader\n"
    " texcoords-grid: grid texture debug shader\n"
    " Ng: visualization of shading normal\n"
    " cycles <float>: CPU cycle visualization\n"
    " geomID: visualization of geometry ID\n"
    " primID: visualization of geometry and primitive ID\n"
    " ao: ambient occlusion shader");

  if (features & FEATURE_STREAM)
  {
    /* register parsing of stream mode; this lambda captures nothing and
       writes the global g_mode */
    registerOption("mode", [] (Ref<ParseStream> cin, const FileName& path) {
        std::string mode = cin->getString();
        if (mode == "normal") g_mode = MODE_NORMAL;
        else if (mode == "stream") g_mode = MODE_STREAM;
        else throw std::runtime_error("invalid mode:" +mode);
      },
      "--mode: sets rendering mode\n"
      " normal : normal mode\n"
      " stream : stream mode\n");
  }

  /* "coherent"/"incoherent" force one hint for BOTH the coherent and the
     incoherent flag sets, in the members as well as in the globals */
  registerOption("coherent", [this] (Ref<ParseStream> cin, const FileName& path) {
      g_iflags_coherent = iflags_coherent = RTC_INTERSECT_COHERENT;
      g_iflags_incoherent = iflags_incoherent = RTC_INTERSECT_COHERENT;
    }, "--coherent: force using RTC_INTERSECT_COHERENT hint when tracing rays");

  registerOption("incoherent", [this] (Ref<ParseStream> cin, const FileName& path) {
      g_iflags_coherent = iflags_coherent = RTC_INTERSECT_INCOHERENT;
      g_iflags_incoherent = iflags_incoherent = RTC_INTERSECT_INCOHERENT;
    }, "--incoherent: force using RTC_INTERSECT_INCOHERENT hint when tracing rays");
}
/*
 * Program entry point of the CPU voxel renderer.
 *
 * Opens a GLFW window, sets up an orbit camera and a single voxel brick,
 * then loops until the window is closed: handles the arrow keys, prints a
 * statistics line about once per second, computes the per-frame ray setup
 * and renders the frame into the pixel buffer `data` (split over
 * TOTAL_THREADS screen areas with ENABLE_THREADS, one area otherwise).
 * With RENDER defined the buffer is uploaded as a texture and drawn as a
 * full-screen quad. Leaves through exit(), so the trailing return of main
 * is never reached.
 */
int main(void) {
  /* flush-to-zero and denormals-are-zero for this thread's SSE arithmetic */
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

  int width = 800, height = 600;
  GLFWwindow* window;
  glfwSetErrorCallback(error_callback);

  if (!glfwInit())
    exit(EXIT_FAILURE);

  window = glfwCreateWindow(width, height, "cpu-voxels", NULL, NULL);
  if (!window) {
    glfwTerminate();
    return 1;
  }

  glfwMakeContextCurrent(window);
  /* swap interval 0 */
  glfwSwapInterval(0);

  /* input callbacks */
  glfwSetKeyCallback(window, key_callback);
  glfwSetMouseButtonCallback(window, mouse_button_callback);
  glfwSetCursorPosCallback(window, mouse_move_callback);
  /* NOTE(review): the key callback is registered a second time here with the same handler */
  glfwSetKeyCallback(window, key_callback);

  /* orbit camera: eye on the +z axis at four brick sizes, looking at the origin */
  vec3 eye = vec3_create(0.0f, 0.0f, VOXEL_BRICK_SIZE * 4);
  vec3 center = vec3f(0.0f);
  vec3 up = vec3_create(0.0, 1.0, 0.0 );

  orbit_camera_init(eye, center, up);

  // TODO: handle resize
  /* the pixel buffer is sized once from the initial framebuffer size, at `stride` bytes per pixel */
  int dw, dh;
  glfwGetFramebufferSize(window, &dw, &dh);
  int stride = 3;
  int total = dw*dh*stride;
  /* NOTE(review): the malloc result is not checked, and `data` is not freed in this function */
  uint8_t *data = malloc(total);

  vec3 ro; //, rd;
  mat4 m4inverted, view;
  mat4 projection;
  /* the projection uses the initial width/height and is not recomputed inside the loop */
  mat4_perspective(
    projection,
    M_PI/4.0,
    (float)width/(float)height,
    0.1,
    1000.0
  );

  GLuint texture[1];

#ifdef ENABLE_THREADS
  /* one screen area per worker thread */
  screen_area areas[TOTAL_THREADS];
  threadpool thpool = thpool_init(TOTAL_THREADS);
#else
  screen_area areas[1];
#endif

  glGenTextures(1, texture);
  float start = glfwGetTime();
  int fps = 0;

  voxel_brick my_first_brick = voxel_brick_create();
  // TODO: make this work when the brick lb corner is not oriented at 0,0,0
  voxel_brick_position(my_first_brick, vec3f(0.0f));
  voxel_brick_fill(my_first_brick, &brick_fill);

  while (!glfwWindowShouldClose(window)) {
    /* arrow keys rotate the orbit camera */
    if (glfwGetKey(window, GLFW_KEY_LEFT) == GLFW_PRESS) {
      orbit_camera_rotate(0, 0, -.1, 0);
    }

    if (glfwGetKey(window, GLFW_KEY_RIGHT) == GLFW_PRESS) {
      orbit_camera_rotate(0, 0, .1, 0);
    }

    if (glfwGetKey(window, GLFW_KEY_UP) == GLFW_PRESS) {
      orbit_camera_rotate(0, 0, 0, .1);
    }

    if (glfwGetKey(window, GLFW_KEY_DOWN) == GLFW_PRESS) {
      orbit_camera_rotate(0, 0, 0, -.1);
    }

    /* NOTE(review): width/height follow the current framebuffer size while
       `data` keeps its initial size (see the resize TODO above) */
    glfwGetFramebufferSize(window, &width, &height);
    float now = glfwGetTime();
    /* once more than one second has passed: print the frame count and reset it */
    if (now - start > 1) {
      /* the product is evaluated in int before being widened */
      unsigned long long total_rays = (fps * width * height);
      /* NOTE(review): TOTAL_THREADS is used here outside any ENABLE_THREADS guard */
      printf("fps: %i (%f Mrays/s)@%ix%i - %i threads\n", fps, total_rays/1000000.0, width, height, TOTAL_THREADS);
      start = now;
      fps = 0;
    }
    fps++;

    /* ray origin from the view matrix; m4inverted = inverse(projection * view) */
    orbit_camera_view(view);
    ro = mat4_get_eye(view);

    mat4_mul(m4inverted, projection, view);
    mat4_invert(m4inverted, m4inverted);

    // compute 3 points so that we can interpolate instead of unprojecting
    // on every point
    vec3 rda, rdb, planeYPosition, dcol, drow;

    vec3 t0 = vec3_create(0, 0, 0), tx = vec3_create(1, 0, 0), ty = vec3_create(0, 1, 0);
    vec4 viewport = { 0, 0, width, height };

    rda = orbit_camera_unproject(t0, viewport, m4inverted);
    rdb = orbit_camera_unproject(tx, viewport, m4inverted);
    planeYPosition = orbit_camera_unproject(ty, viewport, m4inverted);
    /* dcol comes from the (0,1,0) point and drow from the (1,0,0) point, both relative to rda */
    dcol = planeYPosition - rda;
    drow = rdb - rda;

    /* without threads a single area (i == 0) spans the whole height */
    int i=0, bh = height;
#ifdef ENABLE_THREADS
    /* with threads each area is a band of height/TOTAL_THREADS rows */
    bh = (height/TOTAL_THREADS);
    for (i; i<TOTAL_THREADS; i++) {
#endif
      areas[i].dcol = dcol;
      areas[i].drow = drow;
      areas[i].pos = planeYPosition;
      areas[i].ro = ro;
      areas[i].x = 0;
      areas[i].y = i*bh;
      areas[i].width = width;
      /* the stored height is the band's end row (y + bh), not the band size */
      areas[i].height = areas[i].y + (int)(bh);
      areas[i].screen_height = (int)(height);
      areas[i].stride = stride;
      areas[i].data = data;
      areas[i].render_id = i;
      areas[i].brick = my_first_brick;
#ifdef ENABLE_THREADS
      thpool_add_work(thpool, (void *)render_screen_area, (void *)(&areas[i]));
    }

    /* wait until all bands of this frame are rendered */
    thpool_wait(thpool);
#else
    render_screen_area((void *)(&areas[i]));
#endif

#ifdef RENDER
    /* upload the pixel buffer as an RGB texture and draw it on a full-screen quad */
    glViewport(0, 0, width, height);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);

    glDisable(GL_CULL_FACE);
    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();
    glMatrixMode(GL_TEXTURE);
    glLoadIdentity();
    /* the texture matrix flips the image vertically */
    glScalef(1.0f, -1.0f, 1.0f);

    glEnable(GL_TEXTURE_2D);

    glBindTexture(GL_TEXTURE_2D, texture[0]);
    glTexImage2D(GL_TEXTURE_2D, 0, 3, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR);
    glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);

    glBegin(GL_QUADS);
      glTexCoord2f(0.0f, 0.0f); glVertex2f( -1, -1);
      glTexCoord2f(1.0f, 0.0f); glVertex2f( 1, -1);
      glTexCoord2f(1.0f, 1.0f); glVertex2f( 1, 1);
      glTexCoord2f(0.0f, 1.0f); glVertex2f( -1, 1);
    glEnd();

    glfwSwapBuffers(window);

    /* NOTE(review): the texture name is deleted every frame although
       glGenTextures is only called once before the loop -- confirm intended */
    glDeleteTextures(1, &texture[0]);
#endif

    glfwPollEvents();
  }

  glfwDestroyWindow(window);
  glfwTerminate();
  exit(EXIT_SUCCESS);
}
/*
 * Program entry point.
 *
 * Parses the options, prints the welcome banner, loads the loom from a
 * checkpoint or boots a fresh one, boots the kernel when nothing was
 * loaded, installs the signal/stack-overflow handling and finally runs
 * either an admin task or the main loop.
 *
 * Returns 1 when option parsing fails and 0 after the admin task or the
 * loop returns; several failure paths leave through exit(1) instead.
 */
c3_i
main(c3_i   argc,
     c3_c** argv)
{
  // set both logging systems to unit-ed
  //
  u2K->inited_t = c3_false;

  c3_w kno_w;

  // Flush-to-zero and denormals-are-zero for this thread's SSE arithmetic.
  //
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

  // Parse options.
  //
  if ( u2_no == _main_getopt(argc, argv) ) {
    u2_ve_usage(argc, argv);
    return 1;
  }

  u2_ve_sysopt();

  printf("~\n");
  printf("welcome.\n");
  printf("vere: urbit home is %s\n", u2_Host.cpu_c);
  printf("vere: hostname is %s\n", u2_Host.ops_u.nam_c);

  // Only a message is printed here; nothing else happens in this branch.
  //
  if ( u2_yes == u2_Host.ops_u.dem && u2_no == u2_Host.ops_u.bat ) {
    printf("Starting daemon\n");
  }

  // Seed prng. Don't panic -- just for fuzz testing and election timeouts.
  //
  srand(getpid());

  // Instantiate process globals.
  {
    u2_wr_check_init(u2_Host.cpu_c);
    u2_Host.xit_i = 0;

    // Reuse a saved loom when the "nuu" option is off and loading succeeds;
    // otherwise boot a fresh loom.
    //
    if ( (u2_no == u2_Host.ops_u.nuu) &&
         (u2_yes == u2_loom_load()) )
    {
      u2_Host.wir_r = u2_ray_of(0, 0);
      u2_Wire = u2_Host.wir_r;

      u2_Host.arv_u = u2_Arv;

      u2_Arv->ova.egg_u = u2_Arv->ova.geg_u = 0;

      u2_lo_grab("init", u2_none);

      // Horrible ancient stuff.
      //
      kno_w = u2_Host.arv_u->kno_w;
      u2_Host.kno_w = kno_w;

      u2_ho_push();
    }
    else {
      u2_loom_boot();
      u2_Host.wir_r = u2_wr_init(c3__rock, u2_ray_of(0, 0), u2_ray_of(1, 0));
      u2_Wire = u2_Host.wir_r;

      u2_Host.arv_u = u2_Arv;
    }
  }

  // If we have not loaded from checkpoint, build kernel.
  //
  // A non-zero ent_d takes the first branch, which only refreshes and
  // prints the time; the kernel boot is the else branch.
  //
  if ( 0 != u2_Host.arv_u->ent_d ) {
    u2_reck_time(u2_Host.arv_u);
    u2_reck_numb(u2_Host.arv_u);
    {
      c3_c* dyt_c = u2_cr_string(u2_Host.arv_u->wen);

      printf("time: %s\n", dyt_c);
      free(dyt_c);
    }
  }
  else {
    // Set outside bail trap. Should not be used, but you never know...
    //
    if ( 0 != u2_cm_trap() ) {
      u2_ve_panic(argc, argv);
    }
    else {
      // Set boot and goal stages.
      {
        if ( (0 == u2_Host.ops_u.kno_w) || (u2_Host.ops_u.kno_w > 255) ) {
          kno_w = DefaultKernel;
        } else {
          kno_w = u2_Host.ops_u.kno_w;
        }
      }

      // Load the system.
      //
      // NOTE(review): kno_w computed just above is not used here; the raw
      // option value is stored instead -- confirm this is intended.
      //
      {
        u2_Host.kno_w = u2_Host.ops_u.kno_w;

        u2_reck_boot(u2_Host.arv_u);
      }
      u2_cm_done();
    }
  }

  // Install signal handlers and set buffers.
  //
  // Note that we use the sigmask-restoring variant. Essentially, when
  // we get a signal, we force the system back into the just-booted state.
  // If anything goes wrong during boot (above), it's curtains.
  {
    // A non-zero sigsetjmp result means we came back here via a jump;
    // report the cause, clean up and exit.
    //
    if ( 0 != sigsetjmp(Signal_buf, 1) ) {
      switch ( Sigcause ) {
        case sig_overflow: printf("[stack overflow]\r\n"); break;
        case sig_interrupt: printf("[interrupt]\r\n"); break;
        default: printf("[signal error!]\r\n"); break;
      }
      Sigcause = sig_none;

      signal(SIGINT, SIG_DFL);
      stackoverflow_deinstall_handler();

      // Print the trace, do a GC, etc.
      //
      // This is half-assed at present, so we exit.
      //
      u2_lo_sway(0, u2k(u2_wire_tax(u2_Wire)));

      u2_lo_bail(u2_Host.arv_u);

      exit(1);
    }
#if 1
    if ( -1 == stackoverflow_install_handler
        (overflow_handler, Sigstk, SIGSTKSZ) )
    {
      fprintf(stderr, "overflow_handler: install failed\n");
      exit(1);
    }
    signal(SIGINT, interrupt_handler);
    signal(SIGIO, SIG_IGN);
#endif
  }

  u2_lo_grab("main", u2_none);

  // booted in admin mode: do a task, then exit
  // booted in user mode: do command loop
  //
  // Admin commands are matched by name; an unknown name is an error.

  if (u2_Host.ops_u.adm_c != 0) {
    if (strcmp(u2_Host.ops_u.adm_c, "edmp") ==0) { u2_egz_admin_dump_egz(); }
    else if (strcmp(u2_Host.ops_u.adm_c, "etok") ==0) { u2_kafka_admin_egz_to_kafka(); }
    else if (strcmp(u2_Host.ops_u.adm_c, "ktoe") ==0) { u2_kafka_admin_kafka_to_egz(); }
    else if (strcmp(u2_Host.ops_u.adm_c, "kcnf") ==0) { u2_lo_loop(); } // do it in the app
    else { fprintf(stderr, "unsupported admin mode command %s\n", u2_Host.ops_u.adm_c); exit(1); }
  } else {
    u2_lo_loop();
  }

  return 0;
}
/*
 * Enables both denormal-avoidance flags of the calling thread's MXCSR
 * register: denormals-are-zero (DAZ) and flush-to-zero (FTZ).
 */
inline void AVOIDDENORMALS()
{
    // The two flags are independent bits, so the order of the writes does
    // not affect the resulting register state.
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
}
/*
 * Run iterative multiplicative updates on W and H (the "matlab algorithm"
 * quoted in the loop) so that W*H approaches X.
 *
 *   W, H      factor matrices, updated in place
 *   X         matrix that W*H is compared against
 *   thresh    the loop stops once the relative change of the divergence
 *             between two checks drops below this value
 *   max_iter  maximum number of update iterations
 *   t         array of TIMERS timing accumulators, or NULL; with NULL an
 *             internal zero-initialized scratch array is used, so the
 *             timings are only visible through the verbose printout
 *   verbose   non-zero prints progress and the timer table
 *
 * Convergence is checked every ITER_CHECK iterations (never at i == 0).
 */
void update_div(matrix W, matrix H, matrix X, const float thresh, const int max_iter, double *t,int verbose){
	//initialize temp matrices -----------------------

	//matrix to hold W*H
	matrix WH;
	create_matrix(&WH, W.dim[0], H.dim[1], 0.0);

	//matrix to hold X./(W*H+EPS)
	matrix Z;
	create_matrix(&Z, X.dim[0], X.dim[1], 0.0);

	//matrix to hold W'*Z
	matrix WtZ;
	create_matrix(&WtZ, W.dim[1], Z.dim[1], 0.0);

	//matrix to hold Z*H'
	matrix ZHt;
	create_matrix(&ZHt, Z.dim[0], H.dim[0], 0.0);

	//matrix to hold sum(W) [sum cols of W]
	matrix sumW;
	create_matrix(&sumW, 1, W.dim[1] ,0.0);

	//matrix to hold sum(H,2) [sum rows of H]
	matrix sumH2;
	create_matrix(&sumH2, H.dim[0], 1, 0.0);

	int i;

	//fallback timer storage for callers that pass t == NULL. It is declared
	//at function scope on purpose: t keeps pointing at it until the final
	//timer printout, so it must not go out of scope before the function ends
	double t_array[TIMERS];
	if(t==NULL){
		t = t_array;
		for(i=0;i<TIMERS;i++)
			t[i] = 0;
	}

	//turn on the FTZ(15) and DAZ(6) bits in the floating point control register
	//FTZ = flush-to-zero, DAZ = denormal-as-zero
	//without these, sgemms slow down significantly as values approach zero
	_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
	_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);

	// the following does the same thing (by Waterman)
	/*
	unsigned int mxcsr;
	__asm__ __volatile__ ("stmxcsr (%0)" : : "r"(&mxcsr) : "memory");
	//mxcsr = (mxcsr | (1<<15) | (1<<6)) & ~((1<<11) | (1<<8));
	mxcsr = (mxcsr | (1<<15) | (1<<6));
	__asm__ __volatile__ ("ldmxcsr (%0)" : : "r"(&mxcsr));
	*/

	float diff,div,prev_div,change;

	//initial error and divergence, before any update
	matrix_multiply(W,H,WH,mkl_threads);
	diff = matrix_difference_norm(X,WH, check_threads);
	prev_div = matrix_div(X,WH,check_threads);
	div = prev_div;
	if(verbose) {
		printf("OpenMP threads: %i\n",omp_threads);
		printf("i: %4i, error: %6.4f, div: %8.4e\n",0,diff,prev_div);
	}

	//t[0] accumulates the time spent in the whole update loop
	t[0] -= get_time();

	for(i=0;i<max_iter;i++){

		//check for convergence, print status
		if(i % ITER_CHECK == 0 && i != 0){
			double tt = get_time();
			matrix_multiply(W,H,WH,mkl_threads);
			diff = matrix_difference_norm(X,WH,check_threads);
			prev_div = div;
			div = matrix_div(X,WH,check_threads);
			change = (prev_div-div)/prev_div;
			if(verbose)
				printf("i: %4i, error: %6.4f, div: %8.4e, change: %8.5f\n",
						i,diff,div,change);
			if(change < thresh){
				printf("converged\n");
				break;
			}
			//t[9] accumulates the time spent in these checks
			tt = get_time()-tt;
			t[9] += tt;
		}

		/* matlab algorithm
		   Z = X./(W*H+eps); H = H.*(W'*Z)./(repmat(sum(W)',1,F));
		   Z = X./(W*H+eps);
		   W = W.*(Z*H')./(repmat(sum(H,2)',N,1));
		   */

		//
		// UPDATE H -----------------------------
		//

		//WH = W*H
		t[1] -= get_time();
		t[10] -= get_time();
		//matrix_eps(W,eps_threads);
		//matrix_eps(H,eps_threads);
		matrix_multiply(W,H,WH,mkl_threads);
		t[1] += get_time();
		t[10] += get_time();

		//WH = WH+EPS
		t[2] -= get_time();
		matrix_eps(WH,eps_threads);
		t[2] += get_time();

		//Z = X./WH
		t[3] -= get_time();
		element_divide(X,WH,Z,vecdiv_threads);
		t[3] += get_time();

		//sum cols of W into row vector
		t[6] -= get_time();
		sum_cols(W,sumW,sumcols_threads);
		t[6] += get_time();

		//convert sumW to col vector
		sumW.dim[0] = sumW.dim[1];
		sumW.dim[1] = 1;

		//WtZ = W'*Z
		t[1] -= get_time();
		t[11] -= get_time();
		matrix_multiply_AtB(W,Z,WtZ,mkl_threads);
		t[1] += get_time();
		t[11] += get_time();

		//WtZ = WtZ./(repmat(sum(W)',1,H.dim[1])
		//[element divide cols of WtZ by sumW']
		t[7] -= get_time();
		col_divide(WtZ,sumW,WtZ,coldiv_threads);
		t[7] += get_time();

		//H = H.*WtZ
		t[4] -= get_time();
		element_multiply(H,WtZ,H,vecmult_threads);
		t[4] += get_time();

		//
		// UPDATE W ---------------------------
		//

		//WH = W*H
		t[1] -= get_time();
		t[12] -= get_time();
		matrix_multiply(W,H,WH,mkl_threads);
		t[1] += get_time();
		t[12] += get_time();

		//WH = WH+EPS
		t[2] -= get_time();
		matrix_eps(WH,eps_threads);
		t[2] += get_time();

		//Z = X./WH
		t[3] -= get_time();
		element_divide(X,WH,Z,vecdiv_threads);
		t[3] += get_time();

		//sum rows of H into col vector
		t[5] -= get_time();
		sum_rows(H,sumH2,sumrows_threads);
		t[5] += get_time();

		//convert sumH2 to row vector
		sumH2.dim[1] = sumH2.dim[0];
		sumH2.dim[0] = 1;

		//ZHt = Z*H'
		t[1] -= get_time();
		t[13] -= get_time();
		matrix_multiply_ABt(Z,H,ZHt,mkl_threads);
		t[1] += get_time();
		t[13] += get_time();

		//ZHt = ZHt./(repmat(sum(H,2)',W.dim[0],1)
		//[element divide rows of ZHt by sumH2']
		t[8] -= get_time();
		row_divide(ZHt,sumH2,ZHt,rowdiv_threads);
		t[8] += get_time();

		//W = W.*ZHt
		t[4] -= get_time();
		element_multiply(W,ZHt,W,vecmult_threads);
		t[4] += get_time();

		// ------------------------------------

		//reset sumW to row vector
		sumW.dim[1] = sumW.dim[0];
		sumW.dim[0] = 1;
		//reset sumH2 to col vector
		sumH2.dim[0] = sumH2.dim[1];
		sumH2.dim[1] = 1;

		// ---------------------------------------
	}

	t[0] += get_time();

	//final error and divergence after the last update
	matrix_multiply(W,H,WH,mkl_threads);
	diff = matrix_difference_norm(X,WH,check_threads);
	prev_div = div;
	div = matrix_div(X,WH,check_threads);
	change = (prev_div-div)/prev_div;
	if(verbose){
		printf("i: %4i, error: %6.4f, div: %8.4e, change: %8.5f\n",
				i,diff,div,change);

		//timer table: absolute value and share of the loop time t[0]
		printf("\n");
		for(i=0;i<TIMERS;i++)
			printf("t[%i]: %8.3f (%6.2f %%) %s\n",i,t[i],t[i]/t[0]*100,tname[i]);
	}

	//free temporary matrices
	destroy_matrix(&WH);
	destroy_matrix(&Z);
	destroy_matrix(&WtZ);
	destroy_matrix(&ZHt);
	destroy_matrix(&sumW);
	destroy_matrix(&sumH2);
}
EXTERN_C_ENTER

/*
 * JNI entry point that forwards `mode` unchanged to
 * _MM_SET_DENORMALS_ZERO_MODE. The JNI environment and class arguments
 * are not used.
 */
JNIEXPORT void JNICALL
Java_org_lwjgl_util_simd_SSE3__1MM_1SET_1DENORMALS_1ZERO_1MODE(
    JNIEnv *__env,
    jclass clazz,
    jint mode
) {
    UNUSED_PARAMS(__env, clazz)

    /* apply the requested denormals-are-zero mode */
    _MM_SET_DENORMALS_ZERO_MODE(mode);
}