extern "C" int test_shade (int argc, const char *argv[]) { OIIO::Timer timer; // Create a new shading system. We pass it the RendererServices // object that services callbacks from the shading system, NULL for // the TextureSystem (that just makes 'create' make its own TS), and // an error handler. shadingsys = ShadingSystem::create (&rend, NULL, &errhandler); // Remember that each shader parameter may optionally have a // metadata hint [[int lockgeom=...]], where 0 indicates that the // parameter may be overridden by the geometry itself, for example // with data interpolated from the mesh vertices, and a value of 1 // means that it is "locked" with respect to the geometry (i.e. it // will not be overridden with interpolated or // per-geometric-primitive data). // // In order to most fully optimize shader, we typically want any // shader parameter not explicitly specified to default to being // locked (i.e. no per-geometry override): shadingsys->attribute("lockgeom", 1); // Now we declare our shader. // // Each material in the scene is comprised of a "shader group." // Each group is comprised of one or more "layers" (a.k.a. shader // instances) with possible connections from outputs of // upstream/early layers into the inputs of downstream/later layers. // A shader instance is the combination of a reference to a shader // master and its parameter values that may override the defaults in // the shader source and may be particular to this instance (versus // all the other instances of the same shader). // // A shader group declaration typically looks like this: // // ss->ShaderGroupBegin (); // ss->Parameter ("paramname", TypeDesc paramtype, void *value); // ... and so on for all the other parameters of... // ss->Shader ("shadertype", "shadername", "layername"); // The Shader() call creates a new instance, which gets // all the pending Parameter() values made right before it. // ... and other shader instances in this group, interspersed with... 
// ss->ConnectShaders ("layer1", "param1", "layer2", "param2"); // ... and other connections ... // ss->ShaderGroupEnd (); // // and now grab an opaque reference to that shader group: // ShadingAttribStateRef shaderstate = s->state (); // // It looks so simple, and it really is, except that the way this // testshade program works is that all the Parameter() and Shader() // calls are done inside getargs(), as it walks through the command // line arguments, whereas the connections accumulate and have // to be processed at the end. Bear with us. // Start the shader group. shadingsys->ShaderGroupBegin (); // Get the command line arguments. That will set up all the shader // instances and their parameters for the group. getargs (argc, argv); // Now set up the connections for (size_t i = 0; i < connections.size(); i += 4) { if (i+3 < connections.size()) { std::cout << "Connect " << connections[i] << "." << connections[i+1] << " to " << connections[i+2] << "." << connections[i+3] << "\n"; shadingsys->ConnectShaders (connections[i].c_str(), connections[i+1].c_str(), connections[i+2].c_str(), connections[i+3].c_str()); } } // End the group shadingsys->ShaderGroupEnd (); // Now we should have a valid shading state, to get a reference to it. ShadingAttribStateRef shaderstate = shadingsys->state (); if (outputfiles.size() != 0) std::cout << "\n"; // Set up the named transformations, including shader and object. // For this test application, we just do this statically; in a real // renderer, the global named space (like "myspace") would probably // be static, but shader and object spaces may be different for each // object. setup_transformations (rend, Mshad, Mobj); // Set up the image outputs requested on the command line setup_output_images (shadingsys, shaderstate); // Set up shader globals and a little test grid of points to shade. 
ShaderGlobals shaderglobals; double setuptime = timer.lap (); std::vector<float> pixel; // Optional: high-performance apps may request this thread-specific // pointer in order to save a bit of time on each shade. Just like // the name implies, a multithreaded renderer would need to do this // separately for each thread, and be careful to always use the same // thread_info each time for that thread. // // There's nothing wrong with a simpler app just passing NULL for // the thread_info; in such a case, the ShadingSystem will do the // necessary calls to find the thread-specific pointer itself, but // this will degrade performance just a bit. OSL::PerThreadInfo *thread_info = shadingsys->create_thread_info(); // Request a shading context so that we can execute the shader. // We could get_context/release_constext for each shading point, // but to save overhead, it's more efficient to reuse a context // within a thread. ShadingContext *ctx = shadingsys->get_context (thread_info); // Allow a settable number of iterations to "render" the whole image, // which is useful for time trials of things that would be too quick // to accurately time for a single iteration for (int iter = 0; iter < iters; ++iter) { // Loop over all pixels in the image (in x and y)... for (int y = 0, n = 0; y < yres; ++y) { for (int x = 0; x < xres; ++x, ++n) { // In a real renderer, this is where you would figure // out what object point is visible in this pixel (or // this sample, for antialiasing). Once determined, // you'd set up a ShaderGlobals that contained the vital // information about that point, such as its location, // the normal there, the u and v coordinates on the // surface, the transformation of that object, and so // on. 
// // This test app is not a real renderer, so we just // set it up rigged to look like we're rendering a single // quadrilateral that exactly fills the viewport, and that // setup is done in the following function call: setup_shaderglobals (shaderglobals, shadingsys, x, y); // Actually run the shader for this point shadingsys->execute (*ctx, *shaderstate, shaderglobals); // Save all the designated outputs. But only do so if we // are on the last iteration requested, so that if we are // doing a bunch of iterations for time trials, we only // including the output pixel copying once in the timing. if (iter == (iters - 1)) { save_outputs (shadingsys, ctx, x, y); } } } } // We're done shading with this context. shadingsys->release_context (ctx); // Now that we're done rendering, release the thread=specific // pointer we saved. A simple app could skip this; but if the app // asks for it (as we have in this example), then it should also // destroy it when done with it. shadingsys->destroy_thread_info(thread_info); if (outputfiles.size() == 0) std::cout << "\n"; // Write the output images to disk for (size_t i = 0; i < outputimgs.size(); ++i) { if (outputimgs[i]) { outputimgs[i]->save(); delete outputimgs[i]; outputimgs[i] = NULL; } } // Print some debugging info if (debug || stats) { double runtime = timer(); std::cout << "\n"; std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n"; std::cout << "Run : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n"; std::cout << "\n"; std::cout << shadingsys->getstats (5) << "\n"; } // We're done with the shading system now, destroy it ShadingSystem::destroy (shadingsys); return EXIT_SUCCESS; }
void BackendLLVM::run () { // At this point, we already hold the lock for this group, by virtue // of ShadingSystemImpl::optimize_group. OIIO::Timer timer; std::string err; { #ifdef OSL_LLVM_NO_BITCODE // I don't know which exact part has thread safety issues, but it // crashes on windows when we don't lock. // FIXME -- try subsequent LLVM releases on Windows to see if this // is a problem that is eventually fixed on the LLVM side. static spin_mutex mutex; OIIO::spin_lock lock (mutex); #endif #ifdef OSL_LLVM_NO_BITCODE ll.module (ll.new_module ("llvm_ops")); #else ll.module (ll.module_from_bitcode (osl_llvm_compiled_ops_block, osl_llvm_compiled_ops_size, "llvm_ops", &err)); if (err.length()) shadingcontext()->error ("ParseBitcodeFile returned '%s'\n", err.c_str()); ASSERT (ll.module()); #endif // Create the ExecutionEngine if (! ll.make_jit_execengine (&err)) { shadingcontext()->error ("Failed to create engine: %s\n", err.c_str()); ASSERT (0); return; } // End of mutex lock, for the OSL_LLVM_NO_BITCODE case } m_stat_llvm_setup_time += timer.lap(); // Set up m_num_used_layers to be the number of layers that are // actually used, and m_layer_remap[] to map original layer numbers // to the shorter list of actually-called layers. int nlayers = group().nlayers(); m_layer_remap.resize (nlayers); m_num_used_layers = 0; for (int layer = 0; layer < group().nlayers(); ++layer) { bool lastlayer = (layer == (nlayers-1)); if (! group()[layer]->unused() || lastlayer) m_layer_remap[layer] = m_num_used_layers++; else m_layer_remap[layer] = -1; } shadingsys().m_stat_empty_instances += group().nlayers()-m_num_used_layers; initialize_llvm_group (); // Generate the LLVM IR for each layer. Skip unused layers. 
m_llvm_local_mem = 0; llvm::Function** funcs = (llvm::Function**)alloca(m_num_used_layers * sizeof(llvm::Function*)); for (int layer = 0; layer < nlayers; ++layer) { set_inst (layer); bool lastlayer = (layer == (nlayers-1)); int index = m_layer_remap[layer]; if (index != -1) funcs[index] = build_llvm_instance (lastlayer); } llvm::Function* entry_func = funcs[m_num_used_layers-1]; m_stat_llvm_irgen_time += timer.lap(); if (shadingsys().m_max_local_mem_KB && m_llvm_local_mem/1024 > shadingsys().m_max_local_mem_KB) { shadingcontext()->error ("Shader group \"%s\" needs too much local storage: %d KB", group().name(), m_llvm_local_mem/1024); } // Optimize the LLVM IR unless it's just a ret void group (1 layer, // 1 BB, 1 inst == retvoid) bool skip_optimization = m_num_used_layers == 1 && ll.func_is_empty(entry_func); // Label the group as being retvoid or not. group().does_nothing(skip_optimization); if (skip_optimization) { shadingsys().m_stat_empty_groups += 1; shadingsys().m_stat_empty_instances += 1; // the one layer is empty } else { ll.do_optimize(); } m_stat_llvm_opt_time += timer.lap(); if (llvm_debug()) { std::cout << "func after opt = " << ll.bitcode_string (entry_func) << "\n"; std::cout.flush(); } // Debug code to dump the resulting bitcode to a file if (llvm_debug() >= 2) { std::string name = Strutil::format ("%s_%d.bc", inst()->layername(), inst()->id()); ll.write_bitcode_file (name.c_str()); } // Force the JIT to happen now and retrieve the JITed function group().llvm_compiled_version ((RunLLVMGroupFunc) ll.getPointerToFunction(entry_func)); // Remove the IR for the group layer functions, we've already JITed it // and will never need the IR again. This saves memory, and also saves // a huge amount of time since we won't re-optimize it again and again // if we keep adding new shader groups to the same Module. for (int i = 0; i < m_num_used_layers; ++i) { ll.delete_func_body (funcs[i]); } // Free the exec and module to reclaim all the memory. 
This definitely // saves memory, and has almost no effect on runtime. ll.execengine (NULL); // N.B. Destroying the EE should have destroyed the module as well. ll.module (NULL); m_stat_llvm_jit_time += timer.lap(); m_stat_total_llvm_time = timer(); if (shadingsys().m_compile_report) { shadingcontext()->info ("JITed shader group %s:", group().name()); shadingcontext()->info (" (%1.2fs = %1.2f setup, %1.2f ir, %1.2f opt, %1.2f jit; local mem %dKB)", m_stat_total_llvm_time, m_stat_llvm_setup_time, m_stat_llvm_irgen_time, m_stat_llvm_opt_time, m_stat_llvm_jit_time, m_llvm_local_mem/1024); } }
/// Entry point of the testshade program.
///
/// Creates the shading system, assembles one shader group from the
/// command line, optionally prints or archives a description of the
/// group, shades the image region `iters` times (applying any requested
/// re-parameterization after each pass), writes the output images and
/// optionally prints timing and statistics.
///
/// @param argc  Number of entries in `argv`.
/// @param argv  Command line arguments, handed to getargs().
/// @return      EXIT_FAILURE if no valid shader group could be started,
///              otherwise EXIT_SUCCESS.
extern "C" OSL_DLL_EXPORT int
test_shade (int argc, const char *argv[])
{
    OIIO::Timer timer;

    // Create a new shading system.  We pass it the RendererServices
    // object that services callbacks from the shading system, NULL for
    // the TextureSystem (that just makes 'create' make its own TS), and
    // an error handler.
    shadingsys = new ShadingSystem (&rend, NULL, &errhandler);

    // Register the layout of all closures known to this renderer.
    // Any closure used by the shader which is not registered, or
    // registered with a different number of arguments, will lead
    // to a runtime error.
    register_closures(shadingsys);

    // Remember that each shader parameter may optionally have a
    // metadata hint [[int lockgeom=...]], where 0 indicates that the
    // parameter may be overridden by the geometry itself, for example
    // with data interpolated from the mesh vertices, and a value of 1
    // means that it is "locked" with respect to the geometry (i.e. it
    // will not be overridden with interpolated or
    // per-geometric-primitive data).
    //
    // In order to most fully optimize the shader, we typically want any
    // shader parameter not explicitly specified to default to being
    // locked (i.e. no per-geometry override):
    shadingsys->attribute("lockgeom", 1);

    // Now we declare our shader.
    //
    // Each material in the scene is comprised of a "shader group."
    // Each group is comprised of one or more "layers" (a.k.a. shader
    // instances) with possible connections from outputs of
    // upstream/early layers into the inputs of downstream/later layers.
    // A shader instance is the combination of a reference to a shader
    // master and its parameter values that may override the defaults in
    // the shader source and may be particular to this instance (versus
    // all the other instances of the same shader).
    //
    // A shader group declaration typically looks like this:
    //
    //   ShaderGroupRef shadergroup = ss->ShaderGroupBegin ();
    //   ss->Parameter ("paramname", TypeDesc paramtype, void *value);
    //      ... and so on for all the other parameters of...
    //   ss->Shader ("shadertype", "shadername", "layername");
    //      The Shader() call creates a new instance, which gets
    //      all the pending Parameter() values made right before it.
    //   ... and other shader instances in this group, interspersed with...
    //   ss->ConnectShaders ("layer1", "param1", "layer2", "param2");
    //   ... and other connections ...
    //   ss->ShaderGroupEnd ();
    //
    // It looks so simple, and it really is, except that the way this
    // testshade program works is that all the Parameter() and Shader()
    // calls are done inside getargs(), as it walks through the command
    // line arguments, whereas the connections accumulate and have
    // to be processed at the end.  Bear with us.

    // Start the shader group and grab a reference to it.
    shadergroup = shadingsys->ShaderGroupBegin (groupname);

    // Get the command line arguments.  That will set up all the shader
    // instances and their parameters for the group.
    getargs (argc, argv);

    if (! shadergroup) {
        std::cerr << "ERROR: Invalid shader group. Exiting testshade.\n";
        return EXIT_FAILURE;
    }

    shadingsys->attribute (shadergroup.get(), "groupname", groupname);

    // Now set up the connections.  `connections` is a flat list read four
    // strings at a time; an incomplete trailing group is ignored.
    for (size_t c = 0; c + 3 < connections.size(); c += 4) {
        const std::string &src_layer = connections[c];
        const std::string &src_param = connections[c+1];
        const std::string &dst_layer = connections[c+2];
        const std::string &dst_param = connections[c+3];
        std::cout << "Connect " << src_layer << "." << src_param
                  << " to " << dst_layer << "." << dst_param << "\n";
        shadingsys->ConnectShaders (src_layer.c_str(), src_param.c_str(),
                                    dst_layer.c_str(), dst_param.c_str());
    }

    // End the group
    shadingsys->ShaderGroupEnd ();

    if (verbose || do_oslquery) {
        // Print the serialized form of the group...
        std::string serialized;
        shadingsys->getattribute (shadergroup.get(), "pickle", serialized);
        std::cout << "Shader group:\n---\n" << serialized << "\n---\n";
        std::cout << "\n";

        // ...then its name and the list of its layers.
        ustring reported_name;
        shadingsys->getattribute (shadergroup.get(), "groupname", reported_name);
        std::cout << "Shader group \"" << reported_name << "\" layers are:\n";

        int layer_count = 0;
        shadingsys->getattribute (shadergroup.get(), "num_layers", layer_count);
        if (layer_count > 0) {
            std::vector<const char *> layer_names (size_t(layer_count), NULL);
            shadingsys->getattribute (shadergroup.get(), "layer_names",
                                      TypeDesc(TypeDesc::STRING, layer_count),
                                      &layer_names[0]);
            for (int layer = 0; layer < layer_count; ++layer) {
                const char *shown = layer_names[layer] ? layer_names[layer]
                                                       : "<unnamed>";
                std::cout << " " << shown << "\n";
                if (! do_oslquery)
                    continue;
                // List every parameter of this layer.
                OSLQuery query;
                query.init (shadergroup.get(), layer);
                for (size_t pi = 0; pi < query.nparams(); ++pi) {
                    const OSLQuery::Parameter *param = query.getparam(pi);
                    std::cout << "\t" << (param->isoutput ? "output " : "")
                              << param->type << ' ' << param->name << "\n";
                }
            }
        }
        std::cout << "\n";
    }

    if (archivegroup.size())
        shadingsys->archive_shadergroup (shadergroup.get(), archivegroup);

    if (outputfiles.size() != 0)
        std::cout << "\n";

    // Set up the named transformations, including shader and object.
    // For this test application, we just do this statically; in a real
    // renderer, the global named space (like "myspace") would probably
    // be static, but shader and object spaces may be different for each
    // object.
    setup_transformations (rend, Mshad, Mobj);

    // Set up the image outputs requested on the command line
    setup_output_images (shadingsys, shadergroup);

    if (debug)
        test_group_attributes (shadergroup.get());

    if (num_threads < 1)
        num_threads = boost::thread::hardware_concurrency();

    double setuptime = timer.lap ();

    // Allow a settable number of iterations to "render" the whole image,
    // which is useful for time trials of things that would be too quick
    // to accurately time for a single iteration
    for (int pass = 0; pass < iters; ++pass) {
        OIIO::ROI roi (0, xres, 0, yres);

        if (use_shade_image) {
            OSL::shade_image (*shadingsys, *shadergroup, NULL,
                              *outputimgs[0], outputvarnames,
                              pixelcenters ? ShadePixelCenters : ShadePixelGrid,
                              roi, num_threads);
        } else {
            // Only the final pass saves its results.
            bool save = (pass == iters - 1);
            // Shade the region in parallel.  (A direct, non-threaded call
            // would be: shade_region (shadergroup.get(), roi, save);)
            OIIO::ImageBufAlgo::parallel_image (
                boost::bind (shade_region, shadergroup.get(), _1, save),
                roi, num_threads);
        }

        // If any reparam was requested, do it now
        if (reparams.size() && reparam_layer.size()) {
            for (size_t r = 0; r < reparams.size(); ++r) {
                const ParamValue &pv (reparams[r]);
                shadingsys->ReParameter (*shadergroup, reparam_layer.c_str(),
                                         pv.name().c_str(), pv.type(),
                                         pv.data());
            }
        }
    }
    double runtime = timer.lap();

    if (outputfiles.size() == 0)
        std::cout << "\n";

    // Write the output images to disk (unless the outputs were only to be
    // printed), freeing each image afterwards.
    for (size_t img = 0; img < outputimgs.size(); ++img) {
        if (! outputimgs[img])
            continue;
        if (! print_outputs) {
            std::string filename = outputimgs[img]->name();
            // JPEG, GIF, and PNG images should be automatically saved
            // as sRGB because they are almost certainly supposed to
            // be displayed on web pages.
            const bool wants_srgb =
                   OIIO::Strutil::iends_with (filename, ".jpg")
                || OIIO::Strutil::iends_with (filename, ".jpeg")
                || OIIO::Strutil::iends_with (filename, ".gif")
                || OIIO::Strutil::iends_with (filename, ".png");
            if (wants_srgb) {
                OIIO::ImageBuf ccbuf;
                OIIO::ImageBufAlgo::colorconvert (ccbuf, *outputimgs[img],
                                                  "linear", "sRGB", false,
                                                  "", "");
                ccbuf.set_write_format (outputimgs[img]->spec().format);
                ccbuf.write (filename);
            } else {
                outputimgs[img]->write (filename);
            }
        }
        delete outputimgs[img];
        outputimgs[img] = NULL;
    }

    // Print some debugging info
    if (debug || runstats || profile) {
        double writetime = timer.lap();
        std::cout << "\n";
        std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n";
        std::cout << "Run : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n";
        std::cout << "Write: " << OIIO::Strutil::timeintervalformat (writetime,2) << "\n";
        std::cout << "\n";
        std::cout << shadingsys->getstats (5) << "\n";
        OIIO::TextureSystem *texturesys = shadingsys->texturesys();
        if (texturesys)
            std::cout << texturesys->getstats (5) << "\n";
        std::cout << ustring::getstats() << "\n";
    }

    // We're done with the shading system now, destroy it
    shadergroup.reset ();  // Must release this before destroying shadingsys
    delete shadingsys;

    return EXIT_SUCCESS;
}
extern "C" int test_shade (int argc, const char *argv[]) { OIIO::Timer timer; // Create a new shading system. We pass it the RendererServices // object that services callbacks from the shading system, NULL for // the TextureSystem (that just makes 'create' make its own TS), and // an error handler. shadingsys = new ShadingSystem (&rend, NULL, &errhandler); register_closures(shadingsys); // Remember that each shader parameter may optionally have a // metadata hint [[int lockgeom=...]], where 0 indicates that the // parameter may be overridden by the geometry itself, for example // with data interpolated from the mesh vertices, and a value of 1 // means that it is "locked" with respect to the geometry (i.e. it // will not be overridden with interpolated or // per-geometric-primitive data). // // In order to most fully optimize shader, we typically want any // shader parameter not explicitly specified to default to being // locked (i.e. no per-geometry override): shadingsys->attribute("lockgeom", 1); // Now we declare our shader. // // Each material in the scene is comprised of a "shader group." // Each group is comprised of one or more "layers" (a.k.a. shader // instances) with possible connections from outputs of // upstream/early layers into the inputs of downstream/later layers. // A shader instance is the combination of a reference to a shader // master and its parameter values that may override the defaults in // the shader source and may be particular to this instance (versus // all the other instances of the same shader). // // A shader group declaration typically looks like this: // // ShaderGroupRef shadergroup = ss->ShaderGroupBegin (); // ss->Parameter ("paramname", TypeDesc paramtype, void *value); // ... and so on for all the other parameters of... // ss->Shader ("shadertype", "shadername", "layername"); // The Shader() call creates a new instance, which gets // all the pending Parameter() values made right before it. // ... 
and other shader instances in this group, interspersed with... // ss->ConnectShaders ("layer1", "param1", "layer2", "param2"); // ... and other connections ... // ss->ShaderGroupEnd (); // // It looks so simple, and it really is, except that the way this // testshade program works is that all the Parameter() and Shader() // calls are done inside getargs(), as it walks through the command // line arguments, whereas the connections accumulate and have // to be processed at the end. Bear with us. // Start the shader group and grab a reference to it. ShaderGroupRef shadergroup = shadingsys->ShaderGroupBegin (); // Get the command line arguments. That will set up all the shader // instances and their parameters for the group. getargs (argc, argv); // Now set up the connections for (size_t i = 0; i < connections.size(); i += 4) { if (i+3 < connections.size()) { std::cout << "Connect " << connections[i] << "." << connections[i+1] << " to " << connections[i+2] << "." << connections[i+3] << "\n"; shadingsys->ConnectShaders (connections[i].c_str(), connections[i+1].c_str(), connections[i+2].c_str(), connections[i+3].c_str()); } } // End the group shadingsys->ShaderGroupEnd (); if (outputfiles.size() != 0) std::cout << "\n"; // Set up the named transformations, including shader and object. // For this test application, we just do this statically; in a real // renderer, the global named space (like "myspace") would probably // be static, but shader and object spaces may be different for each // object. 
setup_transformations (rend, Mshad, Mobj); // Set up the image outputs requested on the command line setup_output_images (shadingsys, shadergroup); if (debug) test_group_attributes (shadergroup.get()); if (num_threads < 1) num_threads = boost::thread::hardware_concurrency(); double setuptime = timer.lap (); // Allow a settable number of iterations to "render" the whole image, // which is useful for time trials of things that would be too quick // to accurately time for a single iteration for (int iter = 0; iter < iters; ++iter) { OIIO::ROI roi (0, xres, 0, yres); bool save = (iter == (iters-1)); // save on last iteration #if 0 shade_region (shadergroup.get(), roi, save); #else OIIO::ImageBufAlgo::parallel_image ( boost::bind (shade_region, shadergroup.get(), _1, save), roi, num_threads); #endif // If any reparam was requested, do it now if (reparams.size() && reparam_layer.size()) { for (size_t p = 0; p < reparams.size(); ++p) { const ParamValue &pv (reparams[p]); shadingsys->ReParameter (*shadergroup, reparam_layer.c_str(), pv.name().c_str(), pv.type(), pv.data()); } } } if (outputfiles.size() == 0) std::cout << "\n"; // Write the output images to disk for (size_t i = 0; i < outputimgs.size(); ++i) { if (outputimgs[i]) { outputimgs[i]->save(); delete outputimgs[i]; outputimgs[i] = NULL; } } // Print some debugging info if (debug || stats) { double runtime = timer.lap(); std::cout << "\n"; std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n"; std::cout << "Run : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n"; std::cout << "\n"; std::cout << shadingsys->getstats (5) << "\n"; OIIO::TextureSystem *texturesys = shadingsys->texturesys(); if (texturesys) std::cout << texturesys->getstats (5) << "\n"; std::cout << ustring::getstats() << "\n"; } // We're done with the shading system now, destroy it shadergroup.reset (); // Must release this before destroying shadingsys delete shadingsys; return EXIT_SUCCESS; }
bool ShadingContext::execute (ShaderGroup &sgroup, ShaderGlobals &ssg, bool run) { m_attribs = &sgroup; // Optimize if we haven't already if (sgroup.nlayers()) { sgroup.start_running (); if (! sgroup.optimized()) { shadingsys().optimize_group (sgroup); if (shadingsys().m_greedyjit && shadingsys().m_groups_to_compile_count) { // If we are greedily JITing, optimize/JIT everything now shadingsys().optimize_all_groups (); } } if (sgroup.does_nothing()) return false; } else { // empty shader - nothing to do! return false; } int profile = shadingsys().m_profile; OIIO::Timer timer (profile); // Allocate enough space on the heap size_t heap_size_needed = sgroup.llvm_groupdata_size(); if (heap_size_needed > m_heap.size()) { if (shadingsys().debug()) info (" ShadingContext %p growing heap to %llu", this, (unsigned long long) heap_size_needed); m_heap.resize (heap_size_needed); } // Zero out the heap memory we will be using if (shadingsys().m_clearmemory) memset (&m_heap[0], 0, heap_size_needed); // Set up closure storage m_closure_pool.clear(); // Clear the message blackboard m_messages.clear (); // Clear miscellaneous scratch space m_scratch_pool.clear (); if (run) { ssg.context = this; ssg.renderer = renderer(); ssg.Ci = NULL; RunLLVMGroupFunc run_func = sgroup.llvm_compiled_version(); DASSERT (run_func); DASSERT (sgroup.llvm_groupdata_size() <= m_heap.size()); run_func (&ssg, &m_heap[0]); } // Process any queued up error messages, warnings, printfs from shaders process_errors (); if (profile) { long long ticks = timer.ticks(); shadingsys().m_stat_total_shading_time_ticks += ticks; sgroup.m_stat_total_shading_time_ticks += ticks; } return true; }
void RuntimeOptimizer::build_llvm_group () { // At this point, we already hold the lock for this group, by virtue // of ShadingSystemImpl::optimize_group. OIIO::Timer timer; if (! m_thread->llvm_context) m_thread->llvm_context = new llvm::LLVMContext(); if (! m_thread->llvm_jitmm) { m_thread->llvm_jitmm = llvm::JITMemoryManager::CreateDefaultMemManager(); OIIO::spin_lock lock (m_shadingsys.m_llvm_mutex); // lock m_llvm_jitmm_hold m_shadingsys.m_llvm_jitmm_hold.push_back (shared_ptr<llvm::JITMemoryManager>(m_thread->llvm_jitmm)); } ASSERT (! m_llvm_module); // Load the LLVM bitcode and parse it into a Module const char *data = osl_llvm_compiled_ops_block; llvm::MemoryBuffer* buf = llvm::MemoryBuffer::getMemBuffer (llvm::StringRef(data, osl_llvm_compiled_ops_size)); std::string err; #ifdef OSL_LLVM_NO_BITCODE m_llvm_module = new llvm::Module("llvm_ops", *llvm_context()); #else // Load the LLVM bitcode and parse it into a Module m_llvm_module = llvm::ParseBitcodeFile (buf, *m_thread->llvm_context, &err); if (err.length()) m_shadingsys.error ("ParseBitcodeFile returned '%s'\n", err.c_str()); delete buf; #endif // Create the ExecutionEngine ASSERT (! m_llvm_exec); err.clear (); llvm::JITMemoryManager *mm = new OSL_Dummy_JITMemoryManager(m_thread->llvm_jitmm); m_llvm_exec = llvm::ExecutionEngine::createJIT (m_llvm_module, &err, mm, llvm::CodeGenOpt::Default, /*AllocateGVsWithCode*/ false); if (! m_llvm_exec) { m_shadingsys.error ("Failed to create engine: %s\n", err.c_str()); ASSERT (0); return; } // Force it to JIT as soon as we ask it for the code pointer, // don't take any chances that it might JIT lazily, since we // will be stealing the JIT code memory from under its nose and // destroying the Module & ExecutionEngine. 
m_llvm_exec->DisableLazyCompilation (); m_stat_llvm_setup_time += timer.lap(); // Set up m_num_used_layers to be the number of layers that are // actually used, and m_layer_remap[] to map original layer numbers // to the shorter list of actually-called layers. int nlayers = m_group.nlayers(); m_layer_remap.resize (nlayers); m_num_used_layers = 0; for (int layer = 0; layer < m_group.nlayers(); ++layer) { bool lastlayer = (layer == (nlayers-1)); if (! m_group[layer]->unused() || lastlayer) m_layer_remap[layer] = m_num_used_layers++; else m_layer_remap[layer] = -1; } m_shadingsys.m_stat_empty_instances += m_group.nlayers()-m_num_used_layers; initialize_llvm_group (); // Generate the LLVM IR for each layer. Skip unused layers. m_llvm_local_mem = 0; llvm::Function** funcs = (llvm::Function**)alloca(m_num_used_layers * sizeof(llvm::Function*)); for (int layer = 0; layer < nlayers; ++layer) { set_inst (layer); bool lastlayer = (layer == (nlayers-1)); int index = m_layer_remap[layer]; if (index != -1) funcs[index] = build_llvm_instance (lastlayer); } llvm::Function* entry_func = funcs[m_num_used_layers-1]; m_stat_llvm_irgen_time += timer.lap(); if (m_shadingsys.m_max_local_mem_KB && m_llvm_local_mem/1024 > m_shadingsys.m_max_local_mem_KB) { m_shadingsys.error ("Shader group \"%s\" needs too much local storage: %d KB", m_group.name().c_str(), m_llvm_local_mem/1024); } // Optimize the LLVM IR unless it's just a ret void group (1 layer, // 1 BB, 1 inst == retvoid) bool skip_optimization = m_num_used_layers == 1 && entry_func->size() == 1 && entry_func->front().size() == 1; // Label the group as being retvoid or not. 
m_group.does_nothing(skip_optimization); if (skip_optimization) { m_shadingsys.m_stat_empty_groups += 1; m_shadingsys.m_stat_empty_instances += 1; // the one layer is empty } else { m_llvm_passes->run (*llvm_module()); } m_stat_llvm_opt_time += timer.lap(); if (shadingsys().llvm_debug()) { llvm::outs() << "func after opt = " << *entry_func << "\n"; llvm::outs().flush(); } // Debug code to dump the resulting bitcode to a file if (shadingsys().llvm_debug() >= 2) { std::string err_info; std::string name = Strutil::format ("%s_%d.bc", inst()->layername().c_str(), inst()->id()); llvm::raw_fd_ostream out (name.c_str(), err_info); llvm::WriteBitcodeToFile (llvm_module(), out); } // Force the JIT to happen now RunLLVMGroupFunc f = (RunLLVMGroupFunc) m_llvm_exec->getPointerToFunction(entry_func); m_group.llvm_compiled_version (f); // Remove the IR for the group layer functions, we've already JITed it // and will never need the IR again. This saves memory, and also saves // a huge amount of time since we won't re-optimize it again and again // if we keep adding new shader groups to the same Module. for (int i = 0; i < m_num_used_layers; ++i) { funcs[i]->deleteBody(); } // Free the exec and module to reclaim all the memory. This definitely // saves memory, and has almost no effect on runtime. delete m_llvm_exec; m_llvm_exec = NULL; // N.B. Destroying the EE should have destroyed the module as well. m_llvm_module = NULL; m_stat_llvm_jit_time += timer.lap(); }