// Entry point of the testshade program.  Builds a single shader group
// from the command line, shades an xres-by-yres grid of points `iters`
// times on one thread, saves the requested outputs, and optionally
// prints timing and shading-system statistics.
//
//   argc, argv -- the command line, handed unmodified to getargs().
//
// Returns EXIT_SUCCESS.  Relies on state declared outside this function
// (shadingsys, rend, errhandler, connections, outputfiles, outputimgs,
// Mshad, Mobj, xres, yres, iters, debug, stats).
extern "C" int
test_shade (int argc, const char *argv[])
{
    OIIO::Timer timer;

    // Create a new shading system.  We pass it the RendererServices
    // object that services callbacks from the shading system, NULL for
    // the TextureSystem (that just makes 'create' make its own TS), and
    // an error handler.
    shadingsys = ShadingSystem::create (&rend, NULL, &errhandler);

    // Remember that each shader parameter may optionally have a
    // metadata hint [[int lockgeom=...]], where 0 indicates that the
    // parameter may be overridden by the geometry itself, for example
    // with data interpolated from the mesh vertices, and a value of 1
    // means that it is "locked" with respect to the geometry (i.e. it
    // will not be overridden with interpolated or
    // per-geometric-primitive data).
    // 
    // In order to most fully optimize shader, we typically want any
    // shader parameter not explicitly specified to default to being
    // locked (i.e. no per-geometry override):
    shadingsys->attribute("lockgeom", 1);

    // Now we declare our shader.
    // 
    // Each material in the scene is comprised of a "shader group."
    // Each group is comprised of one or more "layers" (a.k.a. shader
    // instances) with possible connections from outputs of
    // upstream/early layers into the inputs of downstream/later layers.
    // A shader instance is the combination of a reference to a shader
    // master and its parameter values that may override the defaults in
    // the shader source and may be particular to this instance (versus
    // all the other instances of the same shader).
    // 
    // A shader group declaration typically looks like this:
    //
    //   ss->ShaderGroupBegin ();
    //   ss->Parameter ("paramname", TypeDesc paramtype, void *value);
    //      ... and so on for all the other parameters of...
    //   ss->Shader ("shadertype", "shadername", "layername");
    //      The Shader() call creates a new instance, which gets
    //      all the pending Parameter() values made right before it.
    //   ... and other shader instances in this group, interspersed with...
    //   ss->ConnectShaders ("layer1", "param1", "layer2", "param2");
    //   ... and other connections ...
    //   ss->ShaderGroupEnd ();
    //   // and now grab an opaque reference to that shader group:
    //   ShadingAttribStateRef shaderstate = ss->state ();
    // 
    // It looks so simple, and it really is, except that the way this
    // testshade program works is that all the Parameter() and Shader()
    // calls are done inside getargs(), as it walks through the command
    // line arguments, whereas the connections accumulate and have
    // to be processed at the end.  Bear with us.
    
    // Start the shader group.
    shadingsys->ShaderGroupBegin ();
    // Get the command line arguments.  That will set up all the shader
    // instances and their parameters for the group.
    getargs (argc, argv);

    // Now set up the connections.  `connections` is a flat list read in
    // groups of four strings (source layer, source param, destination
    // layer, destination param); a trailing incomplete group is ignored.
    for (size_t i = 0;  i < connections.size();  i += 4) {
        if (i+3 < connections.size()) {
            std::cout << "Connect " 
                      << connections[i] << "." << connections[i+1]
                      << " to " << connections[i+2] << "." << connections[i+3]
                      << "\n";
            shadingsys->ConnectShaders (connections[i].c_str(),
                                        connections[i+1].c_str(),
                                        connections[i+2].c_str(),
                                        connections[i+3].c_str());
        }
    }

    // End the group
    shadingsys->ShaderGroupEnd ();

    // Now we should have a valid shading state, so get a reference to it.
    ShadingAttribStateRef shaderstate = shadingsys->state ();
    if (outputfiles.size() != 0)
        std::cout << "\n";

    // Set up the named transformations, including shader and object.
    // For this test application, we just do this statically; in a real
    // renderer, the global named space (like "myspace") would probably
    // be static, but shader and object spaces may be different for each
    // object.
    setup_transformations (rend, Mshad, Mobj);

    // Set up the image outputs requested on the command line
    setup_output_images (shadingsys, shaderstate);

    // Set up shader globals and a little test grid of points to shade.
    ShaderGlobals shaderglobals;

    // Everything up to here counts as setup; lap() also starts the lap
    // that measures the shading loop below.
    double setuptime = timer.lap ();

    // Optional: high-performance apps may request this thread-specific
    // pointer in order to save a bit of time on each shade.  Just like
    // the name implies, a multithreaded renderer would need to do this
    // separately for each thread, and be careful to always use the same
    // thread_info each time for that thread.
    //
    // There's nothing wrong with a simpler app just passing NULL for
    // the thread_info; in such a case, the ShadingSystem will do the
    // necessary calls to find the thread-specific pointer itself, but
    // this will degrade performance just a bit.
    OSL::PerThreadInfo *thread_info = shadingsys->create_thread_info();

    // Request a shading context so that we can execute the shader.
    // We could get_context/release_context for each shading point,
    // but to save overhead, it's more efficient to reuse a context
    // within a thread.
    ShadingContext *ctx = shadingsys->get_context (thread_info);

    // Allow a settable number of iterations to "render" the whole image,
    // which is useful for time trials of things that would be too quick
    // to accurately time for a single iteration
    for (int iter = 0;  iter < iters;  ++iter) {

        // Loop over all pixels in the image (in x and y)...
        for (int y = 0;  y < yres;  ++y) {
            for (int x = 0;  x < xres;  ++x) {

                // In a real renderer, this is where you would figure
                // out what object point is visible in this pixel (or
                // this sample, for antialiasing).  Once determined,
                // you'd set up a ShaderGlobals that contained the vital
                // information about that point, such as its location,
                // the normal there, the u and v coordinates on the
                // surface, the transformation of that object, and so
                // on.  
                //
                // This test app is not a real renderer, so we just
                // set it up rigged to look like we're rendering a single
                // quadrilateral that exactly fills the viewport, and that
                // setup is done in the following function call:
                setup_shaderglobals (shaderglobals, shadingsys, x, y);

                // Actually run the shader for this point
                shadingsys->execute (*ctx, *shaderstate, shaderglobals);

                // Save all the designated outputs.  But only do so if we
                // are on the last iteration requested, so that if we are
                // doing a bunch of iterations for time trials, we only
                // include the output pixel copying once in the timing.
                if (iter == (iters - 1)) {
                    save_outputs (shadingsys, ctx, x, y);
                }
            }
        }
    }

    // Time spent in the shading loop only.  Taking a lap here (rather
    // than reading the timer's total at the end) keeps the "Run" figure
    // from also containing the setup time reported separately.
    double runtime = timer.lap ();

    // We're done shading with this context.
    shadingsys->release_context (ctx);

    // Now that we're done rendering, release the thread-specific
    // pointer we saved.  A simple app could skip this; but if the app
    // asks for it (as we have in this example), then it should also
    // destroy it when done with it.
    shadingsys->destroy_thread_info(thread_info);

    if (outputfiles.size() == 0)
        std::cout << "\n";

    // Write the output images to disk, then free them.
    for (size_t i = 0;  i < outputimgs.size();  ++i) {
        if (outputimgs[i]) {
            outputimgs[i]->save();
            delete outputimgs[i];
            outputimgs[i] = NULL;
        }
    }

    // Print some debugging info
    if (debug || stats) {
        std::cout << "\n";
        std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n";
        std::cout << "Run  : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n";
        std::cout << "\n";
        std::cout << shadingsys->getstats (5) << "\n";
    }

    // We're done with the shading system now, destroy it
    ShadingSystem::destroy (shadingsys);

    return EXIT_SUCCESS;
}
void
BackendLLVM::run ()
{
    // At this point, we already hold the lock for this group, by virtue
    // of ShadingSystemImpl::optimize_group.
    OIIO::Timer timer;
    std::string err;

    {
#ifdef OSL_LLVM_NO_BITCODE
    // I don't know which exact part has thread safety issues, but it
    // crashes on windows when we don't lock.
    // FIXME -- try subsequent LLVM releases on Windows to see if this
    // is a problem that is eventually fixed on the LLVM side.
    static spin_mutex mutex;
    OIIO::spin_lock lock (mutex);
#endif

#ifdef OSL_LLVM_NO_BITCODE
    ll.module (ll.new_module ("llvm_ops"));
#else
    ll.module (ll.module_from_bitcode (osl_llvm_compiled_ops_block,
                                       osl_llvm_compiled_ops_size,
                                       "llvm_ops", &err));
    if (err.length())
        shadingcontext()->error ("ParseBitcodeFile returned '%s'\n", err.c_str());
    ASSERT (ll.module());
#endif

    // Create the ExecutionEngine
    if (! ll.make_jit_execengine (&err)) {
        shadingcontext()->error ("Failed to create engine: %s\n", err.c_str());
        ASSERT (0);
        return;
    }

    // End of mutex lock, for the OSL_LLVM_NO_BITCODE case
    }

    m_stat_llvm_setup_time += timer.lap();

    // Set up m_num_used_layers to be the number of layers that are
    // actually used, and m_layer_remap[] to map original layer numbers
    // to the shorter list of actually-called layers.
    int nlayers = group().nlayers();
    m_layer_remap.resize (nlayers);
    m_num_used_layers = 0;
    for (int layer = 0;  layer < group().nlayers();  ++layer) {
        bool lastlayer = (layer == (nlayers-1));
        if (! group()[layer]->unused() || lastlayer)
            m_layer_remap[layer] = m_num_used_layers++;
        else
            m_layer_remap[layer] = -1;
    }
    shadingsys().m_stat_empty_instances += group().nlayers()-m_num_used_layers;

    initialize_llvm_group ();

    // Generate the LLVM IR for each layer.  Skip unused layers.
    m_llvm_local_mem = 0;
    llvm::Function** funcs = (llvm::Function**)alloca(m_num_used_layers * sizeof(llvm::Function*));
    for (int layer = 0; layer < nlayers; ++layer) {
        set_inst (layer);
        bool lastlayer = (layer == (nlayers-1));
        int index = m_layer_remap[layer];
        if (index != -1)
            funcs[index] = build_llvm_instance (lastlayer);
    }
    llvm::Function* entry_func = funcs[m_num_used_layers-1];
    m_stat_llvm_irgen_time += timer.lap();

    if (shadingsys().m_max_local_mem_KB &&
        m_llvm_local_mem/1024 > shadingsys().m_max_local_mem_KB) {
        shadingcontext()->error ("Shader group \"%s\" needs too much local storage: %d KB",
                                 group().name(), m_llvm_local_mem/1024);
    }

    // Optimize the LLVM IR unless it's just a ret void group (1 layer,
    // 1 BB, 1 inst == retvoid)
    bool skip_optimization = m_num_used_layers == 1 && ll.func_is_empty(entry_func);
    // Label the group as being retvoid or not.
    group().does_nothing(skip_optimization);
    if (skip_optimization) {
        shadingsys().m_stat_empty_groups += 1;
        shadingsys().m_stat_empty_instances += 1;  // the one layer is empty
    } else {
        ll.do_optimize();
    }

    m_stat_llvm_opt_time += timer.lap();

    if (llvm_debug()) {
        std::cout << "func after opt  = " << ll.bitcode_string (entry_func) << "\n";
        std::cout.flush();
    }

    // Debug code to dump the resulting bitcode to a file
    if (llvm_debug() >= 2) {
        std::string name = Strutil::format ("%s_%d.bc", inst()->layername(),
                                            inst()->id());
        ll.write_bitcode_file (name.c_str());
    }

    // Force the JIT to happen now and retrieve the JITed function
    group().llvm_compiled_version ((RunLLVMGroupFunc) ll.getPointerToFunction(entry_func));

    // Remove the IR for the group layer functions, we've already JITed it
    // and will never need the IR again.  This saves memory, and also saves
    // a huge amount of time since we won't re-optimize it again and again
    // if we keep adding new shader groups to the same Module.
    for (int i = 0; i < m_num_used_layers; ++i) {
        ll.delete_func_body (funcs[i]);
    }

    // Free the exec and module to reclaim all the memory.  This definitely
    // saves memory, and has almost no effect on runtime.
    ll.execengine (NULL);

    // N.B. Destroying the EE should have destroyed the module as well.
    ll.module (NULL);

    m_stat_llvm_jit_time += timer.lap();

    m_stat_total_llvm_time = timer();

    if (shadingsys().m_compile_report) {
        shadingcontext()->info ("JITed shader group %s:", group().name());
        shadingcontext()->info ("    (%1.2fs = %1.2f setup, %1.2f ir, %1.2f opt, %1.2f jit; local mem %dKB)",
                           m_stat_total_llvm_time, 
                           m_stat_llvm_setup_time,
                           m_stat_llvm_irgen_time, m_stat_llvm_opt_time,
                           m_stat_llvm_jit_time,
                           m_llvm_local_mem/1024);
    }
}
extern "C" int
test_shade (int argc, const char *argv[])
{
    OIIO::Timer timer;

    // Create a new shading system.  We pass it the RendererServices
    // object that services callbacks from the shading system, NULL for
    // the TextureSystem (that just makes 'create' make its own TS), and
    // an error handler.
    shadingsys = new ShadingSystem (&rend, NULL, &errhandler);
    register_closures(shadingsys);

    // Remember that each shader parameter may optionally have a
    // metadata hint [[int lockgeom=...]], where 0 indicates that the
    // parameter may be overridden by the geometry itself, for example
    // with data interpolated from the mesh vertices, and a value of 1
    // means that it is "locked" with respect to the geometry (i.e. it
    // will not be overridden with interpolated or
    // per-geometric-primitive data).
    // 
    // In order to most fully optimize shader, we typically want any
    // shader parameter not explicitly specified to default to being
    // locked (i.e. no per-geometry override):
    shadingsys->attribute("lockgeom", 1);

    // Now we declare our shader.
    // 
    // Each material in the scene is comprised of a "shader group."
    // Each group is comprised of one or more "layers" (a.k.a. shader
    // instances) with possible connections from outputs of
    // upstream/early layers into the inputs of downstream/later layers.
    // A shader instance is the combination of a reference to a shader
    // master and its parameter values that may override the defaults in
    // the shader source and may be particular to this instance (versus
    // all the other instances of the same shader).
    // 
    // A shader group declaration typically looks like this:
    //
    //   ShaderGroupRef shadergroup = ss->ShaderGroupBegin ();
    //   ss->Parameter ("paramname", TypeDesc paramtype, void *value);
    //      ... and so on for all the other parameters of...
    //   ss->Shader ("shadertype", "shadername", "layername");
    //      The Shader() call creates a new instance, which gets
    //      all the pending Parameter() values made right before it.
    //   ... and other shader instances in this group, interspersed with...
    //   ss->ConnectShaders ("layer1", "param1", "layer2", "param2");
    //   ... and other connections ...
    //   ss->ShaderGroupEnd ();
    // 
    // It looks so simple, and it really is, except that the way this
    // testshade program works is that all the Parameter() and Shader()
    // calls are done inside getargs(), as it walks through the command
    // line arguments, whereas the connections accumulate and have
    // to be processed at the end.  Bear with us.
    
    // Start the shader group and grab a reference to it.
    ShaderGroupRef shadergroup = shadingsys->ShaderGroupBegin ();

    // Get the command line arguments.  That will set up all the shader
    // instances and their parameters for the group.
    getargs (argc, argv);

    // Now set up the connections
    for (size_t i = 0;  i < connections.size();  i += 4) {
        if (i+3 < connections.size()) {
            std::cout << "Connect " 
                      << connections[i] << "." << connections[i+1]
                      << " to " << connections[i+2] << "." << connections[i+3]
                      << "\n";
            shadingsys->ConnectShaders (connections[i].c_str(),
                                        connections[i+1].c_str(),
                                        connections[i+2].c_str(),
                                        connections[i+3].c_str());
        }
    }

    // End the group
    shadingsys->ShaderGroupEnd ();

    if (outputfiles.size() != 0)
        std::cout << "\n";

    // Set up the named transformations, including shader and object.
    // For this test application, we just do this statically; in a real
    // renderer, the global named space (like "myspace") would probably
    // be static, but shader and object spaces may be different for each
    // object.
    setup_transformations (rend, Mshad, Mobj);

    // Set up the image outputs requested on the command line
    setup_output_images (shadingsys, shadergroup);

    if (debug)
        test_group_attributes (shadergroup.get());

    if (num_threads < 1)
        num_threads = boost::thread::hardware_concurrency();

    double setuptime = timer.lap ();

    // Allow a settable number of iterations to "render" the whole image,
    // which is useful for time trials of things that would be too quick
    // to accurately time for a single iteration
    for (int iter = 0;  iter < iters;  ++iter) {
        OIIO::ROI roi (0, xres, 0, yres);
        bool save = (iter == (iters-1));   // save on last iteration

#if 0
        shade_region (shadergroup.get(), roi, save);
#else
        OIIO::ImageBufAlgo::parallel_image (
            boost::bind (shade_region, shadergroup.get(), _1, save),
            roi, num_threads);
#endif

        // If any reparam was requested, do it now
        if (reparams.size() && reparam_layer.size()) {
            for (size_t p = 0;  p < reparams.size();  ++p) {
                const ParamValue &pv (reparams[p]);
                shadingsys->ReParameter (*shadergroup, reparam_layer.c_str(),
                                         pv.name().c_str(), pv.type(),
                                         pv.data());
            }
        }
    }

    if (outputfiles.size() == 0)
        std::cout << "\n";

    // Write the output images to disk
    for (size_t i = 0;  i < outputimgs.size();  ++i) {
        if (outputimgs[i]) {
            outputimgs[i]->save();
            delete outputimgs[i];
            outputimgs[i] = NULL;
        }
    }

    // Print some debugging info
    if (debug || stats) {
        double runtime = timer.lap();
        std::cout << "\n";
        std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n";
        std::cout << "Run  : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n";
        std::cout << "\n";
        std::cout << shadingsys->getstats (5) << "\n";
        OIIO::TextureSystem *texturesys = shadingsys->texturesys();
        if (texturesys)
            std::cout << texturesys->getstats (5) << "\n";
        std::cout << ustring::getstats() << "\n";
    }

    // We're done with the shading system now, destroy it
    shadergroup.reset ();  // Must release this before destroying shadingsys
    delete shadingsys;

    return EXIT_SUCCESS;
}
// Entry point of the testshade program.  Creates the shading system,
// assembles one shader group from the command line, optionally prints
// and/or archives the group, shades the xres-by-yres image `iters`
// times, writes the output images, optionally prints statistics, and
// tears everything down.
//
//   argc, argv -- the command line, handed unmodified to getargs().
//
// Returns EXIT_FAILURE if no valid shader group exists after the command
// line has been processed, EXIT_SUCCESS otherwise.  A failure to write an
// output image is reported on std::cerr but does not change the return
// value.
extern "C" OSL_DLL_EXPORT int
test_shade (int argc, const char *argv[])
{
    OIIO::Timer timer;

    // Create a new shading system.  We pass it the RendererServices
    // object that services callbacks from the shading system, NULL for
    // the TextureSystem (that just makes 'create' make its own TS), and
    // an error handler.
    shadingsys = new ShadingSystem (&rend, NULL, &errhandler);

    // Register the layout of all closures known to this renderer
    // Any closure used by the shader which is not registered, or
    // registered with a different number of arguments will lead
    // to a runtime error.
    register_closures(shadingsys);

    // Remember that each shader parameter may optionally have a
    // metadata hint [[int lockgeom=...]], where 0 indicates that the
    // parameter may be overridden by the geometry itself, for example
    // with data interpolated from the mesh vertices, and a value of 1
    // means that it is "locked" with respect to the geometry (i.e. it
    // will not be overridden with interpolated or
    // per-geometric-primitive data).
    // 
    // In order to most fully optimize shader, we typically want any
    // shader parameter not explicitly specified to default to being
    // locked (i.e. no per-geometry override):
    shadingsys->attribute("lockgeom", 1);

    // Now we declare our shader.
    // 
    // Each material in the scene is comprised of a "shader group."
    // Each group is comprised of one or more "layers" (a.k.a. shader
    // instances) with possible connections from outputs of
    // upstream/early layers into the inputs of downstream/later layers.
    // A shader instance is the combination of a reference to a shader
    // master and its parameter values that may override the defaults in
    // the shader source and may be particular to this instance (versus
    // all the other instances of the same shader).
    // 
    // A shader group declaration typically looks like this:
    //
    //   ShaderGroupRef shadergroup = ss->ShaderGroupBegin ();
    //   ss->Parameter ("paramname", TypeDesc paramtype, void *value);
    //      ... and so on for all the other parameters of...
    //   ss->Shader ("shadertype", "shadername", "layername");
    //      The Shader() call creates a new instance, which gets
    //      all the pending Parameter() values made right before it.
    //   ... and other shader instances in this group, interspersed with...
    //   ss->ConnectShaders ("layer1", "param1", "layer2", "param2");
    //   ... and other connections ...
    //   ss->ShaderGroupEnd ();
    // 
    // It looks so simple, and it really is, except that the way this
    // testshade program works is that all the Parameter() and Shader()
    // calls are done inside getargs(), as it walks through the command
    // line arguments, whereas the connections accumulate and have
    // to be processed at the end.  Bear with us.
    
    // Start the shader group and grab a reference to it.
    shadergroup = shadingsys->ShaderGroupBegin (groupname);

    // Get the command line arguments.  That will set up all the shader
    // instances and their parameters for the group.
    getargs (argc, argv);

    if (! shadergroup) {
        std::cerr << "ERROR: Invalid shader group. Exiting testshade.\n";
        // Don't leak the shading system on this early exit.  There is no
        // group reference to release first, since shadergroup is empty.
        delete shadingsys;
        return EXIT_FAILURE;
    }

    shadingsys->attribute (shadergroup.get(), "groupname", groupname);

    // Now set up the connections.  `connections` is a flat list read in
    // groups of four strings (source layer, source param, destination
    // layer, destination param); a trailing incomplete group is ignored.
    for (size_t i = 0;  i < connections.size();  i += 4) {
        if (i+3 < connections.size()) {
            std::cout << "Connect " 
                      << connections[i] << "." << connections[i+1]
                      << " to " << connections[i+2] << "." << connections[i+3]
                      << "\n";
            shadingsys->ConnectShaders (connections[i].c_str(),
                                        connections[i+1].c_str(),
                                        connections[i+2].c_str(),
                                        connections[i+3].c_str());
        }
    }

    // End the group
    shadingsys->ShaderGroupEnd ();

    // Optionally describe the group: its serialized ("pickle") form, its
    // name, its layer names and, with do_oslquery, each layer's
    // parameters.
    if (verbose || do_oslquery) {
        std::string pickle;
        shadingsys->getattribute (shadergroup.get(), "pickle", pickle);
        std::cout << "Shader group:\n---\n" << pickle << "\n---\n";
        std::cout << "\n";
        // Named differently from the outer `groupname` so that the
        // queried value does not shadow it.
        ustring queried_groupname;
        shadingsys->getattribute (shadergroup.get(), "groupname", queried_groupname);
        std::cout << "Shader group \"" << queried_groupname << "\" layers are:\n";
        int num_layers = 0;
        shadingsys->getattribute (shadergroup.get(), "num_layers", num_layers);
        if (num_layers > 0) {
            std::vector<const char *> layers (size_t(num_layers), NULL);
            shadingsys->getattribute (shadergroup.get(), "layer_names",
                                      TypeDesc(TypeDesc::STRING, num_layers),
                                      &layers[0]);
            for (int i = 0; i < num_layers; ++i) {
                std::cout << "    " << (layers[i] ? layers[i] : "<unnamed>") << "\n";
                if (do_oslquery) {
                    OSLQuery q;
                    q.init (shadergroup.get(), i);
                    for (size_t p = 0;  p < q.nparams(); ++p) {
                        const OSLQuery::Parameter *param = q.getparam(p);
                        std::cout << "\t" << (param->isoutput ? "output "  : "")
                                  << param->type << ' ' << param->name << "\n";
                    }
                }
            }
        }
        std::cout << "\n";
    }
    if (archivegroup.size())
        shadingsys->archive_shadergroup (shadergroup.get(), archivegroup);

    if (outputfiles.size() != 0)
        std::cout << "\n";

    // Set up the named transformations, including shader and object.
    // For this test application, we just do this statically; in a real
    // renderer, the global named space (like "myspace") would probably
    // be static, but shader and object spaces may be different for each
    // object.
    setup_transformations (rend, Mshad, Mobj);

    // Set up the image outputs requested on the command line
    setup_output_images (shadingsys, shadergroup);

    if (debug)
        test_group_attributes (shadergroup.get());

    // A thread count below 1 means "use the hardware concurrency".
    if (num_threads < 1)
        num_threads = boost::thread::hardware_concurrency();

    double setuptime = timer.lap ();

    // Allow a settable number of iterations to "render" the whole image,
    // which is useful for time trials of things that would be too quick
    // to accurately time for a single iteration
    for (int iter = 0;  iter < iters;  ++iter) {
        OIIO::ROI roi (0, xres, 0, yres);

        if (use_shade_image) {
            // NOTE(review): this dereferences outputimgs[0] without
            // checking it; presumably setup_output_images guarantees a
            // first image whenever use_shade_image is set -- confirm.
            OSL::shade_image (*shadingsys, *shadergroup, NULL,
                              *outputimgs[0], outputvarnames,
                              pixelcenters ? ShadePixelCenters : ShadePixelGrid,
                              roi, num_threads);
        } else {
            bool save = (iter == (iters-1));   // save on last iteration
            OIIO::ImageBufAlgo::parallel_image (
                    boost::bind (shade_region, shadergroup.get(), _1, save),
                    roi, num_threads);
        }

        // If any reparam was requested, do it now
        if (reparams.size() && reparam_layer.size()) {
            for (size_t p = 0;  p < reparams.size();  ++p) {
                const ParamValue &pv (reparams[p]);
                shadingsys->ReParameter (*shadergroup, reparam_layer.c_str(),
                                         pv.name().c_str(), pv.type(),
                                         pv.data());
            }
        }
    }
    double runtime = timer.lap();

    if (outputfiles.size() == 0)
        std::cout << "\n";

    // Write the output images to disk (unless outputs are being printed
    // instead), then free them.
    for (size_t i = 0;  i < outputimgs.size();  ++i) {
        if (! outputimgs[i])
            continue;
        if (! print_outputs) {
            std::string filename = outputimgs[i]->name();
            // JPEG, GIF, and PNG images should be automatically saved
            // as sRGB because they are almost certainly supposed to
            // be displayed on web pages.
            using namespace OIIO;
            if (Strutil::iends_with (filename, ".jpg") ||
                Strutil::iends_with (filename, ".jpeg") ||
                Strutil::iends_with (filename, ".gif") ||
                Strutil::iends_with (filename, ".png")) {
                ImageBuf ccbuf;
                ImageBufAlgo::colorconvert (ccbuf, *outputimgs[i],
                                            "linear", "sRGB", false,
                                            "", "");
                ccbuf.set_write_format (outputimgs[i]->spec().format);
                // Report a failed write instead of dropping it silently.
                if (! ccbuf.write (filename))
                    std::cerr << "ERROR: could not write \"" << filename
                              << "\": " << ccbuf.geterror() << "\n";
            } else {
                if (! outputimgs[i]->write (filename))
                    std::cerr << "ERROR: could not write \"" << filename
                              << "\": " << outputimgs[i]->geterror() << "\n";
            }
        }
        delete outputimgs[i];
        outputimgs[i] = NULL;
    }

    // Print some debugging info
    if (debug || runstats || profile) {
        double writetime = timer.lap();
        std::cout << "\n";
        std::cout << "Setup: " << OIIO::Strutil::timeintervalformat (setuptime,2) << "\n";
        std::cout << "Run  : " << OIIO::Strutil::timeintervalformat (runtime,2) << "\n";
        std::cout << "Write: " << OIIO::Strutil::timeintervalformat (writetime,2) << "\n";
        std::cout << "\n";
        std::cout << shadingsys->getstats (5) << "\n";
        OIIO::TextureSystem *texturesys = shadingsys->texturesys();
        if (texturesys)
            std::cout << texturesys->getstats (5) << "\n";
        std::cout << ustring::getstats() << "\n";
    }

    // We're done with the shading system now, destroy it
    shadergroup.reset ();  // Must release this before destroying shadingsys
    delete shadingsys;

    return EXIT_SUCCESS;
}
// JIT-compile the current shader group.  In order: make sure the thread
// has an LLVM context and JIT memory manager, obtain the "llvm_ops"
// module and a JIT execution engine, generate LLVM IR for every layer
// that is actually used, run the optimization passes (unless the group
// is a lone "ret void" function), fetch the compiled entry point and
// store it on the group, and finally discard the IR, the execution
// engine and the module.  The time of each phase is accumulated into
// the m_stat_llvm_* members.
void
RuntimeOptimizer::build_llvm_group ()
{
    // At this point, we already hold the lock for this group, by virtue
    // of ShadingSystemImpl::optimize_group.
    OIIO::Timer timer;

    if (! m_thread->llvm_context)
        m_thread->llvm_context = new llvm::LLVMContext();

    if (! m_thread->llvm_jitmm) {
        m_thread->llvm_jitmm = llvm::JITMemoryManager::CreateDefaultMemManager();
        // The shading system keeps a shared_ptr to every memory manager
        // created here in m_llvm_jitmm_hold.
        OIIO::spin_lock lock (m_shadingsys.m_llvm_mutex);  // lock m_llvm_jitmm_hold
        m_shadingsys.m_llvm_jitmm_hold.push_back (shared_ptr<llvm::JITMemoryManager>(m_thread->llvm_jitmm));
    }

    ASSERT (! m_llvm_module);
    std::string err;
#ifdef OSL_LLVM_NO_BITCODE
    // No precompiled bitcode in this configuration: start from an empty
    // module.
    m_llvm_module = new llvm::Module("llvm_ops", *llvm_context());
#else
    {
        // Load the LLVM bitcode and parse it into a Module.  The memory
        // buffer is created only in this branch, which is also the one
        // that deletes it, so no configuration leaks it.
        const char *data = osl_llvm_compiled_ops_block;
        llvm::MemoryBuffer* buf = llvm::MemoryBuffer::getMemBuffer (llvm::StringRef(data, osl_llvm_compiled_ops_size));
        m_llvm_module = llvm::ParseBitcodeFile (buf, *m_thread->llvm_context, &err);
        if (err.length())
            m_shadingsys.error ("ParseBitcodeFile returned '%s'\n", err.c_str());
        delete buf;
        // Stop here rather than hand a NULL module to the JIT below.
        ASSERT (m_llvm_module);
    }
#endif

    // Create the ExecutionEngine
    ASSERT (! m_llvm_exec);
    err.clear ();
    llvm::JITMemoryManager *mm = new OSL_Dummy_JITMemoryManager(m_thread->llvm_jitmm);
    m_llvm_exec = llvm::ExecutionEngine::createJIT (m_llvm_module, &err, mm, llvm::CodeGenOpt::Default, /*AllocateGVsWithCode*/ false);
    if (! m_llvm_exec) {
        m_shadingsys.error ("Failed to create engine: %s\n", err.c_str());
        ASSERT (0);
        return;
    }
    // Force it to JIT as soon as we ask it for the code pointer,
    // don't take any chances that it might JIT lazily, since we
    // will be stealing the JIT code memory from under its nose and
    // destroying the Module & ExecutionEngine.
    m_llvm_exec->DisableLazyCompilation ();

    m_stat_llvm_setup_time += timer.lap();

    // Set up m_num_used_layers to be the number of layers that are
    // actually used, and m_layer_remap[] to map original layer numbers
    // to the shorter list of actually-called layers (-1 for layers that
    // are skipped).  The last layer is always kept.
    int nlayers = m_group.nlayers();
    m_layer_remap.resize (nlayers);
    m_num_used_layers = 0;
    for (int layer = 0;  layer < nlayers;  ++layer) {
        bool lastlayer = (layer == (nlayers-1));
        if (! m_group[layer]->unused() || lastlayer)
            m_layer_remap[layer] = m_num_used_layers++;
        else
            m_layer_remap[layer] = -1;
    }
    m_shadingsys.m_stat_empty_instances += nlayers-m_num_used_layers;

    initialize_llvm_group ();

    // Generate the LLVM IR for each layer.  Skip unused layers.
    m_llvm_local_mem = 0;
    llvm::Function** funcs = (llvm::Function**)alloca(m_num_used_layers * sizeof(llvm::Function*));
    for (int layer = 0; layer < nlayers; ++layer) {
        set_inst (layer);
        bool lastlayer = (layer == (nlayers-1));
        int index = m_layer_remap[layer];
        if (index != -1)
            funcs[index] = build_llvm_instance (lastlayer);
    }
    // The last used layer is the entry point for the whole group.
    llvm::Function* entry_func = funcs[m_num_used_layers-1];
    m_stat_llvm_irgen_time += timer.lap();

    // Report (but do not abort on) groups exceeding the local memory
    // limit; a limit of 0 disables the check.
    if (m_shadingsys.m_max_local_mem_KB &&
        m_llvm_local_mem/1024 > m_shadingsys.m_max_local_mem_KB) {
        m_shadingsys.error ("Shader group \"%s\" needs too much local storage: %d KB",
                            m_group.name().c_str(), m_llvm_local_mem/1024);
    }

    // Optimize the LLVM IR unless it's just a ret void group (1 layer,
    // 1 BB, 1 inst == retvoid)
    bool skip_optimization = m_num_used_layers == 1 && entry_func->size() == 1 && entry_func->front().size() == 1;
    // Label the group as being retvoid or not.
    m_group.does_nothing(skip_optimization);
    if (skip_optimization) {
        m_shadingsys.m_stat_empty_groups += 1;
        m_shadingsys.m_stat_empty_instances += 1;  // the one layer is empty
    } else {
        m_llvm_passes->run (*llvm_module());
    }

    m_stat_llvm_opt_time += timer.lap();

    if (shadingsys().llvm_debug()) {
        llvm::outs() << "func after opt  = " << *entry_func << "\n";
        llvm::outs().flush();
    }

    // Debug code to dump the resulting bitcode to a file
    if (shadingsys().llvm_debug() >= 2) {
        std::string err_info;
        std::string name = Strutil::format ("%s_%d.bc",
                                            inst()->layername().c_str(),
                                            inst()->id());
        llvm::raw_fd_ostream out (name.c_str(), err_info);
        llvm::WriteBitcodeToFile (llvm_module(), out);
    }

    // Force the JIT to happen now
    RunLLVMGroupFunc f = (RunLLVMGroupFunc) m_llvm_exec->getPointerToFunction(entry_func);
    m_group.llvm_compiled_version (f);

    // Remove the IR for the group layer functions, we've already JITed it
    // and will never need the IR again.  This saves memory, and also saves
    // a huge amount of time since we won't re-optimize it again and again
    // if we keep adding new shader groups to the same Module.
    for (int i = 0; i < m_num_used_layers; ++i) {
        funcs[i]->deleteBody();
    }

    // Free the exec and module to reclaim all the memory.  This definitely
    // saves memory, and has almost no effect on runtime.
    delete m_llvm_exec;
    m_llvm_exec = NULL;

    // N.B. Destroying the EE should have destroyed the module as well.
    m_llvm_module = NULL;

    m_stat_llvm_jit_time += timer.lap();
}