// Prices the option described by optionSpec on the OpenCL device by running
// the program's "init" kernel once and then alternating the "upTriangle" and
// "downTriangle" kernels until the tree has been reduced.
//
// optionSpec: option parameters (stock/strike price, volatility, risk free
//             rate, years to maturity, number of steps, option type).
// stepSize:   number of tree levels collapsed per up/down pass; must be in
//             the range [1, 511] because each work group uses stepSize + 1
//             work items.
//
// Returns the first element of the device value buffer after the last pass.
// Terminates the process with exit code 5 if stepSize is out of range.
double OpenCLPricer::priceImplTriangle(OptionSpec& optionSpec, int stepSize) {
    // A non-positive step size would make the numSteps / stepSize divisions
    // below undefined (or yield a non-positive work group size).
    if (stepSize <= 0) {
        std::cerr << "[Error] Step size not valid. Step size must be positive"
            << std::endl;
        exit(5);
    }
    if (stepSize >= 512) {
        std::cerr << "[Error] Step size not valid."
            << "Cannot have more than 512 work items per work group" 
            << std::endl;
        exit(5);
    }

    // ------------------------Derived Parameters------------------------------
    float deltaT = optionSpec.yearsToMaturity / optionSpec.numSteps;

    float upFactor = exp(optionSpec.volatility * sqrt(deltaT));
    float downFactor = 1.0f / upFactor;

    float discountFactor = exp(optionSpec.riskFreeRate * deltaT);

    float upWeight = (discountFactor - downFactor) / (upFactor - downFactor);
    float downWeight = 1.0f - upWeight;
    
    // Create buffers on the devices (one float per node, numSteps + 1 nodes)
    cl::Buffer valueBuffer(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    cl::Buffer triangleBuffer(*context, 
                           CL_MEM_READ_WRITE,
                           sizeof(float) * (optionSpec.numSteps + 1));

    // Create queue to push commands for the devices
    cl::CommandQueue queue(*context, *defaultDevice);
    
    // Build and run init kernel 
    cl::Kernel initKernel(*program, "init");
    initKernel.setArg(0, optionSpec.stockPrice);
    initKernel.setArg(1, optionSpec.strikePrice);
    initKernel.setArg(2, optionSpec.numSteps);
    initKernel.setArg(3, optionSpec.type);
    initKernel.setArg(4, deltaT);
    initKernel.setArg(5, upFactor);
    initKernel.setArg(6, downFactor);
    initKernel.setArg(7, valueBuffer);
    queue.enqueueNDRangeKernel(initKernel, 
                              cl::NullRange, 
                              cl::NDRange(optionSpec.numSteps + 1), 
                              cl::NullRange);
    // std::cout << "[INFO] Executing init kernel with " << optionSpec.numSteps + 1
    //         << " work items" << std::endl;

    // Order the following kernels after the init kernel
    queue.enqueueBarrierWithWaitList();

    // Note(disiok): Here we use work groups of size stepSize + 1 
    // so that after each iteration, the number of nodes is reduced by stepSize
    int groupSize = stepSize + 1;

    cl::Kernel upKernel(*program, "upTriangle");
    upKernel.setArg(0, upWeight);
    upKernel.setArg(1, downWeight);
    upKernel.setArg(2, discountFactor);
    upKernel.setArg(3, valueBuffer);
    upKernel.setArg(4, cl::Local(sizeof(float) * groupSize));
    upKernel.setArg(5, triangleBuffer);

    cl::Kernel downKernel(*program, "downTriangle");
    downKernel.setArg(0, upWeight);
    downKernel.setArg(1, downWeight);
    downKernel.setArg(2, discountFactor);
    downKernel.setArg(3, valueBuffer);
    downKernel.setArg(4, cl::Local(sizeof(float) * groupSize));
    downKernel.setArg(5, triangleBuffer);

    const int numIterations = optionSpec.numSteps / stepSize;
    for (int i = 0; i < numIterations; ++i) {
        int numWorkGroupsUp = numIterations - i;
        int numWorkGroupsDown = numWorkGroupsUp - 1;
        int numWorkItemsUp = numWorkGroupsUp * groupSize;
        int numWorkItemsDown = numWorkGroupsDown * groupSize;

        // The local range must be passed as the 4th argument so that each
        // work group really has groupSize items, matching the local scratch
        // buffer size set on the kernel above.
        queue.enqueueNDRangeKernel(upKernel,
                            cl::NullRange,
                            cl::NDRange(numWorkItemsUp),
                            cl::NDRange(groupSize));
        // std::cout << "[INFO] Executing up kernel with " << numWorkGroupsUp
        //         << " work groups and " << groupSize << " work items per group"
        //         << std::endl; 

        queue.enqueueBarrierWithWaitList();

        if (numWorkGroupsDown > 0) {
            queue.enqueueNDRangeKernel(downKernel,
                    cl::NullRange,
                    cl::NDRange(numWorkItemsDown),
                    cl::NDRange(groupSize));
            // std::cout << "[INFO] Executing down kernel with " << numWorkGroupsDown
            //     << " work groups and " << groupSize << " work items per group"
            //     << std::endl; 
            queue.enqueueBarrierWithWaitList();
        }
    }

    // Read results: blocking read of the first float of the value buffer.
    // A stack variable is used so nothing is leaked per call.
    float value = 0.0f;
    queue.enqueueReadBuffer(valueBuffer, 
                            CL_TRUE, 
                            0, 
                            sizeof(float), 
                            &value);
    return value; 
}
bool wxLuaSocketBase::WriteDebugData(const wxLuaDebugData& debugData)
{
    // Debug data is written as
    // [wxInt32 debug data item count] then for each item
    //   [wxInt32 item data length]
    //   [{wxInt32 GetReference}{wxInt32 GetIndex}{wxInt32 GetFlag}
    //    {char GetName \0}{char GetType \0}{char GetValue \0}{char GetSource \0}]

    wxInt32 idx, idxMax = debugData.GetCount();

    wxLuaSocketDebugMsg(m_name + wxT(" wxLuaSocketBase::WriteDebugData"), wxString::Format(wxT("items %d"), idxMax));

    bool ok = Write((const char*)&idxMax, sizeof(wxInt32)) == sizeof(wxInt32);

    for (idx = 0; ok && (idx < idxMax); ++idx)
    {
        const wxLuaDebugItem *item = debugData.Item(idx);

        wxLuaCharBuffer keyBuffer(item->GetKey());
        wxLuaCharBuffer valueBuffer(item->GetValue());
        wxLuaCharBuffer sourceBuffer(item->GetSource());

        int keyLength    = keyBuffer.Length() + 1; // add 1 for terminating \0
        int valueLength  = valueBuffer.Length() + 1;
        int sourceLength = sourceBuffer.Length() + 1;

        wxInt32 bufferLength = (5 * sizeof(wxInt32)) +
                                keyLength + valueLength + sourceLength;

        unsigned char *pBuffer = new unsigned char[bufferLength];
        unsigned char *pMemory = pBuffer;

        ok = Write((const char*)&bufferLength, sizeof(wxInt32)) == sizeof(wxInt32);
        if (!ok) break;

        *(wxInt32 *) pMemory = (wxInt32)item->GetRef();
        pMemory += sizeof(wxInt32);

        *(wxInt32 *) pMemory = (wxInt32)item->GetIndex();
        pMemory += sizeof(wxInt32);

        *(wxInt32 *) pMemory = (wxInt32)item->GetFlag();
        pMemory += sizeof(wxInt32);

        *(wxInt32 *) pMemory = (wxInt32)item->GetKeyType();
        pMemory += sizeof(wxInt32);

        *(wxInt32 *) pMemory = (wxInt32)item->GetValueType();
        pMemory += sizeof(wxInt32);

        memcpy(pMemory, keyBuffer.GetData(), keyLength);
        pMemory += keyLength;

        memcpy(pMemory, valueBuffer.GetData(), valueLength);
        pMemory += valueLength;

        memcpy(pMemory, sourceBuffer.GetData(), sourceLength);

        ok = Write((const char *) pBuffer, bufferLength) == bufferLength;

        delete[] pBuffer;
    }

    return ok;
}