/**********************************************************************
Copyright ©2012 Advanced Micro Devices, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
 other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/


#include "BinomialOptionMultiGPU.hpp"


int
BinomialOptionMultiGPU::setupBinomialOptionMultiGPU()
{
    // Make numSamples multiple of 4
    numSamples = (numSamples / 4) ? (numSamples / 4) * 4: 4;

#if defined (_WIN32)
    randArray = (cl_float*)_aligned_malloc(numSamples * sizeof(cl_float4), 16);
#else
    randArray = (cl_float*)memalign(16, numSamples * sizeof(cl_float4));
#endif

    CHECK_ALLOCATION(randArray, "Failed to allocate host memory. (randArray)");

    for(int i = 0; i < numSamples * 4; i += 1)
    {
        randArray[i] = (float)rand() / (float)RAND_MAX;
    }

#if defined (_WIN32)
    output = (cl_float*)_aligned_malloc(numSamples * sizeof(cl_float4), 
                            16);
#else
    output = (cl_float*)memalign(16, 
                            numSamples * sizeof(cl_float4));
#endif

    CHECK_ALLOCATION(output, "Failed to allocate host memory. (output)");

    memset(output, 0, numSamples * sizeof(cl_float4));
    return SDK_SUCCESS;
}


int 
BinomialOptionMultiGPU::genBinaryImage()
{
    streamsdk::bifData binaryData;
    binaryData.kernelName = std::string("BinomialOptionMultiGPU_Kernels.cl");
    binaryData.flagsStr = std::string("");
    if(isComplierFlagsSpecified())
        binaryData.flagsFileName = std::string(flags.c_str());
    binaryData.binaryName = std::string(dumpBinary.c_str());
    int status = sampleCommon->generateBinaryImage(binaryData);
    return status;
}

int
BinomialOptionMultiGPU::setupCL()
{
    cl_int status = CL_SUCCESS;

    cl_device_type dType;

    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    //If -d option is enabled make noMultiGPUSupport to true
    if(isDeviceIdEnabled())
    {
        noMultiGPUSupport =true;
    }

    if (dType == CL_DEVICE_TYPE_CPU && !noMultiGPUSupport)
    {
        sampleCommon->error("Invalid options!!\nYou can't have MultiGPU support enabled when device type is cpu.\n"
                            "disabling multi gpu support");
        noMultiGPUSupport = true;
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */
    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    if(dType == CL_DEVICE_TYPE_GPU)
    {
        //Get Number of devices available
        status = clGetDeviceIDs(platform,
                    CL_DEVICE_TYPE_GPU, 
                    0, 
                    0, 
                    (cl_uint*)&numGPUDevices);

        CHECK_OPENCL_ERROR(status, "clGetDeviceIDs failed");

        //Get the array having all the GPU device IDs
        gpuDeviceIDs = new cl_device_id[numGPUDevices];
        CHECK_ALLOCATION(gpuDeviceIDs, "Allocation failed(gpuDeviceIDs)");

        status = clGetDeviceIDs(platform, 
                        CL_DEVICE_TYPE_GPU, 
                        numGPUDevices, 
                        gpuDeviceIDs, 
                        0);
        CHECK_OPENCL_ERROR(status, "clGetDeviceIDs failed.");
    }

    /**
     * When multiGPU support is enabled.
     * Memory allocation
     */
    if(!noMultiGPUSupport)
    {
        /**
         * We have to have one cl_command_queue per GPU
         */ 
        commandQueues = new cl_command_queue[numGPUDevices];
        CHECK_ALLOCATION(commandQueues, "Allocation failed(commandQueues)");

        programs = new cl_program[numGPUDevices];
        CHECK_ALLOCATION(programs, "Allocation failed(programs)");

        kernels = new cl_kernel[numGPUDevices];
        CHECK_ALLOCATION(kernels, "Allocation failed(kernels)");

        randBuffers = new cl_mem[numGPUDevices];
        CHECK_ALLOCATION(randBuffers, "Allocation failed(randBuffers)");

        outputBuffers = new cl_mem[numGPUDevices];
        CHECK_ALLOCATION(randBuffers, "Allocation failed(outputBuffers)!!");

        devicesInfo = new streamsdk::SDKDeviceInfo[numGPUDevices];
        CHECK_ALLOCATION(devicesInfo, "Allocation failed(devicesInfo)!!");
    }

    /**
     * To get the device info. in case of single device
     */
    streamsdk::SDKDeviceInfo currentDeviceInfo;

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */
    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");
    cl_command_queue_properties prop = 0;

    /**
    * When multiGPU support is enabled.
    */
    if (!noMultiGPUSupport)
    {
        for (int i = 0; i < numGPUDevices; i++)
        {
            commandQueues[i] = clCreateCommandQueue(context, 
                                    gpuDeviceIDs[i], 
                                    prop, 
                                    &status);
            CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
        }
    }
    else 
    {
        // Create command queue 
        commandQueue = clCreateCommandQueue(context,
                                            devices[deviceId],
                                            0,
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    /**
     * Get the device info of all the devices by using SDKDeviceInfo structure and setDeviceInfo function
     **/

    /**
     * When multiGPU support is enabled.
     */
    if (!noMultiGPUSupport)
    {
        for (int i = 0; i < numGPUDevices; i++)
        {
            status = devicesInfo[i].setDeviceInfo(gpuDeviceIDs[i]);
            if(status != SDK_SUCCESS)
            {
                std::cout << "devicesInfo[i].setDeviceInfo failed" << std::endl;
                return SDK_FAILURE;
            }
        }
    }

    /**
     * We get the device info. into currentDeviceInfo struct variable
     */
    status = currentDeviceInfo.setDeviceInfo(devices[deviceId]);
    if(status != SDK_SUCCESS)
    {
        std::cout << "currentDeviceInfo.setDeviceInfo failed " << std::endl;
        return SDK_FAILURE;
    }

    maxWorkGroupSize = currentDeviceInfo.maxWorkGroupSize;

    maxDimensions = currentDeviceInfo.maxWorkItemDims;

    maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t));

    memcpy(maxWorkItemSizes, currentDeviceInfo.maxWorkItemSizes, maxDimensions * sizeof(size_t));

    totalLocalMemory = currentDeviceInfo.localMemSize;

    /**
     * Call load balancing for work divison
     **/

    /**
     * When multiGPU support is enabled.
     */
    if (!noMultiGPUSupport)
    {
        /**
         * This function would divide the work amongst the devices 
         * according to the capability of the device.
         */
        status = loadBalancing();
        CHECK_ERROR(status, SDK_SUCCESS, "loadBalancing failed!!");
    }

    /**
     * Create and initialize memory objects
     */

    /**
    * When multiGPU support is enabled.
    */
    if (!noMultiGPUSupport)
    {
        for (int i = 0; i < numGPUDevices; i++)
        {
            // Set Presistent memory only for AMD platform
            cl_mem_flags inMemFlags = CL_MEM_READ_ONLY;

            /**
            * Use CL_MEM_USE_PERSISTENT_MEM_AMD only in case of AMD GPU
            */
            if(isAmdPlatform())
                inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD;

            // Create memory object for stock price 
            randBuffers[i] = clCreateBuffer(context,
                                        inMemFlags,
                                        numSamplesPerGPU[i] * sizeof(cl_float4),
                                        NULL,
                                        &status);
            CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffers[i])");

            // Create memory object for output array 
            outputBuffers[i] = clCreateBuffer(context,
                                       CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                                       numSamplesPerGPU[i] * sizeof(cl_float4),
                                       NULL,
                                       &status);
            CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffers[i])");
        }
    }
    else //Single GPU case
    {
        // Create memory object for stock price 
        randBuffer = clCreateBuffer(context,
                                    CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    numSamples * sizeof(cl_float4),
                                    randArray,
                                    &status);
        CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)");

        // Create memory object for output array 
        outBuffer = clCreateBuffer(context,
                                   CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
                                   numSamples * sizeof(cl_float4),
                                   output,
                                   &status);
        CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)");
    }

    // create a CL program using the kernel source 
    streamsdk::SDKFile kernelFile;
    std::string kernelPath = sampleCommon->getPath();

    if(isLoadBinaryEnabled())
    {
        kernelPath.append(loadBinary.c_str());
    }
    else 
    {
        kernelPath.append("BinomialOptionMultiGPU_Kernels.cl");
    }

    /**
    * When multiGPU support is enabled.
    */
    if(!noMultiGPUSupport)
    {
        for (int i = 0; i < numGPUDevices; i++)
        {
            // create a CL program using the kernel source 
            streamsdk::buildProgramData buildData;
            buildData.kernelName = std::string("BinomialOptionMultiGPU_Kernels.cl");
            buildData.devices = gpuDeviceIDs;
            buildData.deviceId = i;
            buildData.flagsStr = std::string("");
            if(isLoadBinaryEnabled())
                buildData.binaryName = std::string(loadBinary.c_str());

            if(isComplierFlagsSpecified())
                buildData.flagsFileName = std::string(flags.c_str());

            int retValue = sampleCommon->buildOpenCLProgram(programs[i], context, buildData);
            CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

            // get a kernel object handle for a kernel with the given name 
            kernels[i] = clCreateKernel(programs[i], "binomial_options", &status);
            CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

            // Check group-size against what is returned by kernel 
            status = clGetKernelWorkGroupInfo(kernels[i],
                                              gpuDeviceIDs[i],
                                              CL_KERNEL_WORK_GROUP_SIZE,
                                              sizeof(size_t),
                                              &kernelWorkGroupSize,
                                              0);
            CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

            // If group-size is greater than maximum supported on kernel 
            if((size_t)(numSteps + 1) > kernelWorkGroupSize)
            {
                if(!quiet)
                {
                    std::cout << "Out of Resources!" << std::endl;
                    std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
                    std::cout << "Max Group Size supported on the kernel : " 
                              << kernelWorkGroupSize << std::endl;
                    std::cout << "Using appropiate group-size." << std::endl;
                    std::cout << "-------------------------------------------" << std::endl;
                }
                numSteps = (cl_int)kernelWorkGroupSize - 2;
            }
        }

    }
    else
    {
         // create a CL program using the kernel source 
        streamsdk::buildProgramData buildData;
        buildData.kernelName = std::string("BinomialOptionMultiGPU_Kernels.cl");
        buildData.devices = devices;
        buildData.deviceId = deviceId;
        buildData.flagsStr = std::string("");
        if(isLoadBinaryEnabled())
            buildData.binaryName = std::string(loadBinary.c_str());

        if(isComplierFlagsSpecified())
            buildData.flagsFileName = std::string(flags.c_str());

        int retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
        CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

        // get a kernel object handle for a kernel with the given name 
        kernel = clCreateKernel(program,
                                "binomial_options",
                                &status);
        CHECK_OPENCL_ERROR(status, "clCreateKernel failed.");

        // Get kernel work group size 
        status = clGetKernelWorkGroupInfo(kernel,
                                          devices[deviceId],
                                          CL_KERNEL_WORK_GROUP_SIZE,
                                          sizeof(size_t),
                                          &kernelWorkGroupSize,
                                          0);
        CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");
        // If group-size is gerater than maximum supported on kernel 
        if((size_t)(numSteps + 1) > kernelWorkGroupSize)
        {
            if(!quiet)
            {
                std::cout << "Out of Resources!" << std::endl;
                std::cout << "Group Size specified : " << (numSteps + 1) << std::endl;
                std::cout << "Max Group Size supported on the kernel : " 
                          << kernelWorkGroupSize << std::endl;
                std::cout << "Using appropiate group-size." << std::endl;
                std::cout << "-------------------------------------------" << std::endl;
            }
            numSteps = (cl_int)kernelWorkGroupSize - 2;
        }
    }

    return SDK_SUCCESS;
}


int 
BinomialOptionMultiGPU::runCLKernels()
{
    cl_int status;
    cl_event ndrEvt;
    cl_int eventStatus = CL_QUEUED;

    cl_event inMapEvt;
    void* mapPtr = clEnqueueMapBuffer(commandQueue, 
                                      randBuffer, 
                                      CL_FALSE, 
                                      CL_MAP_WRITE, 
                                      0,
                                      numSamples * sizeof(cl_float4),
                                      0,
                                      NULL,
                                      &inMapEvt,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (inputBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&inMapEvt);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt) Failed");
            
    memcpy(mapPtr, 
        randArray, 
        numSamples * sizeof(cl_float4));

    cl_event inUnmapEvent;

    status = clEnqueueUnmapMemObject(commandQueue,
                                    randBuffer,
                                    mapPtr,
                                    0,
                                    NULL,
                                    &inUnmapEvent);
    CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject failed. (randBuffer)");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&inUnmapEvent);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(inUnmapEvent) Failed");
  
    // Set appropriate arguments to the kernel 
    status = clSetKernelArg(kernel, 
                0, 
                sizeof(int), 
                (void*)&numSteps);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(numSteps) failed.");

    status = clSetKernelArg(kernel, 
                1, 
                sizeof(cl_mem), 
                (void*)&randBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuffer) failed.");

    status = clSetKernelArg(kernel, 
                2, 
                sizeof(cl_mem), 
                (void*)&outBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(outBuffer) failed.");

    status = clSetKernelArg(kernel,
                            3,
                            (numSteps + 1) * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(callA) failed.");

    status = clSetKernelArg(kernel,
                            4,
                            numSteps * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg(callB) failed.");

    // Enqueue a kernel run call.
    size_t globalThreads[] = {numSamples * (numSteps + 1)};
    size_t localThreads[] = {numSteps + 1};

    if(localThreads[0] > maxWorkItemSizes[0] || 
        localThreads[0] > maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support"
                     "requested number of work items.";
        return SDK_FAILURE;
    }

    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_LOCAL_MEM_SIZE,
                                      sizeof(cl_ulong),
                                      &usedLocalMemory,
                                      NULL);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE) failed.");

    if(usedLocalMemory > totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device."
                  << std::endl;
        return SDK_FAILURE;
    }


    // This algorithm reduces each group of work-items to a single value on OpenCL device

    status = clEnqueueNDRangeKernel(
                    commandQueue,
                    kernel,
                    1,
                    NULL,
                    globalThreads,
                    localThreads,
                    0,
                    NULL,
                    &ndrEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush() failed.");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        ndrEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR(status, "clGetEventInfo failed.");
    }

    cl_event outMapEvt;
    cl_uint* outMapPtr = (cl_uint*)clEnqueueMapBuffer(commandQueue,
                                                   outBuffer,
                                                   CL_FALSE,
                                                   CL_MAP_READ,
                                                   0,
                                                   numSamples * sizeof(cl_float4),
                                                   0,
                                                   NULL,
                                                   &outMapEvt,
                                                   &status);
    CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(outputBuffer) failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&outMapEvt);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(outMapEvt) Failed");

    memcpy(output, 
        outMapPtr, 
        numSamples * sizeof(cl_float4));

    cl_event outUnmapEvt;
    status = clEnqueueUnmapMemObject(commandQueue,
                                     outBuffer,
                                     (void*)outMapPtr,
                                     0,
                                     NULL,
                                     &outUnmapEvt);
    CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(outputBuffer) failed.");

    status = clFlush(commandQueue);
    CHECK_OPENCL_ERROR(status, "clFlush failed.");

    status = sampleCommon->waitForEventAndRelease(&outUnmapEvt);
    CHECK_ERROR(status,SDK_SUCCESS, "WaitForEventAndRelease(outUnmapEvt) Failed");
 
    // Release NDR event
    status = clReleaseEvent(ndrEvt);
    CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed.");
    return SDK_SUCCESS;
}

int 
BinomialOptionMultiGPU::loadBalancing()
{
    /**
    * Calculating the peak GFlops of each device
    **/

    peakGflopsGPU = new cl_double[numGPUDevices];

    CHECK_ALLOCATION(peakGflopsGPU, "Allocation failed(peakGflopGPU)");

    cumulativeSumPerGPU = new cl_int[numGPUDevices];
    CHECK_ALLOCATION(cumulativeSumPerGPU, "Allocation failed(cumulativeSumPerGPU)");

    for (int i = 0; i < numGPUDevices; i++)
    {
        cl_int numComputeUnits = devicesInfo[i].maxComputeUnits;

        cl_int maxClockFrequency = devicesInfo[i].maxClockFrequency;

        char *deviceName = devicesInfo[i].name;
        
        /**
        * In cayman device we have four processing elements in a stream processor
        **/
        int numProcessingElts;
        if (!strcmp(deviceName, "Cayman"))
        {
            numProcessingElts = 4;
        }
        else 
        {
            numProcessingElts = 5;
        }
        /**
        * We have 16 stream processors per compute unit and numProcessingelts number of 
        * processing elements in a stream processor. A processing elt can execute 2 floating point operations
        * per cycle. So the peakGflops fomula would be 
        *
        *   peakGflops = numComputeUnits * numStreamProcessor per Compute Unit * numProcessingElt per stream processor
        *                       * maxClockFrequency * num Floating Point operations per cycle per processing elt;
        *                       We have to divide the result by 1000 to get the Gflops as freq is in MHz.
        *
        **/
        peakGflopsGPU[i] = (numComputeUnits * 16 * 
                            numProcessingElts * maxClockFrequency * 2) / 1000;
    }

    //Work sharing
    double totalPeakGflops = 0;//Sum of peakGflops of all the devices
    for (int i = 0; i < numGPUDevices; i++)
    {
        totalPeakGflops += peakGflopsGPU[i];
    }

    /**
    * ratios function will be used in load balancing statically
    **/
    double *ratios = new cl_double[numGPUDevices];
    CHECK_ALLOCATION(ratios, "Allocation failed!!(ratios)");

    numSamplesPerGPU = new cl_int[numGPUDevices];//Divison of input 
    CHECK_ALLOCATION(numSamplesPerGPU, "Allocation failed!!(numSamplesPerGPU)");

    int cumulativeSumSamples = 0;

    for (int i = 0; i < numGPUDevices; i++)
    {
        ratios[i] = peakGflopsGPU[i] / totalPeakGflops;
        numSamplesPerGPU[i] = static_cast<cl_int>(numSamples * ratios[i]);
        cumulativeSumSamples = cumulativeSumSamples + numSamplesPerGPU[i];
        cumulativeSumPerGPU[i] = cumulativeSumSamples;

        /**
        * There can be possibility that some values are missed. To avoid, add the missing values to the last GPU
        **/
        if (i == numGPUDevices - 1 && cumulativeSumSamples < numSamples)
        {
            numSamplesPerGPU[i] = numSamplesPerGPU[i] + 
                                  (numSamples - cumulativeSumSamples);
            cumulativeSumPerGPU[i] = numSamples;
        }
    }

    if(ratios)
    {
        delete []ratios;
        ratios = NULL;
    }

    return SDK_SUCCESS;
}

/*
 * Reduces the input array (in place)
 * length specifies the length of the array
 */
int 
BinomialOptionMultiGPU::binomialOptionMultiGPUCPUReference()
{
    refOutput = (float*)malloc(numSamples * sizeof(cl_float4));
    CHECK_ALLOCATION(refOutput, "Failed to allocate host memory. (refOutput)");

    float* stepsArray = (float*)malloc((numSteps + 1) * sizeof(cl_float4));
    CHECK_ALLOCATION(stepsArray, "Failed to allocate host memory. (stepsArray)");

    // Iterate for all samples
    for(int bid = 0; bid < numSamples; ++bid)
    {
        float s[4];
        float x[4];
        float vsdt[4];
        float puByr[4];
        float pdByr[4];
        float optionYears[4];

        float inRand[4];

        for(int i = 0; i < 4; ++i)
        {
            inRand[i] = randArray[bid + i];
            s[i] = (1.0f - inRand[i]) * 5.0f + inRand[i] * 30.f;
            x[i] = (1.0f - inRand[i]) * 1.0f + inRand[i] * 100.f;
            optionYears[i] = (1.0f - inRand[i]) * 0.25f + inRand[i] * 10.f; 
            float dt = optionYears[i] * (1.0f / (float)numSteps);
            vsdt[i] = VOLATILITY * sqrtf(dt);
            float rdt = RISKFREE * dt;
            float r = expf(rdt);
            float rInv = 1.0f / r;
            float u = expf(vsdt[i]);
            float d = 1.0f / u;
            float pu = (r - d)/(u - d);
            float pd = 1.0f - pu;
            puByr[i] = pu * rInv;
            pdByr[i] = pd * rInv;
        }

        /**
         * Compute values at expiration date:
         * Call option value at period end is v(t) = s(t) - x
         * If s(t) is greater than x, or zero otherwise...
         * The computation is similar for put options
         */
        for(int j = 0; j <= numSteps; j++)
        {
            for(int i = 0; i < 4; ++i)
            {
                float profit = s[i] * expf(vsdt[i] * (2.0f * j - numSteps)) - x[i];
                stepsArray[j * 4 + i] = profit > 0.0f ? profit : 0.0f;
            }
        }

        /**
         * walk backwards up on the binomial tree of depth numSteps
         * Reduce the price step by step
         */
        for(int j = numSteps; j > 0; --j)
        {
            for(int k = 0; k <= j - 1; ++k)
            {
                for(int i = 0; i < 4; ++i)
                {
					int index_k = k * 4 + i;
					int index_k_1 = (k + 1) * 4 + i;
                    stepsArray[index_k] = pdByr[i] * stepsArray[index_k_1] + puByr[i] * stepsArray[index_k];
                }
            }   
        }

        //Copy the root to result
        refOutput[bid] = stepsArray[0];
    }
    
    if(stepsArray)
    {
        free(stepsArray);
        stepsArray = NULL;
    }

    return SDK_SUCCESS;
}

int BinomialOptionMultiGPU::initialize()
{
    // Call base class Initialize to get default configuration
    CHECK_ERROR(this->SDKSample::initialize(), SDK_SUCCESS, "OpenCL Resource Intilization failed");
    
    streamsdk::Option* num_samples = new streamsdk::Option;
    CHECK_ALLOCATION(num_samples, "Error. Failed to allocate memory (num_samples)\n");

    num_samples->_sVersion = "x";
    num_samples->_lVersion = "samples";
    num_samples->_description = "Number of samples to be calculated";
    num_samples->_type = streamsdk::CA_ARG_INT;
    num_samples->_value = &numSamples;

    sampleArgs->AddOption(num_samples);

    delete num_samples;
    num_samples = NULL;

    streamsdk::Option* num_iterations = new streamsdk::Option;
    CHECK_ALLOCATION(num_iterations, "Error. Failed to allocate memory (num_iterations)\n");

    num_iterations->_sVersion = "i";
    num_iterations->_lVersion = "iterations";
    num_iterations->_description = "Number of iterations for kernel execution";
    num_iterations->_type = streamsdk::CA_ARG_INT;
    num_iterations->_value = &iterations;

    sampleArgs->AddOption(num_iterations);
    delete num_iterations;
    num_iterations = NULL;

    return SDK_SUCCESS;
}

int BinomialOptionMultiGPU::setup()
{
    CHECK_ERROR(setupBinomialOptionMultiGPU(), SDK_SUCCESS, "OpenCL Resource BinomialOptionMultiGPU::setup failed");

    int timer = sampleCommon->createTimer();
    sampleCommon->resetTimer(timer);
    sampleCommon->startTimer(timer);

    CHECK_ERROR(setupCL(), SDK_SUCCESS, "OpenCL Resource setup failed");

    sampleCommon->stopTimer(timer);
    // Compute setup time
    setupTime = (double)(sampleCommon->readTimer(timer));

    return SDK_SUCCESS;
}

/**
* This structure is used in multi threading 
*/
struct dataPerGPU
{
    BinomialOptionMultiGPU *boObj;
    int deviceNumber;
};

void* threadFuncPerGPU(void *data1)
{
    dataPerGPU *data = (dataPerGPU *)data1;
    int deviceNumber = data->deviceNumber;
    BinomialOptionMultiGPU *boObj = data->boObj;
    cl_int status;
    cl_event ndrEvt;
    cl_int eventStatus = CL_QUEUED;

    cl_event inMapEvt;
    void* mapPtr = clEnqueueMapBuffer(boObj->commandQueues[deviceNumber], 
                                      boObj->randBuffers[deviceNumber], 
                                      CL_FALSE, 
                                      CL_MAP_WRITE, 
                                      0,
                                      boObj->numSamplesPerGPU[deviceNumber] * sizeof(cl_float4),
                                      0,
                                      NULL,
                                      &inMapEvt,
                                      &status);

    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clEnqueueMapBuffer failed !!");

    status = clFlush(boObj->commandQueues[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clFlush failed !!");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        inMapEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetEventInfo failed !!");
    }

    status = clReleaseEvent(inMapEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clReleaseEvent failed !!");

    memcpy(mapPtr,
        boObj->randArray + (boObj->cumulativeSumPerGPU[deviceNumber] - boObj->numSamplesPerGPU[deviceNumber]) * 4,
        (boObj->numSamplesPerGPU[deviceNumber] * sizeof(cl_float4)));

    cl_event inUnmapEvent;

    status = clEnqueueUnmapMemObject(boObj->commandQueues[deviceNumber],
                                    boObj->randBuffers[deviceNumber],
                                    mapPtr,
                                    0,
                                    NULL,
                                    &inUnmapEvent);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clEnqueueUnmapMemObject failed !!");

    status = clFlush(boObj->commandQueues[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clFlush failed !!");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        inUnmapEvent, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetEventInfo failed !!");
    }

    status = clReleaseEvent(inUnmapEvent);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clReleaseEvent failed !!");

    // Set appropriate arguments to the kernel 
    status = clSetKernelArg(boObj->kernels[deviceNumber], 
                0, 
                sizeof(int), 
                (void*)&boObj->numSteps);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clSetKernelArg failed !!");

    status = clSetKernelArg(boObj->kernels[deviceNumber], 
                1, 
                sizeof(cl_mem), 
                (void*)&boObj->randBuffers[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clSetKernelArg failed !!");

    status = clSetKernelArg(boObj->kernels[deviceNumber], 
                2, 
                sizeof(cl_mem), 
                (void*)&boObj->outputBuffers[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clSetKernelArg failed !!");

    status = clSetKernelArg(boObj->kernels[deviceNumber],
                            3,
                            (boObj->numSteps + 1) * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clSetKernelArg failed !!");

    status = clSetKernelArg(boObj->kernels[deviceNumber],
                            4,
                            boObj->numSteps * sizeof(cl_float4),
                            NULL);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clSetKernelArg failed !!");

    // Enqueue a kernel run call.
    /**
    * numsamples is divided amongst the available devices in loadBalancing function.
    */
    size_t globalThreads[] = {(boObj->numSamplesPerGPU[deviceNumber]) * 
                             (boObj->numSteps + 1)};

    size_t localThreads[] = {boObj->numSteps + 1};

    if(localThreads[0] > boObj->devicesInfo[deviceNumber].maxWorkItemSizes[0] || 
        localThreads[0] > boObj->devicesInfo[deviceNumber].maxWorkGroupSize)
    {
        std::cout << "Unsupported: Device does not support"
                     "requested number of work items.";
        return NULL;
    }

    status = clGetKernelWorkGroupInfo(boObj->kernels[deviceNumber],
                                      boObj->gpuDeviceIDs[deviceNumber],
                                      CL_KERNEL_LOCAL_MEM_SIZE,
                                      sizeof(cl_ulong),
                                      &boObj->usedLocalMemory,
                                      NULL);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetKernelWorkGroupInfo failed !!");

    if(boObj->usedLocalMemory > boObj->totalLocalMemory)
    {
        std::cout << "Unsupported: Insufficient local memory on device."
                  << std::endl;
        return NULL;
    }


    /** 
     * This algorithm reduces each group of work-items to a single value
     * on OpenCL device
     */
    status = clEnqueueNDRangeKernel(
                    boObj->commandQueues[deviceNumber],
                    boObj->kernels[deviceNumber],
                    1,
                    NULL,
                    globalThreads,
                    localThreads,
                    0,
                    NULL,
                    &ndrEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clEnqueueNDRangeKernel failed !!");

    status = clFlush(boObj->commandQueues[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clFlush failed !!");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        ndrEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetEventInfo failed !!");
    }

    cl_event outMapEvt;
    cl_uint* outMapPtr = (cl_uint*)clEnqueueMapBuffer(boObj->commandQueues[deviceNumber],
                                                   boObj->outputBuffers[deviceNumber],
                                                   CL_FALSE,
                                                   CL_MAP_READ,
                                                   0,
                                                   (boObj->numSamplesPerGPU[deviceNumber]) * sizeof(cl_float4),
                                                   0,
                                                   NULL,
                                                   &outMapEvt,
                                                   &status);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clEnqueueMapBuffer failed !!");

    status = clFlush(boObj->commandQueues[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clFlush failed !!");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        outMapEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetEventInfo failed !!");
    }

    status = clReleaseEvent(outMapEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clReleaseEvent failed !!");

    memcpy(boObj->output + ((boObj->cumulativeSumPerGPU[deviceNumber] - boObj->numSamplesPerGPU[deviceNumber]) * 4), 
        outMapPtr, 
        (boObj->numSamplesPerGPU[deviceNumber] * sizeof(cl_float4)));

    cl_event outUnmapEvt;
    status = clEnqueueUnmapMemObject(boObj->commandQueues[deviceNumber],
                                     boObj->outputBuffers[deviceNumber],
                                     (void*)outMapPtr,
                                     0,
                                     NULL,
                                     &outUnmapEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clEnqueueUnmapMemObject failed !!");

    status = clFlush(boObj->commandQueues[deviceNumber]);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clFlush failed !!");

    eventStatus = CL_QUEUED;
    while(eventStatus != CL_COMPLETE)
    {
        status = clGetEventInfo(
                        outUnmapEvt, 
                        CL_EVENT_COMMAND_EXECUTION_STATUS, 
                        sizeof(cl_int),
                        &eventStatus,
                        NULL);
        CHECK_OPENCL_ERROR_RETURN_NULL(status, "clGetEventInfo failed !!");
    }

    status = clReleaseEvent(outUnmapEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clReleaseEvent failed !!");

    // Release NDR event
    status = clReleaseEvent(ndrEvt);
    CHECK_OPENCL_ERROR_RETURN_NULL(status, "clReleaseEvent failed !!");
    return NULL;
}

int 
BinomialOptionMultiGPU::runCLKernelsMultiGPU()
{
    streamsdk::SDKThread *threads = new streamsdk::SDKThread[numGPUDevices];
    CHECK_ALLOCATION(threads, "Allocation failed!!");

    /**
    * Creating one thread per GPU
    */
    dataPerGPU *data = new dataPerGPU[numGPUDevices];
    for (int i = 0 ; i < numGPUDevices; i++)
    {
        data[i].deviceNumber = i; 
        data[i].boObj = this;
        threads[i].create(threadFuncPerGPU, 
                        (void *) &data[i]);        
    }
    /**
    * Call join() function for synchronization
    * Main thread will wait for each thread to get completed. 
    */
    for (int i = 0; i < numGPUDevices; i++)
    {
        threads[i].join();
    }

    delete []data;
    delete []threads;
    return SDK_SUCCESS;
}

int BinomialOptionMultiGPU::run()
{
    // Warm up
    for(int i = 0; i < 2 && iterations != 1; i++)
    {
        if(!noMultiGPUSupport)
        {
            CHECK_ERROR(runCLKernelsMultiGPU(), SDK_SUCCESS, "OpenCL noMultiGPUSupport failed");
        }
        else 
        {
            CHECK_ERROR(runCLKernels(), SDK_SUCCESS, "OpenCL Run failed");
        }
    }

    std::cout << "Executing kernel for " << iterations 
              << " iterations" << std::endl;
    std::cout << "-------------------------------------------" 
              << std::endl;

    int timer = sampleCommon->createTimer();
    sampleCommon->resetTimer(timer);
    sampleCommon->startTimer(timer);   

    for(int i = 0; i < iterations; i++)
    {
        if(!noMultiGPUSupport)
        {
             CHECK_ERROR(runCLKernelsMultiGPU(), SDK_SUCCESS, "OpenCL noMultiGPUSupport failed");
        } 
        else 
        {
            CHECK_ERROR(runCLKernels(), SDK_SUCCESS, "OpenCL Run failed");
        }
    }

    sampleCommon->stopTimer(timer);

    // Compute average kernel time
    kernelTime = (double)(sampleCommon->readTimer(timer)) / iterations;

    if(!quiet)
        sampleCommon->printArray<cl_float>("Output", 
                        output, 
                        numSamples, 
                        1);
    return SDK_SUCCESS;
}

int BinomialOptionMultiGPU::verifyResults()
{
    if(verify)
    {
        /* reference implementation
         * it overwrites the input array with the output
         */
        int result = SDK_SUCCESS;
        result = binomialOptionMultiGPUCPUReference();
        CHECK_ERROR(result, SDK_SUCCESS, "OpenCL verifyResults failed");

        // compare the results and see if they match
        if(sampleCommon->compare(output, 
                            refOutput, 
                            numSamples, 
                            0.001f))
        {
            std::cout << "Passed!\n" << std::endl;
            return SDK_SUCCESS;
        }
        else
        {
            std::cout <<" Failed\n" << std::endl;
            return SDK_FAILURE;
        }
    }
    return SDK_SUCCESS;
}

void BinomialOptionMultiGPU::printStats()
{
    std::string strArray[4] = 
    {
        "Option Samples", 
        "Time(sec)", 
        "KernelTime(sec)" , 
        "Options/sec"
    };

    totalTime = setupTime + kernelTime;
    std::string stats[4];
    stats[0] = sampleCommon->toString(numSamples, 
                                std::dec);
    stats[1] = sampleCommon->toString(totalTime, 
                                std::dec);
    stats[2] = sampleCommon->toString(kernelTime, 
                                std::dec);
    stats[3] = sampleCommon->toString(numSamples/totalTime, 
                                std::dec);
    this->SDKSample::printStats(strArray, 
                                stats, 
                                4);
}

int
BinomialOptionMultiGPU::cleanup()
{
    // Releases OpenCL resources (Context, Memory etc.)
    cl_int status;

    if(noMultiGPUSupport)
    {
        status = clReleaseKernel(kernel);
        CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.");

        status = clReleaseProgram(program);
        CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");

        status = clReleaseMemObject(randBuffer);
        CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

        status = clReleaseMemObject(outBuffer);
        CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed!!");

        status = clReleaseCommandQueue(commandQueue);
        CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed!!");

        status = clReleaseContext(context);
        CHECK_OPENCL_ERROR(status, "clReleaseContext failed!!");
    }
    else 
    {
        for (int i = 0; i < numGPUDevices; i++)
        {
            status = clReleaseKernel(kernels[i]);
            CHECK_OPENCL_ERROR(status, "clReleaseKernel failed!!");

            status = clReleaseProgram(programs[i]);
            CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");

            status = clReleaseMemObject(randBuffers[i]);
            CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

            status = clReleaseMemObject(outputBuffers[i]);
            CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");

            status = clReleaseCommandQueue(commandQueues[i]);
            CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.");
        }
        status = clReleaseContext(context);
        CHECK_OPENCL_ERROR(status, "clReleaseContext failed.");
    }
    return SDK_SUCCESS;
}

BinomialOptionMultiGPU::~BinomialOptionMultiGPU()
{
     
#ifdef _WIN32
             ALIGNED_FREE(randArray);
#else
             FREE(randArray);
#endif

#ifdef _WIN32
             ALIGNED_FREE(output);
#else
             FREE(output);
#endif

     FREE(refOutput);
     FREE(devices);

     if(!noMultiGPUSupport)
     {
         if(kernels)
         {
             delete []kernels;
             kernels = NULL;
         }

         if(programs)
         {
             delete []programs;
             programs = NULL;
         }

         if(commandQueues)
         {
             delete []commandQueues;
             commandQueues = NULL;
         }

         if(peakGflopsGPU)
         {
             delete []peakGflopsGPU;
             peakGflopsGPU = NULL;
         }

         if(cumulativeSumPerGPU)
         {
             delete []cumulativeSumPerGPU;
             cumulativeSumPerGPU = NULL;
         }

         if(numSamplesPerGPU)
         {
            delete []numSamplesPerGPU;
            numSamplesPerGPU = NULL;
         }
         
         if(randBuffers)
         {
            delete []randBuffers;
            randBuffers = NULL;
         }

         if(outputBuffers)
         {
            delete []outputBuffers;
            outputBuffers = NULL;
         }

         if(devicesInfo)
         {
             delete []devicesInfo;
             devicesInfo = NULL;
         }
     }

     delete []gpuDeviceIDs;
     gpuDeviceIDs = NULL;
}

int 
main(int argc, char * argv[])
{
    BinomialOptionMultiGPU clBinomialOptionMultiGPU("OpenCL BinomialOptionMultiGPU");
    // Initialize
    if (clBinomialOptionMultiGPU.initialize() != SDK_SUCCESS)
        return SDK_FAILURE;

    if(clBinomialOptionMultiGPU.parseCommandLine(argc, argv))
        return SDK_FAILURE;

    if(clBinomialOptionMultiGPU.isDumpBinaryEnabled())
    {
        if(clBinomialOptionMultiGPU.genBinaryImage() != SDK_SUCCESS)
            return SDK_FAILURE;
    }
    // Setup
    if(clBinomialOptionMultiGPU.setup() != SDK_SUCCESS)
        return SDK_FAILURE;

    // Run
    if(clBinomialOptionMultiGPU.run() != SDK_SUCCESS)
        return SDK_FAILURE;

    // VerifyResults
    if(clBinomialOptionMultiGPU.verifyResults() != SDK_SUCCESS)
        return SDK_FAILURE;

    // Cleanup
    if (clBinomialOptionMultiGPU.cleanup() != SDK_SUCCESS)
        return SDK_FAILURE;

    clBinomialOptionMultiGPU.printStats();

    return SDK_SUCCESS;
}