/**********************************************************************
Copyright ©2012 Advanced Micro Devices, Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

•	Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
•	Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
 other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
********************************************************************/


#include "LDSBandwidth.hpp"

//Separator
std::string sep = "-----------------------------------------";
int error = 0;

int 
LDSBandwidth::setupLDSBandwidth()
{
    /**
     * Make vectorSize as 4 if -v option is 3. 
     * This memeory alignment is required as per OpenCL for type3 vectors 
     */
    if(vectorSize == 3)
    {
        vec3 = true;
        vectorSize = 4;
    }
	else if((1 != vectorSize) && (2 != vectorSize) && (4 != vectorSize) && (8 != vectorSize) && (16 != vectorSize))
	{
		std::cout << "The vectorsize can only be one of 1,2,3(4),4,8,16!" << std::endl;
		return SDK_FAILURE;
	}

    // host output 
    output = (cl_float*)malloc(length * vectorSize * sizeof(cl_float));
    CHECK_ALLOCATION(output, "Failed to allocate host memory. (output)");

    return SDK_SUCCESS;
}

int 
LDSBandwidth::genBinaryImage()
{
    streamsdk::bifData binaryData;
    binaryData.kernelName = std::string("LDSBandwidth_Kernels.cl");

    // Pass vectorSize as DATATYPE to kernel
    char buildOption[64];
    if(vectorSize == 1)
        sprintf(buildOption, "-D DATATYPE=float ");
    else
        sprintf(buildOption, "-D DATATYPE=float%d ", vec3 == true ? 3 : vectorSize);

    binaryData.flagsStr = std::string(buildOption);
    if(isComplierFlagsSpecified())
        binaryData.flagsFileName = std::string(flags.c_str());

    binaryData.binaryName = std::string(dumpBinary.c_str());
    int status = sampleCommon->generateBinaryImage(binaryData);
    return status;
}

int
LDSBandwidth::setupCL(void)
{
    cl_int status = 0;
    cl_device_type dType;
    
    if(deviceType.compare("cpu") == 0)
    {
        dType = CL_DEVICE_TYPE_CPU;
    }
    else //deviceType = "gpu" 
    {
        dType = CL_DEVICE_TYPE_GPU;
        if(isThereGPU() == false)
        {
            std::cout << "GPU not found. Falling back to CPU device" << std::endl;
            dType = CL_DEVICE_TYPE_CPU;
        }
    }

    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */

    cl_platform_id platform = NULL;
    int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled());
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed");

    // Display available devices.
    retValue = sampleCommon->displayDevices(platform, dType);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed");

    /*
     * If we could find our platform, use it. Otherwise use just available platform.
     */

    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };

    context = clCreateContextFromType(cps,
                                      dType,
                                      NULL,
                                      NULL,
                                      &status);
    CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed.");

    // getting device on which to run the sample
    status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled());
    CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed");

    //Set device info of given cl_device_id
    retValue = deviceInfo.setDeviceInfo(devices[deviceId]);
    CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed");

    std::string deviceStr(deviceInfo.deviceVersion);
    size_t vStart = deviceStr.find(" ", 0);
    size_t vEnd = deviceStr.find(" ", vStart + 1);
    std::string vStrVal = deviceStr.substr(vStart + 1, vEnd - vStart - 1);

#ifdef CL_VERSION_1_1
    if(vStrVal.compare("1.0") > 0)
    {
        char openclVersion[1024];
        status = clGetDeviceInfo(devices[deviceId],
                                 CL_DEVICE_OPENCL_C_VERSION,
                                 sizeof(openclVersion),
                                 openclVersion,
                                 0);
        CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed.");
        
        std::string tempStr(openclVersion);
        size_t dotPos = tempStr.find_first_of(".");
        size_t spacePos = tempStr.find_last_of(" ");
        tempStr = tempStr.substr(dotPos + 1, spacePos - dotPos);
        int minorVersion = atoi(tempStr.c_str());
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(minorVersion < 1 && vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
    else
    {
        // OpenCL 1.1 has inbuilt support for vec3 data types
        if(vec3 == true)
        {
            OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
        }
    }
#else
    // OpenCL 1.1 has inbuilt support for vec3 data types
    if(vec3 == true)
    {
        OPENCL_EXPECTED_ERROR("Device doesn't support built-in 3 component vectors!");
    }
#endif

    {
        // The block is to move the declaration of prop closer to its use 
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, 
                                            devices[deviceId], 
                                            prop, 
                                            &status);
        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed.");
    }

    outputBuffer = clCreateBuffer(context, 
                                  CL_MEM_WRITE_ONLY,
                                  sizeof(cl_float) * vectorSize * length,
                                  0, 
                                  &status);
    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outputBuffer)");

    // create a CL program using the kernel source 
    char buildOption[64];
    if(vectorSize == 1)
        sprintf(buildOption, "-D DATATYPE=float ");
    else
        sprintf(buildOption, "-D DATATYPE=float%d ", vec3 == true ? 3 : vectorSize);
    
    streamsdk::buildProgramData buildData;
    buildData.kernelName = std::string("LDSBandwidth_Kernels.cl");
    buildData.devices = devices;
    buildData.deviceId = deviceId;
    buildData.flagsStr = std::string(buildOption);
    if(isLoadBinaryEnabled())
        buildData.binaryName = std::string(loadBinary.c_str());

    if(isComplierFlagsSpecified())
        buildData.flagsFileName = std::string(flags.c_str());

    retValue = sampleCommon->buildOpenCLProgram(program, context, buildData);
    CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed");

    // ConstantBuffer bandwidth from single access 
    kernel[0] = clCreateKernel(program, "LDSBandwidth_single", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_single)");

    // ConstantBuffer bandwidth from linear access 
    kernel[1] = clCreateKernel(program, "LDSBandwidth_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_linear)");

    kernel[2] = clCreateKernel(program, "LDSBandwidth_single_verify", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_single_verify)");

    kernel[3] = clCreateKernel(program, "LDSBandwidth_linear_verify", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_linear_verify)");

	kernel[4] = clCreateKernel(program, "LDSBandwidth_write_linear", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_linear_verify)");

	kernel[5] = clCreateKernel(program, "LDSBandwidth_write_linear_verify", &status);
    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(LDSBandwidth_linear_verify)");
    return SDK_SUCCESS;
}

int
LDSBandwidth::bandwidth(cl_kernel &kernel)
{
    cl_int status;
    
    // Check group size against kernelWorkGroupSize 
    status = clGetKernelWorkGroupInfo(kernel,
                                      devices[deviceId],
                                      CL_KERNEL_WORK_GROUP_SIZE,
                                      sizeof(size_t),
                                      &kernelWorkGroupSize,
                                      0);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

    if(localThreads > kernelWorkGroupSize)
        localThreads = kernelWorkGroupSize;

    // Set appropriate arguments to the kernel 
    
    size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
    // Local memory
    status = clSetKernelArg(kernel,
                            0,
                            size, 
                            0);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

    // Output buffer
    status = clSetKernelArg(kernel,
                            1,
                            sizeof(cl_mem), 
                            (void *)&outputBuffer);
    CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

    // Get used local memory
    status =  clGetKernelWorkGroupInfo(kernel,
                                       devices[deviceId],
                                       CL_KERNEL_LOCAL_MEM_SIZE,
                                       sizeof(cl_ulong),
                                       &usedLocalMemory,
                                       NULL);
    CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed.");

    if(usedLocalMemory > deviceInfo.localMemSize)
    {
        std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
        return SDK_FAILURE;
    }

    double sec = 0;

    if(deviceType.compare("cpu") == 0)
        iterations = 10;

    // Run the kernel for a number of iterations 
    for(int i = 0; i < iterations; i++)
    {
        // Enqueue a kernel run call 
        cl_event ndrEvt;
        status = clEnqueueNDRangeKernel(commandQueue,
                                        kernel,
                                        1,
                                        NULL,
                                        &globalThreads,
                                        &localThreads,
                                        0,
                                        NULL,
                                        &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution 
        status = clWaitForEvents(1, &ndrEvt);
        CHECK_OPENCL_ERROR(status, "clWaitForEvents failed.");

        // Calculate performance 
        cl_ulong startTime;
        cl_ulong endTime;
        
        // Get kernel profiling info 
        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_START,
                                         sizeof(cl_ulong),
                                         &startTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)");

        status = clGetEventProfilingInfo(ndrEvt,
                                         CL_PROFILING_COMMAND_END,
                                         sizeof(cl_ulong),
                                         &endTime,
                                         0);
        CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)");

        // Cumulate time for each iteration 
        sec += 1e-9 * (endTime - startTime);

        status = clReleaseEvent(ndrEvt);
        CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(endTime)");
    }

    // Copy bytes 
    int bytesPerThread = 0;
    if(vec3 == true)
        bytesPerThread = NUM_READS * 3 * sizeof(cl_float);
    else
        bytesPerThread = NUM_READS * vectorSize * sizeof(cl_float);
    double bytes = (double)(iterations * bytesPerThread);
    double perf = (bytes / sec) * 1e-9;
    perf *= globalThreads;
    
    std::cout << ": " << perf << " GB/s" << std::endl;

    return SDK_SUCCESS;
}

int 
LDSBandwidth::runCLKernels(void)
{
    int status;
    if(vec3 == true)
        printf("\nAccessType\t: single\nVectorElements\t: %d\n", 3);
    else
        printf("\nAccessType\t: single\nVectorElements\t: %d\n", vectorSize);

    printf("Bandwidth\t");

    // Measure bandwidth of single reads from LDS 
    status = bandwidth(kernel[0]);
    if(status != SDK_SUCCESS)
        return SDK_FAILURE;

    if(verify)
    {
        // Check group size against kernelWorkGroupSize 
        status = clGetKernelWorkGroupInfo(kernel[2],
            devices[deviceId],
            CL_KERNEL_WORK_GROUP_SIZE,
            sizeof(size_t),
            &kernelWorkGroupSize,
            0);
        CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

        size_t globalThreads = length;
        size_t localThreads = kernelWorkGroupSize;

        size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
        // Local memory
        int status = clSetKernelArg(kernel[2],
            0,
            size, 
            0);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

        // Output buffer
        status = clSetKernelArg(kernel[2],
            1,
            sizeof(cl_mem), 
            (void *)&outputBuffer);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

        // Enqueue a kernel run call 
        status = clEnqueueNDRangeKernel(commandQueue,
            kernel[2],
            1,
            NULL,
            &globalThreads,
            &localThreads,
            0,
            NULL,
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution 
        status = clFinish(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFinish failed.");

        // Read back data from device to host 
        cl_float* vBuffer = (cl_float*)malloc(vectorSize * sizeof(cl_float) * length);

        status = clEnqueueReadBuffer(commandQueue,
            outputBuffer,
            1, 
            0, 
            vectorSize * sizeof(cl_float) * length, 
            vBuffer, 
            0, 
            0, 
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

        int VecElements = (vec3 == true) ? 3 : vectorSize;
        int flag = 0;
        for(int i = 0; i < (int)length; i++)
        {
            for(int k = 0; k < VecElements; k++)
            {
                if(vBuffer[i * vectorSize + k] > 1e-5)
                {
                    flag = 1;
                    break;
                }
            }
        }

        if(!flag)
        {
            std::cout << "Passed!\n" << std::endl;
            //return SDK_SUCCESS;
        }
        else
        {
            std::cout << "Failed!\n" << std::endl;
            return SDK_FAILURE;
        }

        free(vBuffer);

    }

    if(vec3 == true)
        printf("\nAccessType\t: linear\nVectorElements\t: %d\n", 3);
    else
        printf("\nAccessType\t: linear\nVectorElements\t: %d\n", vectorSize);
    printf("Bandwidth\t");

    // Measure bandwidth of linear reads from LDS 
    status = bandwidth(kernel[1]);
    if(status != SDK_SUCCESS)
        return SDK_FAILURE;

    if(verify)
    {

        // Check group size against kernelWorkGroupSize 
        status = clGetKernelWorkGroupInfo(kernel[3],
            devices[deviceId],
            CL_KERNEL_WORK_GROUP_SIZE,
            sizeof(size_t),
            &kernelWorkGroupSize,
            0);
        CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

        size_t globalThreads = length;
        size_t localThreads = kernelWorkGroupSize;

        size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
        // Local memory
        int status = clSetKernelArg(kernel[3],
            0,
            size, 
            0);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

        // Output buffer
        status = clSetKernelArg(kernel[3],
            1,
            sizeof(cl_mem), 
            (void *)&outputBuffer);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

        // Enqueue a kernel run call 
        status = clEnqueueNDRangeKernel(commandQueue,
            kernel[3],
            1,
            NULL,
            &globalThreads,
            &localThreads,
            0,
            NULL,
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution 
        status = clFinish(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFinish failed.");

        // Read back data from device to host 
        cl_float* vBuffer = (cl_float*)malloc(vectorSize * sizeof(cl_float) * length);

        status = clEnqueueReadBuffer(commandQueue,
            outputBuffer,
            1, 
            0, 
            vectorSize * sizeof(cl_float) * length, 
            vBuffer, 
            0, 
            0, 
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

        int flag = 0;
        for(int i = 0; i < (int)length; i++)
        {
            for(int k = 0; k < vectorSize; k++)
            {
                if(vBuffer[i * vectorSize + k] > 1e-5)
                {
                    flag = 1;
                    break;
                }
            }
        }

        if(!flag)
        {
            std::cout << "Passed!\n" << std::endl;
        }
        else
        {
            std::cout << "Failed!\n" << std::endl;
            return SDK_FAILURE;
        }

        FREE(vBuffer);

    }

	if(vec3 == true)
        printf("\nAccessType\t: linear write\nVectorElements\t: %d\n", 3);
    else
        printf("\nAccessType\t: linear write\nVectorElements\t: %d\n", vectorSize);

    printf("Bandwidth\t");
	status = bandwidth(kernel[4]);
	 if(status != SDK_SUCCESS)
        return SDK_FAILURE;

    if(verify)
    {
        // Check group size against kernelWorkGroupSize 
        status = clGetKernelWorkGroupInfo(kernel[5],
            devices[deviceId],
            CL_KERNEL_WORK_GROUP_SIZE,
            sizeof(size_t),
            &kernelWorkGroupSize,
            0);
        CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed.");

        size_t globalThreads = length;
        size_t localThreads = kernelWorkGroupSize;

        size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float);
        // Local memory
        int status = clSetKernelArg(kernel[5],
            0,
            size, 
            0);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)");

        // Output buffer
        status = clSetKernelArg(kernel[5],
            1,
            sizeof(cl_mem), 
            (void *)&outputBuffer);
        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)");

        // Enqueue a kernel run call 
        status = clEnqueueNDRangeKernel(commandQueue,
            kernel[5],
            1,
            NULL,
            &globalThreads,
            &localThreads,
            0,
            NULL,
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");

        // wait for the kernel call to finish execution 
        status = clFinish(commandQueue);
        CHECK_OPENCL_ERROR(status, "clFinish failed.");

        // Read back data from device to host 
        cl_float* vBuffer = (cl_float*)malloc(vectorSize * sizeof(cl_float) * length);

        status = clEnqueueReadBuffer(commandQueue,
            outputBuffer,
            1, 
            0, 
            vectorSize * sizeof(cl_float) * length, 
            vBuffer, 
            0, 
            0, 
            0);
        CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");

        int VecElements = (vec3 == true) ? 3 : vectorSize;
        int flag = 0;
        for(int i = 0; i < (int)length; i++)
        {
            for(int k = 0; k < VecElements; k++)
            {
                if(vBuffer[i * vectorSize + k] > 1e-5)
                {
                    flag = 1;
                    break;
                }
            }
        }

        if(!flag)
        {
            std::cout << "Passed!\n" << std::endl;
            //return SDK_SUCCESS;
        }
        else
        {
            std::cout << "Failed!\n" << std::endl;
            return SDK_FAILURE;
        }

        free(vBuffer);

    }
	return SDK_SUCCESS;
}


int 
LDSBandwidth::initialize()
{
    // Call base class Initialize to get default configuration 
    if(this->SDKSample::initialize() != SDK_SUCCESS)
        return SDK_FAILURE;
    
    streamsdk::Option* num_iterations = new streamsdk::Option;

    num_iterations->_sVersion = "i";
    num_iterations->_lVersion = "iterations";
    num_iterations->_description = "Number of iterations for kernel execution";
    num_iterations->_type = streamsdk::CA_ARG_INT;
    num_iterations->_value = &iterations;

    sampleArgs->AddOption(num_iterations);
    delete num_iterations;

    streamsdk::Option* num_arguments = new streamsdk::Option;

    num_arguments->_sVersion = "c";
    num_arguments->_lVersion = "components";
    num_arguments->_description = "Number of vector components to be used. Can be either 1,2,3(4),4,8,16";
    num_arguments->_type = streamsdk::CA_ARG_INT;
    num_arguments->_value = &vectorSize;

    sampleArgs->AddOption(num_arguments);
    delete num_arguments;

    return SDK_SUCCESS;
}

int 
LDSBandwidth::setup()
{
	if(iterations < 1)
	{
		std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n";
		exit(0);
	}
    if(setupLDSBandwidth() != SDK_SUCCESS)
        return SDK_FAILURE;
    
    int timer = sampleCommon->createTimer();
    sampleCommon->resetTimer(timer);
    sampleCommon->startTimer(timer);

    int status = setupCL();
    if(status != SDK_SUCCESS)
    {
        if(status == SDK_EXPECTED_FAILURE)
            return SDK_EXPECTED_FAILURE;
        
        return SDK_FAILURE;
    }
    
    sampleCommon->stopTimer(timer);
    setupTime = (cl_double)sampleCommon->readTimer(timer);

    return SDK_SUCCESS;
}


int 
LDSBandwidth::run()
{
    int timer = sampleCommon->createTimer();
    sampleCommon->resetTimer(timer);
    sampleCommon->startTimer(timer);   

    // Arguments are set and execution call is enqueued on command buffer 
    if(runCLKernels() != SDK_SUCCESS)
        return SDK_FAILURE;
    
    sampleCommon->stopTimer(timer);
    kernelTime = (double)(sampleCommon->readTimer(timer));

    return SDK_SUCCESS;
}

int 
LDSBandwidth::verifyResults()
{
    return SDK_SUCCESS;
}


int 
LDSBandwidth::cleanup()
{
    // Releases OpenCL resources (Context, Memory etc.) 
    cl_int status;

    for(int i = 0; i < NUM_KERNELS; i++)
    {
        status = clReleaseKernel(kernel[i]);
        CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.");
    }

    status = clReleaseProgram(program);
    CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.(program)"); 

    status = clReleaseMemObject(outputBuffer);
    CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.(outputBuffer)");

    status = clReleaseCommandQueue(commandQueue);
     CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.(commandQueue)");

    status = clReleaseContext(context);
    CHECK_OPENCL_ERROR(status, "clReleaseContext failed.(context)");

    // release program resources (input memory etc.) 
    FREE(output);

    // release device list 
    FREE(devices);

    return SDK_SUCCESS;
}

void LDSBandwidth::printStats()
{
    std::string strArray[3] = {"Elements", "Time(sec)", "kernelTime(sec)"};
    std::string stats[3];
    
    totalTime = setupTime + kernelTime;
    stats[0]  = sampleCommon->toString(length * vectorSize, std::dec);
    stats[1]  = sampleCommon->toString(totalTime, std::dec);
    stats[2]  = sampleCommon->toString(kernelTime, std::dec);
    
    this->SDKSample::printStats(strArray, stats, 3);
}

int 
main(int argc, char * argv[])
{
    LDSBandwidth clLDSBandwidth("OpenCL LDS Bandwidth");
    
    if(clLDSBandwidth.initialize() != SDK_SUCCESS)
        return SDK_FAILURE;
  
    if(clLDSBandwidth.parseCommandLine(argc, argv))
        return SDK_FAILURE;

    if(clLDSBandwidth.isDumpBinaryEnabled())
    {
        return clLDSBandwidth.genBinaryImage();
    }

    int status = clLDSBandwidth.setup();
    if(status != SDK_SUCCESS)
    {
        if(status == SDK_EXPECTED_FAILURE)
            return SDK_SUCCESS;
        else
            return SDK_FAILURE;
    }

    if(clLDSBandwidth.run() != SDK_SUCCESS)
        return SDK_FAILURE;

    if(clLDSBandwidth.cleanup() != SDK_SUCCESS)
        return SDK_FAILURE;

    if(clLDSBandwidth.verifyResults() != SDK_SUCCESS)
        return SDK_FAILURE;

    clLDSBandwidth.printStats();
    return SDK_SUCCESS;
}