/********************************************************************** Copyright �2012 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: � Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. � Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ #include "AtomicCounters.hpp" int AtomicCounters::setupAtomicCounters() { // Make sure length is multiples of GROUP_SIZE length = (length / GROUP_SIZE); length = length ? length * GROUP_SIZE : GROUP_SIZE; // Allocate the memory for input array input = (cl_uint*)malloc(length * sizeof(cl_uint)); CHECK_ALLOCATION(input, "Allocation failed(input)"); // Set the input data value = 2; for(cl_uint i = 0; i < length; ++i) input[i] = (cl_uint)(rand() % 5); if(!quiet) sampleCommon->printArray<cl_uint>("Input Arry", input, 256, 1); return SDK_SUCCESS; } int AtomicCounters::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("AtomicCounters_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); return status; } int AtomicCounters::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed."); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed."); cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0}; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed "); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed" ); // Check device extensions if(!strstr(deviceInfo.extensions, "cl_ext_atomic_counters_32")) OPENCL_EXPECTED_ERROR("Device does not support cl_ext_atomic_counters_32 extension!"); if(!strstr(deviceInfo.extensions, "cl_khr_local_int32_base_atomics")) OPENCL_EXPECTED_ERROR("Device does not support cl_khr_local_int32_base_atomics extension!"); // Get OpenCL device version std::string deviceVersionStr = std::string(deviceInfo.deviceVersion); size_t vStart = deviceVersionStr.find(" ", 0); size_t vEnd = deviceVersionStr.find(" ", vStart + 1); std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); // Check of OPENCL_C_VERSION if device version is 1.1 or later #ifdef CL_VERSION_1_1 if(deviceInfo.openclCVersion) { // Exit if OpenCL C device version is 1.0 deviceVersionStr = std::string(deviceInfo.openclCVersion); vStart = deviceVersionStr.find(" ", 0); vStart = deviceVersionStr.find(" ", vStart + 1); vEnd = deviceVersionStr.find(" ", vStart + 1); vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); if(vStrVal.compare("1.0") <= 0) OPENCL_EXPECTED_ERROR("Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); } else OPENCL_EXPECTED_ERROR("Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); #else OPENCL_EXPECTED_ERROR("Unsupported device! Required CL_DEVICE_OPENCL_C_VERSION as 1.1"); #endif //Setup application data if(setupAtomicCounters() != SDK_SUCCESS) return SDK_FAILURE; cl_command_queue_properties props = CL_QUEUE_PROFILING_ENABLE; commandQueue = clCreateCommandQueue(context, devices[deviceId], props, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed(commandQueue)"); // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create buffer for input array inBuf = clCreateBuffer(context, inMemFlags, length * sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(inBuf)"); // Set up data for input array cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, inBuf, CL_FALSE, 0, length * sizeof(cl_uint), input, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(inBuf) failed.."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); counterOutBuf = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(counterOutBuf)."); globalOutBuf = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_uint), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed.(globalOutBuf)."); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("AtomicCounters_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // ConstantBuffer bandwidth from single access counterKernel = clCreateKernel(program, "atomicCounters", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(counterKernel)."); globalKernel = clCreateKernel(program, "globalAtomics", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel(globalKernel) failed."); status = kernelInfoC.setKernelWorkGroupInfo(counterKernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); status = kernelInfoG.setKernelWorkGroupInfo(globalKernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); if(counterWorkGroupSize > kernelInfoC.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << counterWorkGroupSize << std::endl; std::cout << "Max Group Size supported on the kernel(readKernel) : " << kernelInfoC.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfoC.kernelWorkGroupSize << std::endl; } counterWorkGroupSize = kernelInfoC.kernelWorkGroupSize; } if(globalWorkGroupSize > kernelInfoG.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << globalWorkGroupSize << std::endl; std::cout << "Max Group Size supported on the kernel(writeKernel) : " << kernelInfoG.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfoG.kernelWorkGroupSize << std::endl; } globalWorkGroupSize = kernelInfoG.kernelWorkGroupSize; } // Wait for event and release event status = sampleCommon->waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); return SDK_SUCCESS; } int AtomicCounters::initialize() { // Call base class Initialize to get default configuration CHECK_ERROR(this->SDKSample::initialize(), SDK_SUCCESS, "OpenCL Resources Initialization failed"); streamsdk::Option* array_length = new streamsdk::Option; CHECK_ALLOCATION(array_length, "Allocation failed(array_length)"); array_length->_sVersion = "x"; array_length->_lVersion = "length"; array_length->_description = "Length of the Input array"; array_length->_type = streamsdk::CA_ARG_INT; array_length->_value = &length; sampleArgs->AddOption(array_length); delete array_length; streamsdk::Option* numLoops = new streamsdk::Option; CHECK_ALLOCATION(numLoops, "Allocation failed(numLoops)"); numLoops->_sVersion = "i"; numLoops->_lVersion = "iterations"; numLoops->_description = "Number of timing loops"; numLoops->_type = streamsdk::CA_ARG_INT; numLoops->_value = &iterations; sampleArgs->AddOption(numLoops); delete numLoops; return SDK_SUCCESS; } int AtomicCounters::setup() { int status = setupCL(); if(status != SDK_SUCCESS) { if(status == SDK_EXPECTED_FAILURE) return SDK_EXPECTED_FAILURE; return SDK_FAILURE; } return SDK_SUCCESS; } void AtomicCounters::cpuRefImplementation() { for(cl_uint i = 0; i < length; ++i) if(value == input[i]) refOut++; } int AtomicCounters::verifyResults() { if(verify) { // Calculate the reference output cpuRefImplementation(); // Compare the results and see if they match if(refOut == counterOut && refOut == globalOut) { std::cout << "Passed!\n" << std::endl; return SDK_SUCCESS; } else { std::cout << "Failed\n" << std::endl; return SDK_FAILURE; } } return SDK_SUCCESS; } int AtomicCounters::runAtomicCounterKernel() { cl_int status = CL_SUCCESS; // Set Global and Local work items size_t globalWorkItems = length; size_t localWorkItems = counterWorkGroupSize; // Initialize the counter value cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, counterOutBuf, CL_FALSE, 0, sizeof(cl_uint), &initValue, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(counterOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue)failed."); // Wait for event and release event status = sampleCommon->waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); // Set kernel arguments status = clSetKernelArg(counterKernel, 0, sizeof(cl_mem), &inBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed."); status = clSetKernelArg(counterKernel, 1, sizeof(cl_uint), &value); CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed."); status = clSetKernelArg(counterKernel, 2, sizeof(cl_mem), &counterOutBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(counterOutBuf) failed."); // Run Kernel cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, counterKernel, 1, NULL, &globalWorkItems, &localWorkItems, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(counterKernel) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); cl_int eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed."); } cl_ulong startTime; cl_ulong endTime; // Get profiling information status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed."); status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed."); double sec = 1e-9 * (endTime - startTime); kTimeAtomCounter += sec; status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Get the occurrences of Value from atomicKernel cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, counterOutBuf, CL_FALSE, 0, sizeof(cl_uint), &counterOut, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(counterOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = sampleCommon->waitForEventAndRelease(&readEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed."); return SDK_SUCCESS; } int AtomicCounters::runGlobalAtomicKernel() { cl_int status = CL_SUCCESS; // Set Global and Local work items size_t globalWorkItems = length; size_t localWorkItems = globalWorkGroupSize; // Initialize the counter value cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &initValue, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = sampleCommon->waitForEventAndRelease(&writeEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(writeEvt) failed."); // Set kernel arguments status = clSetKernelArg(globalKernel, 0, sizeof(cl_mem), &inBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(inBuf) failed."); status = clSetKernelArg(globalKernel, 1, sizeof(cl_uint), &value); CHECK_OPENCL_ERROR(status, "clSetKernelArg(value) failed."); status = clSetKernelArg(globalKernel, 2, sizeof(cl_mem), &globalOutBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(globalOutBuf) failed."); // Run Kernel cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, globalKernel, 1, NULL, &globalWorkItems, &localWorkItems, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel(globalKernel) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed."); cl_int eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( ndrEvt, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo(ndrEvt) failed."); } cl_ulong startTime; cl_ulong endTime; // Get profiling information status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_START) failed."); status = clGetEventProfilingInfo( ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo(CL_PROFILING_COMMAND_END) failed."); double sec = 1e-9 * (endTime - startTime); kTimeAtomGlobal += sec; status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Get the occurrences of Value from atomicKernel cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, globalOutBuf, CL_FALSE, 0, sizeof(cl_uint), &globalOut, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(globalOutBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for event and release event status = sampleCommon->waitForEventAndRelease(&readEvt); CHECK_OPENCL_ERROR(status, "waitForEventAndRelease(readEvt) failed."); return SDK_SUCCESS; } int AtomicCounters::run() { // Warm up Atomic counter kernel for(int i = 0; i < 2 && iterations != 1; i++) if(runAtomicCounterKernel()) return SDK_FAILURE; std::cout << "Executing Kernels for " << iterations << " iterations" << std::endl; std::cout << "-------------------------------------------" << std::endl; kTimeAtomCounter = 0; // Run the kernel for a number of iterations for(int i = 0; i < iterations; i++) if(runAtomicCounterKernel()) return SDK_FAILURE; // Compute total time kTimeAtomCounter /= iterations; if(!quiet) sampleCommon->printArray<cl_uint>("Atomic Counter Output", &counterOut, 1, 1); // Warm up Global atomics kernel for(int i = 0; i < 2 && iterations != 1; i++) if(runGlobalAtomicKernel()) return SDK_FAILURE; kTimeAtomGlobal = 0; // Run the kernel for a number of iterations for(int i = 0; i < iterations; i++) if(runGlobalAtomicKernel()) return SDK_FAILURE; // Compute total time kTimeAtomGlobal /= iterations; if(!quiet) sampleCommon->printArray<cl_uint>("Global atomics Output", &globalOut, 1, 1); return SDK_SUCCESS; } void AtomicCounters::printStats() { std::string strArray[4] = {"Elements", "Occurrences", "AtomicsCounter(sec)", "GlobalAtomics(sec)"}; std::string stats[4]; stats[0] = sampleCommon->toString(length, std::dec); stats[1] = sampleCommon->toString(counterOut, std::dec); stats[2] = sampleCommon->toString(kTimeAtomCounter, std::dec); stats[3] = sampleCommon->toString(kTimeAtomGlobal, std::dec); this->SDKSample::printStats(strArray, stats, 4); } int AtomicCounters::cleanup() { // Releases OpenCL resources (Context, Memory etc.) cl_int status; status = clReleaseMemObject(inBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(inBuf) failed."); status = clReleaseMemObject(counterOutBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(counterOutBuf) failed."); status = clReleaseMemObject(globalOutBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(globalOutBuf) failed."); status = clReleaseKernel(counterKernel); CHECK_OPENCL_ERROR(status, "clReleaseKernel(counterKernel) failed."); status = clReleaseKernel(globalKernel); CHECK_OPENCL_ERROR(status, "clReleaseKernel(globalKernel) failed."); status = clReleaseProgram(program); CHECK_OPENCL_ERROR(status, "clReleaseProgram(program) failed."); status = clReleaseCommandQueue(commandQueue); CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue(commandQueue) failed."); status = clReleaseContext(context); CHECK_OPENCL_ERROR(status, "clReleaseContext(context) failed."); free(input); return SDK_SUCCESS; } int main(int argc, char * argv[]) { int status = 0; AtomicCounters clAtomicCounters("OpenCL Atomic Counter"); if(clAtomicCounters.initialize() != SDK_SUCCESS) return SDK_FAILURE; if(clAtomicCounters.parseCommandLine(argc, argv) != SDK_SUCCESS) return SDK_FAILURE; if(clAtomicCounters.isDumpBinaryEnabled()) { return clAtomicCounters.genBinaryImage(); } status = clAtomicCounters.setup(); if(status != SDK_SUCCESS) return (status == SDK_EXPECTED_FAILURE) ? SDK_SUCCESS : SDK_FAILURE; if(clAtomicCounters.run() != SDK_SUCCESS) return SDK_FAILURE; if(clAtomicCounters.verifyResults() != SDK_SUCCESS) return SDK_FAILURE; if(clAtomicCounters.cleanup() != SDK_SUCCESS) return SDK_FAILURE; clAtomicCounters.printStats(); return SDK_SUCCESS; }