/********************************************************************** Copyright ©2012 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ #include "MonteCarloAsianDP.hpp" #include #include /* * structure for attributes of Monte calro * simulation */ typedef struct _MonteCalroAttrib { cl_double4 strikePrice; cl_double4 c1; cl_double4 c2; cl_double4 c3; cl_double4 initPrice; cl_double4 sigma; cl_double4 timeStep; }MonteCarloAttrib; int MonteCarloAsianDP::setupMonteCarloAsianDP() { steps = (steps < 4) ? 4 : steps; steps = (steps / 2) * 2; int i = 0; const cl_double finalValue = 0.8; const cl_double stepValue = finalValue / (cl_double)steps; // Allocate and init memory used by host sigma = (cl_double*)malloc(steps * sizeof(cl_double)); CHECK_ALLOCATION(sigma, "Failed to allocate host memory. (sigma)"); sigma[0] = 0.01; for(i = 1; i < steps; i++) { sigma[i] = sigma[i - 1] + stepValue; } price = (cl_double*) malloc(steps * sizeof(cl_double)); CHECK_ALLOCATION(price, "Failed to allocate host memory. (price)"); memset((void*)price, 0, steps * sizeof(cl_double)); vega = (cl_double*) malloc(steps * sizeof(cl_double)); CHECK_ALLOCATION(vega, "Failed to allocate host memory. (vega)"); memset((void*)vega, 0, steps * sizeof(cl_double)); refPrice = (cl_double*) malloc(steps * sizeof(cl_double)); CHECK_ALLOCATION(refPrice, "Failed to allocate host memory. (refPrice)"); memset((void*)refPrice, 0, steps * sizeof(cl_double)); refVega = (cl_double*) malloc(steps * sizeof(cl_double)); CHECK_ALLOCATION(refVega, "Failed to allocate host memory. (refVega)"); memset((void*)refVega, 0, steps * sizeof(cl_double)); // Set samples and exercize points noOfSum = 12; noOfTraj = 1024; width = noOfTraj / 4; height = noOfTraj / 2; #if defined (_WIN32) randNum = (cl_uint*)_aligned_malloc(width * height * sizeof(cl_uint4), 16); #else randNum = (cl_uint*)memalign(16, width * height * sizeof(cl_uint4)); #endif CHECK_ALLOCATION(randNum, "Failed to allocate host memory. (randNum)"); priceVals = (cl_double*)malloc(width * height * 2 * sizeof(cl_double4)); CHECK_ALLOCATION(priceVals, "Failed to allocate host memory. (priceVals)"); memset((void*)priceVals, 0, width * height * 2 * sizeof(cl_double4)); priceDeriv = (cl_double*)malloc(width * height * 2 * sizeof(cl_double4)); CHECK_ALLOCATION(priceDeriv, "Failed to allocate host memory. (priceDeriv)"); memset((void*)priceDeriv, 0, width * height * 2 * sizeof(cl_double4)); /* * Unless quiet mode has been enabled, print the INPUT array. * No more than 256 values are printed because it clutters the screen * and it is not practical to manually compare a large set of numbers */ if(!quiet) { sampleCommon->printArray("sigma values", sigma, steps, 1); } return SDK_SUCCESS; } int MonteCarloAsianDP::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("MonteCarloAsianDP_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); return status; } int MonteCarloAsianDP::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); status = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status,"clCreateCommandQueue failed."); } std::string buildOptions = std::string(""); // Check if cl_khr_fp64 extension is supported if(strstr(deviceInfo.extensions, "cl_khr_fp64")) { buildOptions.append("-D KHR_DP_EXTENSION"); } else { /* Check if cl_amd_fp64 extension is supported */ if(!strstr(deviceInfo.extensions, "cl_amd_fp64")) { sampleCommon->expectedError("Device does not support cl_amd_fp64 extension!"); return SDK_EXPECTED_FAILURE; } } // Exit if SDK version is 2.2 or less std::string deviceVersionStr = std::string(deviceInfo.deviceVersion); size_t vStart = deviceVersionStr.find_last_of("v"); size_t vEnd = deviceVersionStr.find(" ", vStart); std::string vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); if(vStrVal.compare("2.2") <= 0 && dType == CL_DEVICE_TYPE_GPU) { sampleCommon->expectedError("Few double math functions are not supported in SDK2.2 or less!"); return SDK_EXPECTED_FAILURE; } // Exit if OpenCL version is 1.0 vStart = deviceVersionStr.find(" ", 0); vEnd = deviceVersionStr.find(" ", vStart + 1); vStrVal = deviceVersionStr.substr(vStart + 1, vEnd - vStart - 1); if(vStrVal.compare("1.0") <= 0 && dType == CL_DEVICE_TYPE_GPU) { sampleCommon->expectedError("Unsupported device!"); return SDK_EXPECTED_FAILURE; } // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; randBuf = clCreateBuffer(context, inMemFlags, sizeof(cl_uint4) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(randBuf)"); randBufAsync = clCreateBuffer(context, inMemFlags, sizeof(cl_uint4) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(randBufAsync) failed."); priceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(priceBuf)"); priceDerivBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(priceDerivBuf)"); priceBufAsync = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(priceBufAsync)"); priceDerivBufAsync = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_double4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status,"clCreateBuffer failed.(priceDerivBufAsync)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("MonteCarloAsianDP_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "calPriceVega", &status); CHECK_OPENCL_ERROR(status,"clCreateKernel failed."); // Settinf the kernel Information status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); if((blockSizeX * blockSizeY) > kernelInfo.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfo.kernelWorkGroupSize << std::endl; } // Three possible cases if(blockSizeX > kernelInfo.kernelWorkGroupSize) { blockSizeX = kernelInfo.kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; } int MonteCarloAsianDP::runCLKernels(void) { cl_int status; cl_int eventStatus = CL_QUEUED; size_t globalThreads[2] = {width, height}; size_t localThreads[2] = {blockSizeX, blockSizeY}; /* * Declare attribute structure */ MonteCarloAttrib attributes; if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] || localThreads[1] > deviceInfo.maxWorkItemSizes[1] || (size_t)blockSizeX * blockSizeY > deviceInfo.maxWorkGroupSize) { std::cout << "Unsupported: Device does not support requested" ":number of work items."; return SDK_FAILURE; } status = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&noOfSum); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (noOfSum)"); status = clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&width); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width)"); double timeStep = maturity / (noOfSum - 1); // Initialize random number generator srand(1); void* inMapPtr1 = NULL; void* inMapPtr2 = NULL; void* outMapPtr11 = NULL; void* outMapPtr12 = NULL; void* outMapPtr21 = NULL; void* outMapPtr22 = NULL; cl_double* ptr21 = NULL; cl_double* ptr22 = NULL; cl_event inMapEvt1; cl_event inMapEvt2; cl_event inUnmapEvt1; cl_event inUnmapEvt2; cl_event outMapEvt11; cl_event outMapEvt12; cl_event outUnmapEvt11; cl_event outUnmapEvt12; cl_event outMapEvt21; cl_event outMapEvt22; cl_event outUnmapEvt21; cl_event outUnmapEvt22; cl_event ndrEvt; size_t inSize = width * height * sizeof(cl_uint4); size_t outSize = width * height * sizeof(cl_double4); for(int k = 0; k < steps / 2; k++) { // Map input buffer for kernel 1 inMapPtr1 = clEnqueueMapBuffer( commandQueue, randBuf, CL_FALSE, CL_MAP_WRITE, 0, inSize, 0, NULL, &inMapEvt1, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data for input for kernel 1 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); status = sampleCommon->waitForEventAndRelease(&inMapEvt1); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt1) Failed"); memcpy(inMapPtr1, (void*)randNum, inSize); // Unmap of input buffer of kernel 1 status = clEnqueueUnmapMemObject( commandQueue, randBuf, inMapPtr1, 0, NULL, &inUnmapEvt1); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Get data from output buffers of kernel 2 if(k != 0) { // Wait for kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); outMapPtr21 = clEnqueueMapBuffer( commandQueue, priceBufAsync, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt21, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBufAsync) failed."); outMapPtr22 = clEnqueueMapBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt22, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Set up arguments required for kernel 1 double c1 = (interest - 0.5 * sigma[k * 2] * sigma[k * 2]) * timeStep; double c2 = sigma[k * 2] * sqrt(timeStep); double c3 = (interest + 0.5 * sigma[k * 2] * sigma[k * 2]); const cl_double4 c1F4 = {c1, c1, c1, c1}; attributes.c1 = c1F4; const cl_double4 c2F4 = {c2, c2, c2, c2}; attributes.c2 = c2F4; const cl_double4 c3F4 = {c3, c3, c3, c3}; attributes.c3 = c3F4; const cl_double4 initPriceF4 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF4; const cl_double4 strikePriceF4 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF4; const cl_double4 sigmaF4 = {sigma[k * 2], sigma[k * 2], sigma[k * 2], sigma[k * 2]}; attributes.sigma = sigmaF4; const cl_double4 timeStepF4 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF4; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); // Wait for input of kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&inUnmapEvt1); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inUnmapEvt1) Failed"); inMapPtr1 = NULL; // Enqueue kernel 1 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data of input for kernel 2 // Fill data of input buffer for kernel 2 if(k <= steps - 1) { // Map input buffer for kernel 1 inMapPtr2 = clEnqueueMapBuffer( commandQueue, randBufAsync, CL_FALSE, CL_MAP_WRITE, 0, inSize, 0, NULL, &inMapEvt2, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data for input for kernel 1 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); // Wait for map of input of kernel 1 status = sampleCommon->waitForEventAndRelease(&inMapEvt2); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt2) Failed"); memcpy(inMapPtr2, (void*)randNum, inSize); // Unmap of input buffer of kernel 1 status = clEnqueueUnmapMemObject( commandQueue, randBufAsync, inMapPtr2, 0, NULL, &inUnmapEvt2); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(randBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Wait for output buffers of kernel 2 to complete // Calcualte the results from output of kernel 2 if(k != 0) { // Wait for output buffers of kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&outMapEvt21); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapevt21) Failed"); status = sampleCommon->waitForEventAndRelease(&outMapEvt22); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapevt22) Failed"); // Calcualte the results from output of kernel 2 ptr21 = (cl_double*)outMapPtr21; ptr22 = (cl_double*)outMapPtr22; for(int i = 0; i < noOfTraj * noOfTraj; i++) { price[k * 2 - 1] += ptr21[i]; vega[k * 2 - 1] += ptr22[i]; } price[k * 2 - 1] /= (noOfTraj * noOfTraj); vega[k * 2 - 1] /= (noOfTraj * noOfTraj); price[k * 2 - 1] = exp(-interest * maturity) * price[k * 2 - 1]; vega[k * 2 - 1] = exp(-interest * maturity) * vega[k * 2 - 1]; // Unmap of output buffers of kernel 2 status = clEnqueueUnmapMemObject( commandQueue, priceBufAsync, outMapPtr21, 0, NULL, &outUnmapEvt21); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceBufAsync) failed."); status = clEnqueueUnmapMemObject( commandQueue, priceDerivBufAsync, outMapPtr22, 0, NULL, &outUnmapEvt22); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&outUnmapEvt21); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapevt21) Failed"); status = sampleCommon->waitForEventAndRelease(&outUnmapEvt22); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapevt22) Failed"); } // Wait for kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); // Get data from output buffers of kernel 1 outMapPtr11 = clEnqueueMapBuffer( commandQueue, priceBuf, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt11, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBuf) failed."); outMapPtr12 = clEnqueueMapBuffer( commandQueue, priceDerivBuf, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt12, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Set up arguments required for kernel 2 double c21 = (interest - 0.5 * sigma[k * 2 + 1] * sigma[k * 2 + 1]) * timeStep; double c22 = sigma[k * 2 + 1] * sqrt(timeStep); double c23 = (interest + 0.5 * sigma[k * 2 + 1] * sigma[k * 2 + 1]); const cl_double4 c1F42 = {c21, c21, c21, c21}; attributes.c1 = c1F42; const cl_double4 c2F42 = {c22, c22, c22, c22}; attributes.c2 = c2F42; const cl_double4 c3F42 = {c23, c23, c23, c23}; attributes.c3 = c3F42; const cl_double4 initPriceF42 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF42; const cl_double4 strikePriceF42 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF42; const cl_double4 sigmaF42 = {sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1]}; attributes.sigma = sigmaF42; const cl_double4 timeStepF42 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF42; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); // Wait for input of kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&inUnmapEvt2); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inUnmapEvt2) Failed"); inMapPtr2 = NULL; // Enqueue kernel 2 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for output buffers of kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&outMapEvt11); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapevt11) Failed"); status = sampleCommon->waitForEventAndRelease(&outMapEvt12); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapevt12 ) Failed"); // Calcualte the results from output of kernel 2 ptr21 = (cl_double*)outMapPtr11; ptr22 = (cl_double*)outMapPtr12; for(int i = 0; i < noOfTraj * noOfTraj; i++) { price[k * 2] += ptr21[i]; vega[k * 2] += ptr22[i]; } price[k * 2] /= (noOfTraj * noOfTraj); vega[k * 2] /= (noOfTraj * noOfTraj); price[k * 2] = exp(-interest * maturity) * price[k * 2]; vega[k * 2] = exp(-interest * maturity) * vega[k * 2]; // Unmap of output buffers of kernel 2 status = clEnqueueUnmapMemObject( commandQueue, priceBuf, outMapPtr11, 0, NULL, &outUnmapEvt11); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceBuf) failed."); status = clEnqueueUnmapMemObject( commandQueue, priceDerivBuf, outMapPtr12, 0, NULL, &outUnmapEvt12); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&outUnmapEvt11); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapEvt11) Failed"); status = sampleCommon->waitForEventAndRelease(&outUnmapEvt12); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outUnmapEvt12) Failed"); } // Wait for kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); // Gather last kernel 2 execution here outMapPtr21 = clEnqueueMapBuffer( commandQueue, priceBufAsync, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt21, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBuf) failed."); outMapPtr22 = clEnqueueMapBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, CL_MAP_READ, 0, outSize * 2, 0, NULL, &outMapEvt22, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&outMapEvt21); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapEvt21) Failed"); status = sampleCommon->waitForEventAndRelease(&outMapEvt22); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(outMapEvt22) Failed"); // Calcualte the results from output of kernel 2 ptr21 = (cl_double*)outMapPtr21; ptr22 = (cl_double*)outMapPtr22; for(int i = 0; i < noOfTraj * noOfTraj; i++) { price[steps - 1] += ptr21[i]; vega[steps - 1] += ptr22[i]; } price[steps - 1] /= (noOfTraj * noOfTraj); vega[steps - 1] /= (noOfTraj * noOfTraj); price[steps - 1] = exp(-interest * maturity) * price[steps - 1]; vega[steps - 1] = exp(-interest * maturity) * vega[steps - 1]; return SDK_SUCCESS; } int MonteCarloAsianDP::initialize() { // Call base class Initialize to get default configuration if(this->SDKSample::initialize()) return SDK_FAILURE; const int optionsCount = 5; streamsdk::Option *optionList = new streamsdk::Option[optionsCount]; CHECK_ALLOCATION(optionList,"Failed to allocate host memory. (optionList)"); optionList[0]._sVersion = "c"; optionList[0]._lVersion = "steps"; optionList[0]._description = "Steps of Monte carlo simuation"; optionList[0]._type = streamsdk::CA_ARG_INT; optionList[0]._value = &steps; optionList[1]._sVersion = "P"; optionList[1]._lVersion = "initPrice"; optionList[1]._description = "Initial price(Default value 50)"; optionList[1]._type = streamsdk::CA_ARG_DOUBLE; optionList[1]._value = &initPrice; optionList[2]._sVersion = "s"; optionList[2]._lVersion = "strikePrice"; optionList[2]._description = "Strike price (Default value 55)"; optionList[2]._type = streamsdk::CA_ARG_DOUBLE; optionList[2]._value = &strikePrice; optionList[3]._sVersion = "r"; optionList[3]._lVersion = "interest"; optionList[3]._description = "interest rate (Default value 0.06)"; optionList[3]._type = streamsdk::CA_ARG_DOUBLE; optionList[3]._value = &interest; optionList[4]._sVersion = "m"; optionList[4]._lVersion = "maturity"; optionList[4]._description = "Maturity (Default value 1)"; optionList[4]._type = streamsdk::CA_ARG_DOUBLE; optionList[4]._value = &maturity; for(cl_int i = 0; i < optionsCount; ++i) { sampleArgs->AddOption(&optionList[i]); } delete[] optionList; streamsdk::Option* iteration_option = new streamsdk::Option; CHECK_ALLOCATION(iteration_option,"Failed to allocate host memory. (iteration_option)"); iteration_option->_sVersion = "i"; iteration_option->_lVersion = "iterations"; iteration_option->_description = "Number of iterations to execute kernel"; iteration_option->_type = streamsdk::CA_ARG_INT; iteration_option->_value = &iterations; sampleArgs->AddOption(iteration_option); delete iteration_option; return SDK_SUCCESS; } int MonteCarloAsianDP::setup() { int status=setupMonteCarloAsianDP(); CHECK_ERROR(status, SDK_SUCCESS, "MonteCarloAsianDP::setup) failed"); // create and initialize timers int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); status = setupCL(); if(status != SDK_SUCCESS) return status; sampleCommon->stopTimer(timer); // Compute setup time setupTime = (double)(sampleCommon->readTimer(timer)); return SDK_SUCCESS; } int MonteCarloAsianDP::run() { int status = 0; // Warmup for(int i = 0; i < 2; i++) { // Arguments are set and execution call is enqueued on command buffer if(runCLKernels() != SDK_SUCCESS) return SDK_FAILURE; } std::cout<<"Executing kernel for " << iterations << " iterations" << std::endl; std::cout<<"-------------------------------------------" << std::endl; // create and initialize timers int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); for(int i = 0; i < iterations; i++) { // Arguments are set and execution call is enqueued on command buffer if(runCLKernels()!=SDK_SUCCESS) return SDK_FAILURE; } sampleCommon->stopTimer(timer); // Compute average kernel time kernelTime = (double)(sampleCommon->readTimer(timer)) / iterations; if(!quiet) { sampleCommon->printArray("price", price, steps, 1); sampleCommon->printArray("vega", vega, steps, 1); } return SDK_SUCCESS; } void MonteCarloAsianDP::printStats() { std::string strArray[4] = { "Steps", "Time(sec)", "[Transfer+kernel](sec)", "Samples used /sec" }; std::string stats[4]; totalTime = setupTime + kernelTime; stats[0] = sampleCommon->toString(steps, std::dec); stats[1] = sampleCommon->toString(totalTime, std::dec); stats[2] = sampleCommon->toString(kernelTime, std::dec); stats[3] = sampleCommon->toString((noOfTraj * (noOfSum - 1) * steps) / kernelTime, std::dec); this->SDKSample::printStats(strArray, stats, 4); } void MonteCarloAsianDP::lshift128(unsigned int* input, unsigned int shift, unsigned int * output) { unsigned int invshift = 32u - shift; output[0] = input[0] << shift; output[1] = (input[1] << shift) | (input[0] >> invshift); output[2] = (input[2] << shift) | (input[1] >> invshift); output[3] = (input[3] << shift) | (input[2] >> invshift); } void MonteCarloAsianDP::rshift128(unsigned int* input, unsigned int shift, unsigned int* output) { unsigned int invshift = 32u - shift; output[3]= input[3] >> shift; output[2] = (input[2] >> shift) | (input[0] >> invshift); output[1] = (input[1] >> shift) | (input[1] >> invshift); output[0] = (input[0] >> shift) | (input[2] >> invshift); } void MonteCarloAsianDP::generateRand(unsigned int* seed, double *gaussianRand1, double *gaussianRand2, unsigned int* nextRand) { unsigned int mulFactor = 4; unsigned int temp[8][4]; unsigned int state1[4] = {seed[0], seed[1], seed[2], seed[3]}; unsigned int state2[4] = {0u, 0u, 0u, 0u}; unsigned int state3[4] = {0u, 0u, 0u, 0u}; unsigned int state4[4] = {0u, 0u, 0u, 0u}; unsigned int state5[4] = {0u, 0u, 0u, 0u}; unsigned int stateMask = 1812433253u; unsigned int thirty = 30u; unsigned int mask4[4] = {stateMask, stateMask, stateMask, stateMask}; unsigned int thirty4[4] = {thirty, thirty, thirty, thirty}; unsigned int one4[4] = {1u, 1u, 1u, 1u}; unsigned int two4[4] = {2u, 2u, 2u, 2u}; unsigned int three4[4] = {3u, 3u, 3u, 3u}; unsigned int four4[4] = {4u, 4u, 4u, 4u}; unsigned int r1[4] = {0u, 0u, 0u, 0u}; unsigned int r2[4] = {0u, 0u, 0u, 0u}; unsigned int a[4] = {0u, 0u, 0u, 0u}; unsigned int b[4] = {0u, 0u, 0u, 0u}; unsigned int e[4] = {0u, 0u, 0u, 0u}; unsigned int f[4] = {0u, 0u, 0u, 0u}; unsigned int thirteen = 13u; unsigned int fifteen = 15u; unsigned int shift = 8u * 3u; unsigned int mask11 = 0xfdff37ffu; unsigned int mask12 = 0xef7f3f7du; unsigned int mask13 = 0xff777b7du; unsigned int mask14 = 0x7ff7fb2fu; const double one = 1.0; const double intMax = 4294967296.0; const double PI = 3.14159265358979; const double two = 2.0; double r[4] = {0.0, 0.0, 0.0, 0.0}; double phi[4] = {0.0, 0.0, 0.0, 0.0}; double temp1[4] = {0.0, 0.0, 0.0, 0.0}; double temp2[4] = {0.0, 0.0, 0.0, 0.0}; //Initializing states. for(int c = 0; c < 4; ++c) { state2[c] = mask4[c] * (state1[c] ^ (state1[c] >> thirty4[c])) + one4[c]; state3[c] = mask4[c] * (state2[c] ^ (state2[c] >> thirty4[c])) + two4[c]; state4[c] = mask4[c] * (state3[c] ^ (state3[c] >> thirty4[c])) + three4[c]; state5[c] = mask4[c] * (state4[c] ^ (state4[c] >> thirty4[c])) + four4[c]; } unsigned int i = 0; for(i = 0; i < mulFactor; ++i) { switch(i) { case 0: for(int c = 0; c < 4; ++c) { r1[c] = state4[c]; r2[c] = state5[c]; a[c] = state1[c]; b[c] = state3[c]; } break; case 1: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[0][c]; a[c] = state2[c]; b[c] = state4[c]; } break; case 2: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[1][c]; a[c] = state3[c]; b[c] = state5[c]; } break; case 3: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[2][c]; a[c] = state4[c]; b[c] = state1[c]; } break; default: break; } lshift128(a, shift, e); rshift128(r1, shift, f); temp[i][0] = a[0] ^ e[0] ^ ((b[0] >> thirteen) & mask11) ^ f[0] ^ (r2[0] << fifteen); temp[i][1] = a[1] ^ e[1] ^ ((b[1] >> thirteen) & mask12) ^ f[1] ^ (r2[1] << fifteen); temp[i][2] = a[2] ^ e[2] ^ ((b[2] >> thirteen) & mask13) ^ f[2] ^ (r2[2] << fifteen); temp[i][3] = a[3] ^ e[3] ^ ((b[3] >> thirteen) & mask14) ^ f[3] ^ (r2[3] << fifteen); } for(int c = 0; c < 4; ++c) { temp1[c] = temp[0][c] * one / intMax; temp2[c] = temp[1][c] * one / intMax; } for(int c = 0; c < 4; ++c) { // Applying Box Mullar Transformations. r[c] = sqrt((-two) * log(temp1[c])); phi[c] = two * PI * temp2[c]; gaussianRand1[c] = r[c] * cos(phi[c]); gaussianRand2[c] = r[c] * sin(phi[c]); nextRand[c] = temp[2][c]; } } void MonteCarloAsianDP::calOutputs(double strikePrice, double* meanDeriv1, double* meanDeriv2, double* meanPrice1, double* meanPrice2, double* pathDeriv1, double* pathDeriv2, double* priceVec1, double* priceVec2) { double temp1[4] = {0.0, 0.0, 0.0, 0.0}; double temp2[4] = {0.0, 0.0, 0.0, 0.0}; double temp3[4] = {0.0, 0.0, 0.0, 0.0}; double temp4[4] = {0.0, 0.0, 0.0, 0.0}; double tempDiff1[4] = {0.0, 0.0, 0.0, 0.0}; double tempDiff2[4] = {0.0, 0.0, 0.0, 0.0}; for(int c = 0; c < 4; ++c) { tempDiff1[c] = meanPrice1[c] - strikePrice; tempDiff2[c] = meanPrice2[c] - strikePrice; } if(tempDiff1[0] > 0.0) { temp1[0] = 1.0; temp3[0] = tempDiff1[0]; } if(tempDiff1[1] > 0.0) { temp1[1] = 1.0; temp3[1] = tempDiff1[1]; } if(tempDiff1[2] > 0.0) { temp1[2] = 1.0; temp3[2] = tempDiff1[2]; } if(tempDiff1[3] > 0.0) { temp1[3] = 1.0; temp3[3] = tempDiff1[3]; } if(tempDiff2[0] > 0.0) { temp2[0] = 1.0; temp4[0] = tempDiff2[0]; } if(tempDiff2[1] > 0.0) { temp2[1] = 1.0; temp4[1] = tempDiff2[1]; } if(tempDiff2[2] > 0.0) { temp2[2] = 1.0; temp4[2] = tempDiff2[2]; } if(tempDiff2[3] > 0.0) { temp2[3] = 1.0; temp4[3] = tempDiff2[3]; } for(int c = 0; c < 4; ++c) { pathDeriv1[c] = meanDeriv1[c] * temp1[c]; pathDeriv2[c] = meanDeriv2[c] * temp2[c]; priceVec1[c] = temp3[c]; priceVec2[c] = temp4[c]; } } void MonteCarloAsianDP::cpuReferenceImpl() { double timeStep = maturity / (noOfSum - 1); // Initialize random number generator srand(1); for(int k = 0; k < steps; k++) { double c1 = (interest - 0.5 * sigma[k] * sigma[k]) * timeStep; double c2 = sigma[k] * sqrt(timeStep); double c3 = (interest + 0.5 * sigma[k] * sigma[k]); for(int j = 0; j < (width * height); j++) { unsigned int nextRand[4] = {0u, 0u, 0u, 0u}; for(int c = 0; c < 4; ++c) nextRand[c] = (cl_uint)rand(); double trajPrice1[4] = {initPrice, initPrice, initPrice, initPrice}; double sumPrice1[4] = {initPrice, initPrice, initPrice, initPrice}; double sumDeriv1[4] = {0.0, 0.0, 0.0, 0.0}; double meanPrice1[4] = {0.0, 0.0, 0.0, 0.0}; double meanDeriv1[4] = {0.0, 0.0, 0.0, 0.0}; double price1[4] = {0.0, 0.0, 0.0, 0.0}; double pathDeriv1[4] = {0.0, 0.0, 0.0, 0.0}; double trajPrice2[4] = {initPrice, initPrice, initPrice, initPrice}; double sumPrice2[4] = {initPrice, initPrice, initPrice, initPrice}; double sumDeriv2[4] = {0.0, 0.0, 0.0, 0.0}; double meanPrice2[4] = {0.0, 0.0, 0.0, 0.0}; double meanDeriv2[4] = {0.0, 0.0, 0.0, 0.0}; double price2[4] = {0.0, 0.0, 0.0, 0.0}; double pathDeriv2[4] = {0.0, 0.0, 0.0, 0.0}; //Run the Monte Carlo simulation a total of Num_Sum - 1 times for(int i = 1; i < noOfSum; i++) { unsigned int tempRand[4] = {0u, 0u, 0u, 0u}; for(int c = 0; c < 4; ++c) tempRand[c] = nextRand[c]; double gaussian1[4] = {0.0, 0.0, 0.0, 0.0}; double gaussian2[4] = {0.0, 0.0, 0.0, 0.0}; generateRand(tempRand, gaussian1, gaussian2, nextRand); //Calculate the trajectory price and sum price for all trajectories for(int c = 0; c < 4; ++c) { trajPrice1[c] = trajPrice1[c] * exp(c1 + c2 * gaussian1[c]); trajPrice2[c] = trajPrice2[c] * exp(c1 + c2 * gaussian2[c]); sumPrice1[c] = sumPrice1[c] + trajPrice1[c]; sumPrice2[c] = sumPrice2[c] + trajPrice2[c]; double temp = c3 * timeStep * i; // Calculate the derivative price for all trajectories sumDeriv1[c] = sumDeriv1[c] + trajPrice1[c] * ((log(trajPrice1[c] / initPrice) - temp) / sigma[k]); sumDeriv2[c] = sumDeriv2[c] + trajPrice2[c] * ((log(trajPrice2[c] / initPrice) - temp) / sigma[k]); } } //Calculate the average price and “average derivative” of each simulated path for(int c = 0; c < 4; ++c) { meanPrice1[c] = sumPrice1[c] / noOfSum; meanPrice2[c] = sumPrice2[c] / noOfSum; meanDeriv1[c] = sumDeriv1[c] / noOfSum; meanDeriv2[c] = sumDeriv2[c] / noOfSum; } calOutputs(strikePrice, meanDeriv1, meanDeriv2, meanPrice1, meanPrice2, pathDeriv1, pathDeriv2, price1, price2); for(int c = 0; c < 4; ++c) { priceVals[j * 8 + c] = price1[c]; priceVals[j * 8 + 1 * 4 + c] = price2[c]; priceDeriv[j * 8 + c] = pathDeriv1[c]; priceDeriv[j * 8 + 1 * 4 + c] = pathDeriv2[c]; } } // Replace Following "for" loop with reduction kernel for(int i = 0; i < noOfTraj * noOfTraj; i++) { refPrice[k] += priceVals[i]; refVega[k] += priceDeriv[i]; } refPrice[k] /= (noOfTraj * noOfTraj); refVega[k] /= (noOfTraj * noOfTraj); refPrice[k] = exp(-interest * maturity) * refPrice[k]; refVega[k] = exp(-interest * maturity) * refVega[k]; } } int MonteCarloAsianDP::verifyResults() { if(verify) { /* reference implementation * it overwrites the input array with the output */ cpuReferenceImpl(); // compare the results and see if they match for(int i = 0; i < steps; ++i) { if(fabs(price[i] - refPrice[i]) > 0.2) { std::cout << "Failed\n"; return SDK_FAILURE; } if(fabs(vega[i] - refVega[i]) > 0.2) { std::cout << "Failed\n" << std::endl; return SDK_FAILURE; } } std::cout << "Passed!\n" << std::endl; } return SDK_SUCCESS; } int MonteCarloAsianDP::cleanup() { // Releases OpenCL resources (Context, Memory etc.) cl_int status; status = clReleaseMemObject(priceBuf); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(priceBuf)"); status = clReleaseMemObject(priceDerivBuf); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(priceDerivBuf)"); status = clReleaseMemObject(randBuf); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(randBuf)"); status = clReleaseMemObject(priceBufAsync); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(priceBufAsync)"); status = clReleaseMemObject(priceDerivBufAsync); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(priceDerivBufAsync)"); status = clReleaseMemObject(randBufAsync); CHECK_OPENCL_ERROR(status,"clReleaseMemObject failed.(randBufAsync)"); status = clReleaseKernel(kernel); CHECK_OPENCL_ERROR(status,"clReleaseKernel failed.(kernel)"); status = clReleaseProgram(program); CHECK_OPENCL_ERROR(status,"clReleaseProgram failed.(program)"); status = clReleaseCommandQueue(commandQueue); CHECK_OPENCL_ERROR(status,"clReleaseCommandQueue failed.(commandQueue)"); status = clReleaseContext(context); CHECK_OPENCL_ERROR(status,"clReleaseContext failed.(context)"); // Release program resources (input memory etc.) FREE(sigma); FREE(price); FREE(vega); FREE(refPrice); FREE(refVega); if(randNum) { #if defined (_WIN32) ALIGNED_FREE(randNum); #else FREE(randNum); #endif randNum = NULL; } FREE(priceVals); FREE(priceDeriv); FREE(devices); return SDK_SUCCESS; } int main(int argc, char * argv[]) { // Create MonteCalroAsian object MonteCarloAsianDP clMonteCarloAsianDP( "OpenCL Monte Carlo simulation for Asian Option Pricing"); // Initialization if(clMonteCarloAsianDP.initialize() != SDK_SUCCESS) return SDK_FAILURE; // Parse command line options if(clMonteCarloAsianDP.parseCommandLine(argc, argv)) return SDK_FAILURE; if(clMonteCarloAsianDP.isDumpBinaryEnabled()) { return clMonteCarloAsianDP.genBinaryImage(); } else { // Setup int returnVal = clMonteCarloAsianDP.setup(); if(returnVal != SDK_SUCCESS) return (returnVal == SDK_EXPECTED_FAILURE) ? SDK_SUCCESS : SDK_FAILURE; // Run if(clMonteCarloAsianDP.run() != SDK_SUCCESS) return SDK_FAILURE; if(clMonteCarloAsianDP.verifyResults()!=SDK_SUCCESS) return SDK_FAILURE; // Cleanup resources created if(clMonteCarloAsianDP.cleanup() != SDK_SUCCESS) return SDK_FAILURE; clMonteCarloAsianDP.printStats(); } return SDK_SUCCESS; }