/********************************************************************** Copyright ©2012 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: • Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. • Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ********************************************************************/ #include "MonteCarloAsian.hpp" #include #include /* * structure for attributes of Monte Carlo * simulation */ typedef struct _MonteCalroAttrib { cl_float4 strikePrice; cl_float4 c1; cl_float4 c2; cl_float4 c3; cl_float4 initPrice; cl_float4 sigma; cl_float4 timeStep; }MonteCarloAttrib; int MonteCarloAsian::setupMonteCarloAsian() { if(!disableAsync) { steps = (steps < 4) ? 4 : steps; steps = (steps / 2) * 2; } // Validate flags if(dUseOutAllocHostPtr) if(!(disableAsync && disableMapping)) { std::cout << "Note : Neglected --dOutAllocHostPtr flag" << std::endl; dUseOutAllocHostPtr = false; } if(dUseInPersistent) if(!(disableAsync && disableMapping)) { std::cout << "Note : Neglected --dInPersistent flag" << std::endl; dUseInPersistent = false; } int i = 0; const cl_float finalValue = 0.8f; const cl_float stepValue = finalValue / (cl_float)steps; // Allocate and init memory used by host sigma = (cl_float*)malloc(steps * sizeof(cl_float)); CHECK_ALLOCATION(sigma, "Failed to allocate host memory. (sigma)"); sigma[0] = 0.01f; for(i = 1; i < steps; i++) sigma[i] = sigma[i - 1] + stepValue; price = (cl_float*) malloc(steps * sizeof(cl_float)); CHECK_ALLOCATION(price, "Failed to allocate host memory. (price)"); memset((void*)price, 0, steps * sizeof(cl_float)); vega = (cl_float*) malloc(steps * sizeof(cl_float)); CHECK_ALLOCATION(price, "Failed to allocate host memory. (vega)"); memset((void*)vega, 0, steps * sizeof(cl_float)); refPrice = (cl_float*) malloc(steps * sizeof(cl_float)); CHECK_ALLOCATION(refPrice, "Failed to allocate host memory. (refPrice)"); memset((void*)refPrice, 0, steps * sizeof(cl_float)); refVega = (cl_float*) malloc(steps * sizeof(cl_float)); CHECK_ALLOCATION(refVega, "Failed to allocate host memory. (refVega)"); memset((void*)refVega, 0, steps * sizeof(cl_float)); // Set samples and exercise points noOfSum = 12; noOfTraj = 1024; width = noOfTraj / 4; height = noOfTraj / 2; #if defined (_WIN32) randNum = (cl_uint*)_aligned_malloc(width * height * sizeof(cl_uint4), 16); #else randNum = (cl_uint*)memalign(16, width * height * sizeof(cl_uint4)); #endif CHECK_ALLOCATION(randNum, "Failed to allocate host memory. (randNum)"); priceVals = (cl_float*)malloc(width * height * 2 * sizeof(cl_float4)); CHECK_ALLOCATION(priceVals, "Failed to allocate host memory. (priceVals)"); memset((void*)priceVals, 0, width * height * 2 * sizeof(cl_float4)); priceDeriv = (cl_float*)malloc(width * height * 2 * sizeof(cl_float4)); CHECK_ALLOCATION(priceDeriv, "Failed to allocate host memory. (priceDeriv)"); memset((void*)priceDeriv, 0, width * height * 2 * sizeof(cl_float4)); if(!disableAsync && disableMapping) { priceValsAsync = (cl_float*)malloc(width * height * 2 * sizeof(cl_float4)); CHECK_ALLOCATION(priceValsAsync, "Failed to allocate host memory. (priceValsAsync)"); memset((void*)priceValsAsync, 0, width * height * 2 * sizeof(cl_float4)); priceDerivAsync = (cl_float*)malloc(width * height * 2 * sizeof(cl_float4)); CHECK_ALLOCATION(priceDerivAsync, "Failed to allocate host memory. (priceDerivAsync)"); memset((void*)priceDerivAsync, 0, width * height * 2 * sizeof(cl_float4)); } /* * Unless quiet mode has been enabled, print the INPUT array. * No more than 256 values are printed because it clutters the screen * and it is not practical to manually compare a large set of numbers */ if(!quiet) sampleCommon->printArray( "sigma values", sigma, steps, 1); return SDK_SUCCESS; } int MonteCarloAsian::genBinaryImage() { streamsdk::bifData binaryData; binaryData.kernelName = std::string("MonteCarloAsian_Kernels.cl"); binaryData.flagsStr = std::string(""); if(isComplierFlagsSpecified()) binaryData.flagsFileName = std::string(flags.c_str()); binaryData.binaryName = std::string(dumpBinary.c_str()); int status = sampleCommon->generateBinaryImage(binaryData); return status; } int MonteCarloAsian::setupCL(void) { cl_int status = 0; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } // Get platform cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType() failed."); status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue(commandQueue) failed."); if(!dUseInPersistent) { // Set Persistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; if(isAmdPlatform()) inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create buffer for randBuf randBuf = clCreateBuffer(context, inMemFlags, sizeof(cl_uint4) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(randBuf) failed."); if(!disableAsync) { // create Buffer for randBuf in asynchronous mode randBufAsync = clCreateBuffer(context, inMemFlags, sizeof(cl_uint4) * width * height, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(randBufAsync) failed."); } } else { // create Normal Buffer, if persistent memory is not in used randBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint4) * width * height, randNum, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(randBuf) failed."); } if(!dUseOutAllocHostPtr) { // create Buffer for PriceBuf priceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceBuf) failed."); // create Buffer for PriceDeriveBuffer priceDerivBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceDerivBuf) failed."); if(!disableAsync) { // create Buffer for priceBufAsync priceBufAsync = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceBufAsync) failed."); // create Buffer for priceDerivBufAsync priceDerivBufAsync = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceDerivBufAsync) failed."); } } else { // create Buffer for priceBuf priceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceBuf) failed."); // create Buffer for priceDerivBuf priceDerivBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * width * height * 2, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer(priceDerivBuf) failed."); } // Create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("MonteCarloAsian_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "calPriceVega", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel(calPriceVega) failed."); // Check group-size against what is returned by kernel status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); if((blockSizeX * blockSizeY) > kernelInfo.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << blockSizeX * blockSizeY << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout << "Falling back to " << kernelInfo.kernelWorkGroupSize << std::endl; } // Three possible cases if(blockSizeX > kernelInfo.kernelWorkGroupSize) { blockSizeX = kernelInfo.kernelWorkGroupSize; blockSizeY = 1; } } return SDK_SUCCESS; } int MonteCarloAsian::runCLKernels(void) { cl_int status; cl_int eventStatus = CL_QUEUED; size_t globalThreads[2] = {width, height}; size_t localThreads[2] = {blockSizeX, blockSizeY}; size_t sizeAfterReduction =width * height / blockSizeX /blockSizeY * 4; // Declare attribute structure MonteCarloAttrib attributes; if(localThreads[0] > deviceInfo.maxWorkItemSizes[0] || localThreads[1] > deviceInfo.maxWorkItemSizes[1] || (localThreads[0] * localThreads[1]) > deviceInfo.maxWorkGroupSize) { std::cout << "Unsupported: Device does not support requested" << ":number of work items."; return SDK_FAILURE; } // width - i.e number of elements in the array status = clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*)&width); CHECK_OPENCL_ERROR(status, "clSetKernelArg(width) failed."); status = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&noOfSum); CHECK_OPENCL_ERROR(status, "clSetKernelArg(noOfSum) failed."); if(disableAsync) { status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); status = clSetKernelArg(kernel, 6, blockSizeX*blockSizeY*sizeof(cl_float8), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(sData1) failed."); } float timeStep = maturity / (noOfSum - 1); // Initialize random number generator srand(1); if(!disableAsync) { if(disableMapping) { cl_event inWriteEvt1; cl_event inWriteEvt2; cl_event outReadEvt11; cl_event outReadEvt12; cl_event outReadEvt21; cl_event outReadEvt22; cl_event ndrEvt; size_t size = width * height * sizeof(cl_float4); for(int k = 0; k < steps / 2; k++) { // Generate data for input for kernel 1 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); // Fill data of input buffer for kernel 1 status = clEnqueueWriteBuffer( commandQueue, randBuf, CL_FALSE, 0, size, randNum, 0, NULL, &inWriteEvt1); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Get data from output buffers of kernel 2 if(k != 0) { // Wait for kernel 2 to complete status = clWaitForEvents(1, &ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEvents(ndrEvt) failed."); status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); status = clEnqueueReadBuffer( commandQueue, priceBufAsync, CL_FALSE, 0, sizeAfterReduction, priceValsAsync, 0, NULL, &outReadEvt21); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceBufAsync) failed."); status = clEnqueueReadBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, 0, sizeAfterReduction, priceDerivAsync, 0, NULL, &outReadEvt22); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Set up arguments required for kernel 1 float c1 = (interest - 0.5f * sigma[k * 2] * sigma[k * 2]) * timeStep; float c2 = sigma[k * 2] * sqrt(timeStep); float c3 = (interest + 0.5f * sigma[k * 2] * sigma[k * 2]); const cl_float4 c1F4 = {c1, c1, c1, c1}; attributes.c1 = c1F4; const cl_float4 c2F4 = {c2, c2, c2, c2}; attributes.c2 = c2F4; const cl_float4 c3F4 = {c3, c3, c3, c3}; attributes.c3 = c3F4; const cl_float4 initPriceF4 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF4; const cl_float4 strikePriceF4 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF4; const cl_float4 sigmaF4 = {sigma[k * 2], sigma[k * 2], sigma[k * 2], sigma[k * 2]}; attributes.sigma = sigmaF4; const cl_float4 timeStepF4 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF4; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); status = clSetKernelArg(kernel, 6, blockSizeX*blockSizeY*sizeof(cl_float8), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(sData1) failed."); // Wait for input of kernel to complete status = sampleCommon->waitForEventAndRelease(&inWriteEvt1); CHECK_OPENCL_ERROR(status, "clWaitForEventsAndRelease(inWriteEvt1) failed."); // Enqueue kernel 1 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); /* Generate data of input for kernel 2 Fill data of input buffer for kernel 2*/ if(k <= steps - 1) { // Generate data of input for kernel 2 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); // Fill data of input buffer for kernel 2 status = clEnqueueWriteBuffer( commandQueue, randBufAsync, CL_FALSE, 0, size, randNum, 0, NULL, &inWriteEvt2); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(randBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Wait for output buffers of kernel 2 to complete // Calculate the results from output of kernel 2 if(k != 0) { // Wait for output buffers of kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&outReadEvt21); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt21) failed."); status = sampleCommon->waitForEventAndRelease(&outReadEvt22); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt22) failed."); // Calculate the results from output of kernel 2 for(size_t i = 0; i < sizeAfterReduction; i++) { price[k * 2 - 1] += priceValsAsync[i]; vega[k * 2 - 1] += priceDerivAsync[i]; } price[k * 2 - 1] /= (noOfTraj * noOfTraj); vega[k * 2 - 1] /= (noOfTraj * noOfTraj); price[k * 2 - 1] = exp(-interest * maturity) * price[k * 2 - 1]; vega[k * 2 - 1] = exp(-interest * maturity) * vega[k * 2 - 1]; } // Wait for kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(ndrEvt) failed."); // Get data from output buffers of kernel 1 status = clEnqueueReadBuffer(commandQueue, priceBuf, CL_TRUE, 0, sizeAfterReduction, priceVals, 0, NULL, &outReadEvt11); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceBuf) failed."); status = clEnqueueReadBuffer(commandQueue, priceDerivBuf, CL_TRUE, 0, sizeAfterReduction, priceDeriv, 0, NULL, &outReadEvt12); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Set up arguments required for kernel 2 float c21 = (interest - 0.5f * sigma[k * 2 + 1] * sigma[k * 2 + 1]) * timeStep; float c22 = sigma[k * 2 + 1] * sqrt(timeStep); float c23 = (interest + 0.5f * sigma[k * 2 + 1] * sigma[k * 2 + 1]); const cl_float4 c1F42 = {c21, c21, c21, c21}; attributes.c1 = c1F42; const cl_float4 c2F42 = {c22, c22, c22, c22}; attributes.c2 = c2F42; const cl_float4 c3F42 = {c23, c23, c23, c23}; attributes.c3 = c3F42; const cl_float4 initPriceF42 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF42; const cl_float4 strikePriceF42 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF42; const cl_float4 sigmaF42 = {sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1]}; attributes.sigma = sigmaF42; const cl_float4 timeStepF42 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF42; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); status = clSetKernelArg(kernel, 6, blockSizeX * blockSizeY*sizeof(cl_float8), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(sData1) failed."); // Wait for input of kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&inWriteEvt2); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(inWriteEvt2) failed."); // Enqueue kernel 2 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for output buffers of kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&outReadEvt11); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt11) failed."); status = sampleCommon->waitForEventAndRelease(&outReadEvt12); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt12) failed."); for(size_t i = 0; i < sizeAfterReduction; i++) { price[k * 2] += priceVals[i]; vega[k * 2] += priceDeriv[i]; } price[k * 2] /= (noOfTraj * noOfTraj); vega[k * 2] /= (noOfTraj * noOfTraj); price[k * 2] = exp(-interest * maturity) * price[k * 2]; vega[k * 2] = exp(-interest * maturity) * vega[k * 2]; } // Wait for kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(ndrEvt) failed."); // Gather last kernel 2 execution here status = clEnqueueReadBuffer( commandQueue, priceBufAsync, CL_FALSE, 0, sizeAfterReduction, priceValsAsync, 0, NULL, &outReadEvt21); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceBufAsync) failed."); status = clEnqueueReadBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, 0, sizeAfterReduction, priceDerivAsync, 0, NULL, &outReadEvt22); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for output buffers of kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&outReadEvt21); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt21) failed."); status = sampleCommon->waitForEventAndRelease(&outReadEvt22); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(outReadEvt22) failed."); // Calculate the results from output of kernel 2 for(size_t i = 0; i < sizeAfterReduction; i++) { price[steps - 1] += priceValsAsync[i]; vega[steps - 1] += priceDerivAsync[i]; } price[steps - 1] /= (noOfTraj * noOfTraj); vega[steps - 1] /= (noOfTraj * noOfTraj); price[steps - 1] = exp(-interest * maturity) * price[steps - 1]; vega[steps - 1] = exp(-interest * maturity) * vega[steps - 1]; } else { void* inMapPtr1 = NULL; void* inMapPtr2 = NULL; void* outMapPtr11 = NULL; void* outMapPtr12 = NULL; void* outMapPtr21 = NULL; void* outMapPtr22 = NULL; cl_float* ptr21 = NULL; cl_float* ptr22 = NULL; cl_event inMapEvt1; cl_event inMapEvt2; cl_event inUnmapEvt1; cl_event inUnmapEvt2; cl_event outMapEvt11; cl_event outMapEvt12; cl_event outUnmapEvt11; cl_event outUnmapEvt12; cl_event outMapEvt21; cl_event outMapEvt22; cl_event outUnmapEvt21; cl_event outUnmapEvt22; cl_event ndrEvt; size_t size = width * height * sizeof(cl_float4); for(int k = 0; k < steps / 2; k++) { // Map input buffer for kernel 1 inMapPtr1 = clEnqueueMapBuffer( commandQueue, randBuf, CL_FALSE, CL_MAP_WRITE, 0, size, 0, NULL, &inMapEvt1, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data for input for kernel 1 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); // Wait for map of input of kernel 1 eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inMapEvt1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(inMapEvt1); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inMapEvt1) failed."); memcpy(inMapPtr1, (void*)randNum, size); // Unmap of input buffer of kernel 1 status = clEnqueueUnmapMemObject( commandQueue, randBuf, inMapPtr1, 0, NULL, &inUnmapEvt1); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Get data from output buffers of kernel 2 if(k != 0) { // Wait for kernel 2 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(ndrEvt) failed."); outMapPtr21 = clEnqueueMapBuffer( commandQueue, priceBufAsync, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt21, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBufAsync) failed."); outMapPtr22 = clEnqueueMapBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt22, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Set up arguments required for kernel 1 float c1 = (interest - 0.5f * sigma[k * 2] * sigma[k * 2]) * timeStep; float c2 = sigma[k * 2] * sqrt(timeStep); float c3 = (interest + 0.5f * sigma[k * 2] * sigma[k * 2]); const cl_float4 c1F4 = {c1, c1, c1, c1}; attributes.c1 = c1F4; const cl_float4 c2F4 = {c2, c2, c2, c2}; attributes.c2 = c2F4; const cl_float4 c3F4 = {c3, c3, c3, c3}; attributes.c3 = c3F4; const cl_float4 initPriceF4 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF4; const cl_float4 strikePriceF4 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF4; const cl_float4 sigmaF4 = {sigma[k * 2], sigma[k * 2], sigma[k * 2], sigma[k * 2]}; attributes.sigma = sigmaF4; const cl_float4 timeStepF4 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF4; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBuf); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); status = clSetKernelArg(kernel, 6, blockSizeX * blockSizeY*sizeof(cl_float8), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(sData1) failed."); // Wait for input of kernel 1 to complete eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inUnmapEvt1, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(inUnmapEvt1); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inUnmapEvt1) failed."); inMapPtr1 = NULL; // Enqueue kernel 1 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data of input for kernel 2 // Fill data of input buffer for kernel 2 if(k <= steps - 1) { // Map input buffer for kernel 1 inMapPtr2 = clEnqueueMapBuffer( commandQueue, randBufAsync, CL_FALSE, CL_MAP_WRITE, 0, size, 0, NULL, &inMapEvt2, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Generate data for input for kernel 1 for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); // Wait for map of input of kernel 1 eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inMapEvt2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(inMapEvt2); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inMapEvt2) failed."); memcpy(inMapPtr2, (void*)randNum, size); // Unmap of input buffer of kernel 1 status = clEnqueueUnmapMemObject( commandQueue, randBufAsync, inMapPtr2, 0, NULL, &inUnmapEvt2); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(randBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); } // Wait for output buffers of kernel 2 to complete // Calculate the results from output of kernel 2 if(k != 0) { // Wait for output buffers of kernel 2 to complete eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt21, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt21); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt21) failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt22, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt22); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt22) failed."); // Calculate the results from output of kernel 2 ptr21 = (cl_float*)outMapPtr21; ptr22 = (cl_float*)outMapPtr22; for(size_t i = 0; i < sizeAfterReduction; i++) { price[k * 2 - 1] += ptr21[i]; vega[k * 2 - 1] += ptr22[i]; } price[k * 2 - 1] /= (noOfTraj * noOfTraj); vega[k * 2 - 1] /= (noOfTraj * noOfTraj); price[k * 2 - 1] = exp(-interest * maturity) * price[k * 2 - 1]; vega[k * 2 - 1] = exp(-interest * maturity) * vega[k * 2 - 1]; // Unmap of output buffers of kernel 2 status = clEnqueueUnmapMemObject( commandQueue, priceBufAsync, outMapPtr21, 0, NULL, &outUnmapEvt21); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceBufAsync) failed."); status = clEnqueueUnmapMemObject( commandQueue, priceDerivBufAsync, outMapPtr22, 0, NULL, &outUnmapEvt22); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceDerivBufAsync) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outUnmapEvt21, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outUnmapEvt21); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outUnmapEvt21) failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outUnmapEvt22, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outUnmapEvt22); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outUnmapEvt22) failed."); } // Wait for kernel 1 to complete status = sampleCommon->waitForEventAndRelease(&ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(ndrEvt) failed."); // Get data from output buffers of kernel 1 outMapPtr11 = clEnqueueMapBuffer( commandQueue, priceBuf, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt11, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBuf) failed."); outMapPtr12 = clEnqueueMapBuffer( commandQueue, priceDerivBuf, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt12, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Set up arguments required for kernel 2 float c21 = (interest - 0.5f * sigma[k * 2 + 1] * sigma[k * 2 + 1]) * timeStep; float c22 = sigma[k * 2 + 1] * sqrt(timeStep); float c23 = (interest + 0.5f * sigma[k * 2 + 1] * sigma[k * 2 + 1]); const cl_float4 c1F42 = {c21, c21, c21, c21}; attributes.c1 = c1F42; const cl_float4 c2F42 = {c22, c22, c22, c22}; attributes.c2 = c2F42; const cl_float4 c3F42 = {c23, c23, c23, c23}; attributes.c3 = c3F42; const cl_float4 initPriceF42 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF42; const cl_float4 strikePriceF42 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF42; const cl_float4 sigmaF42 = {sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1], sigma[k * 2 + 1]}; attributes.sigma = sigmaF42; const cl_float4 timeStepF42 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF42; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&randBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(randBuf) failed."); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&priceBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceBuf) failed."); status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&priceDerivBufAsync); CHECK_OPENCL_ERROR(status, "clSetKernelArg(priceDerivBuf) failed."); status = clSetKernelArg(kernel, 6, blockSizeX * blockSizeY*sizeof(cl_float8), NULL); CHECK_OPENCL_ERROR(status, "clSetKernelArg(sData1) failed."); // Wait for input of kernel 2 to complete eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inUnmapEvt2, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(inUnmapEvt2); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inUnmapEvt2) failed."); inMapPtr2 = NULL; // Enqueue kernel 2 status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // Wait for output buffers of kernel 1 to complete eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt11, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt11); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt11) failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt12, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt12); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt12) failed."); // Calculate the results from output of kernel 2 ptr21 = (cl_float*)outMapPtr11; ptr22 = (cl_float*)outMapPtr12; for(size_t i = 0; i < sizeAfterReduction; i++) { price[k * 2] += ptr21[i]; vega[k * 2] += ptr22[i]; } price[k * 2] /= (noOfTraj * noOfTraj); vega[k * 2] /= (noOfTraj * noOfTraj); price[k * 2] = exp(-interest * maturity) * price[k * 2]; vega[k * 2] = exp(-interest * maturity) * vega[k * 2]; // Unmap of output buffers of kernel 2 status = clEnqueueUnmapMemObject( commandQueue, priceBuf, outMapPtr11, 0, NULL, &outUnmapEvt11); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceBuf) failed."); status = clEnqueueUnmapMemObject( commandQueue, priceDerivBuf, outMapPtr12, 0, NULL, &outUnmapEvt12); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outUnmapEvt11, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outUnmapEvt11); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outUnmapEvt11) failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outUnmapEvt12, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outUnmapEvt12); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outUnmapEvt12) failed."); } // Wait for kernel 1 to complete status = clWaitForEvents(1, &ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEvents(ndrEvt) failed."); status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent(ndrEvt) failed."); // Gather last kernel 2 execution here outMapPtr21 = clEnqueueMapBuffer( commandQueue, priceBufAsync, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt21, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceBuf) failed."); outMapPtr22 = clEnqueueMapBuffer( commandQueue, priceDerivBufAsync, CL_FALSE, CL_MAP_READ, 0, sizeAfterReduction, 0, NULL, &outMapEvt22, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt21, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt21); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt21) failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( outMapEvt22, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } status = clReleaseEvent(outMapEvt22); CHECK_OPENCL_ERROR(status, "clReleaseEvent(outMapEvt22) failed."); // Calculate the results from output of kernel 2 ptr21 = (cl_float*)outMapPtr21; ptr22 = (cl_float*)outMapPtr22; for(size_t i = 0; i < sizeAfterReduction; i++) { price[steps - 1] += ptr21[i]; vega[steps - 1] += ptr22[i]; } price[steps - 1] /= (noOfTraj * noOfTraj); vega[steps - 1] /= (noOfTraj * noOfTraj); price[steps - 1] = exp(-interest * maturity) * price[steps - 1]; vega[steps - 1] = exp(-interest * maturity) * vega[steps - 1]; } } else { cl_event events[1]; for(int k = 0; k < steps; k++) { for(int j = 0; j < (width * height * 4); j++) randNum[j] = (cl_uint)rand(); if(!dUseInPersistent) { if(disableMapping) { status = clEnqueueWriteBuffer( commandQueue, randBuf, CL_TRUE, 0, width * height * sizeof(cl_float4), randNum, 0, NULL, NULL); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer(randBuf) failed."); } else { cl_event inEvent; cl_event inUnEvent; void* mapPtr = clEnqueueMapBuffer( commandQueue, randBuf, CL_FALSE, CL_MAP_WRITE, 0, width * height * sizeof(cl_float4), 0, NULL, &inEvent, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); cl_int eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } memcpy(mapPtr, (void*)randNum, width * height * sizeof(cl_float4)); status = clEnqueueUnmapMemObject( commandQueue, randBuf, mapPtr, 0, NULL, &inUnEvent); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(randBuf) failed."); status = clReleaseEvent(inEvent); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inEvent) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&inUnEvent); CHECK_OPENCL_ERROR(status, "clWaitForEventAndRelease(inUnEvent) failed."); } } float c1 = (interest - 0.5f * sigma[k] * sigma[k]) * timeStep; float c2 = sigma[k] * sqrt(timeStep); float c3 = (interest + 0.5f * sigma[k] * sigma[k]); const cl_float4 c1F4 = {c1, c1, c1, c1}; attributes.c1 = c1F4; const cl_float4 c2F4 = {c2, c2, c2, c2}; attributes.c2 = c2F4; const cl_float4 c3F4 = {c3, c3, c3, c3}; attributes.c3 = c3F4; const cl_float4 initPriceF4 = {initPrice, initPrice, initPrice, initPrice}; attributes.initPrice = initPriceF4; const cl_float4 strikePriceF4 = {strikePrice, strikePrice, strikePrice, strikePrice}; attributes.strikePrice = strikePriceF4; const cl_float4 sigmaF4 = {sigma[k], sigma[k], sigma[k], sigma[k]}; attributes.sigma = sigmaF4; const cl_float4 timeStepF4 = {timeStep, timeStep, timeStep, timeStep}; attributes.timeStep = timeStepF4; // Set appropriate arguments to the kernel status = clSetKernelArg(kernel, 0, sizeof(attributes), (void*)&attributes); CHECK_OPENCL_ERROR(status, "clSetKernelArg(attributes) failed."); // Enqueue a kernel run call. status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel() failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); // wait for the kernel call to finish execution status = sampleCommon->waitForEventAndRelease(&events[0]); CHECK_ERROR(status,0, "WaitForEventAndRelease(events[0]) Failed"); if(!disableMapping) { cl_event inEvent; cl_event inUnEvent; void* mapPtr = clEnqueueMapBuffer( commandQueue, priceBuf, CL_FALSE, CL_MAP_READ, 0, width * height * 2 * sizeof(cl_float4), 0, NULL, &inEvent, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(randBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } memcpy((void*)priceVals, mapPtr, width * height * 2 * sizeof(cl_float4)); status = clEnqueueUnmapMemObject( commandQueue, priceBuf, mapPtr, 0, NULL, &inUnEvent); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceBuf) failed."); status = clReleaseEvent(inEvent); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inEvent) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&inUnEvent); CHECK_ERROR(status,0, "WaitForEventAndRelease(inUnEvent) Failed"); mapPtr = clEnqueueMapBuffer( commandQueue, priceDerivBuf, CL_FALSE, CL_MAP_READ, 0, width * height * 2 * sizeof(cl_float4), 0, NULL, &inEvent, &status); CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer(priceDerivBuf) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); eventStatus = CL_QUEUED; while(eventStatus != CL_COMPLETE) { status = clGetEventInfo( inEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &eventStatus, NULL); CHECK_OPENCL_ERROR(status, "clGetEventInfo failed."); } memcpy((void*)priceDeriv, mapPtr, width * height * 2 * sizeof(cl_float4)); status = clEnqueueUnmapMemObject( commandQueue, priceDerivBuf, mapPtr, 0, NULL, &inUnEvent); CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject(priceDerivBuf) failed."); status = clReleaseEvent(inEvent); CHECK_OPENCL_ERROR(status, "clReleaseEvent(inEvent) failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush() failed."); status = sampleCommon->waitForEventAndRelease(&inUnEvent); CHECK_OPENCL_ERROR(status, "WaitForEventAndRelease(inUnEvent) Failed"); } else { // Enqueue the results to application pointer status = clEnqueueReadBuffer(commandQueue, priceBuf, CL_TRUE, 0, width * height * 2 * sizeof(cl_float4), priceVals, 0, NULL, &events[0]); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceBuf) failed."); // wait for the read buffer to finish execution status = sampleCommon->waitForEventAndRelease(&events[0]); CHECK_OPENCL_ERROR(status, "clWaitForEventsAndRelease(events[0]) failed."); // Enqueue the results to application pointer status = clEnqueueReadBuffer(commandQueue, priceDerivBuf, CL_TRUE, 0, width * height * 2 * sizeof(cl_float4), priceDeriv, 0, NULL, &events[0]); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer(priceDerivBuf) failed."); // wait for the read buffer to finish execution status = sampleCommon->waitForEventAndRelease(&events[0]); CHECK_OPENCL_ERROR(status, "clWaitForEventsAndRelease(events[0]) failed."); } // Replace Following "for" loop with reduction kernel for(size_t i = 0; i < sizeAfterReduction; i++) { price[k] += priceVals[i]; vega[k] += priceDeriv[i]; } price[k] /= (noOfTraj * noOfTraj); vega[k] /= (noOfTraj * noOfTraj); price[k] = exp(-interest * maturity) * price[k]; vega[k] = exp(-interest * maturity) * vega[k]; } } return SDK_SUCCESS; } int MonteCarloAsian::initialize() { // Call base class Initialize to get default configuration CHECK_ERROR(this->SDKSample::initialize(), SDK_SUCCESS, "OpenCL resource initilization failed"); const int optionsCount = 5; streamsdk::Option *optionList = new streamsdk::Option[optionsCount]; CHECK_ALLOCATION(optionList, " Allocate memory failed (optionList)\n"); optionList[0]._sVersion = "c"; optionList[0]._lVersion = "steps"; optionList[0]._description = "Steps of Monte carlo simuation"; optionList[0]._type = streamsdk::CA_ARG_INT; optionList[0]._value = &steps; optionList[1]._sVersion = "P"; optionList[1]._lVersion = "initPrice"; optionList[1]._description = "Initial price(Default value 50)"; optionList[1]._type = streamsdk::CA_ARG_FLOAT;//STRING; optionList[1]._value = &initPrice; optionList[2]._sVersion = "s"; optionList[2]._lVersion = "strikePrice"; optionList[2]._description = "Strike price (Default value 55)"; optionList[2]._type = streamsdk::CA_ARG_FLOAT;//STRING; optionList[2]._value = &strikePrice; optionList[3]._sVersion = "r"; optionList[3]._lVersion = "interest"; optionList[3]._description = "interest rate (Default value 0.06)"; optionList[3]._type = streamsdk::CA_ARG_FLOAT;//STRING; optionList[3]._value = &interest; optionList[4]._sVersion = "m"; optionList[4]._lVersion = "maturity"; optionList[4]._description = "Maturity (Default value 1)"; optionList[4]._type = streamsdk::CA_ARG_FLOAT;//STRING; optionList[4]._value = &maturity; for(cl_int i = 0; i < optionsCount; ++i) sampleArgs->AddOption(&optionList[i]); delete[] optionList; streamsdk::Option* iteration_option = new streamsdk::Option; CHECK_ALLOCATION(iteration_option, "Failed to allocate memory (iteration_option)\n"); iteration_option->_sVersion = "i"; iteration_option->_lVersion = "iterations"; iteration_option->_description = "Number of iterations to execute kernel"; iteration_option->_type = streamsdk::CA_ARG_INT; iteration_option->_value = &iterations; sampleArgs->AddOption(iteration_option); delete iteration_option; streamsdk::Option* inPersistent_option = new streamsdk::Option; CHECK_ALLOCATION(inPersistent_option, "Failed to allocate memory (inPersistent_option)\n"); inPersistent_option->_sVersion = ""; inPersistent_option->_lVersion = "dInPersistent"; inPersistent_option->_description = "Disables the Persistent memory for input buffers"; inPersistent_option->_type = streamsdk::CA_NO_ARGUMENT; inPersistent_option->_value = &dUseInPersistent; sampleArgs->AddOption(inPersistent_option); delete inPersistent_option; streamsdk::Option* outAllocHostPtr_option = new streamsdk::Option; CHECK_ALLOCATION(outAllocHostPtr_option, "Failed to allocate memory (outAllocHostPtr_option)\n"); outAllocHostPtr_option->_sVersion = ""; outAllocHostPtr_option->_lVersion = "dOutAllocHostPtr"; outAllocHostPtr_option->_description = "Disables the Alloc host ptr for output buffers"; outAllocHostPtr_option->_type = streamsdk::CA_NO_ARGUMENT; outAllocHostPtr_option->_value = &dUseOutAllocHostPtr; sampleArgs->AddOption(outAllocHostPtr_option); delete outAllocHostPtr_option; streamsdk::Option* disableMapping_option = new streamsdk::Option; CHECK_ALLOCATION(disableMapping_option, "Failed to allocate memory (disableMapping_option)\n"); disableMapping_option->_sVersion = ""; disableMapping_option->_lVersion = "dMapping"; disableMapping_option->_description = "Disables mapping/unmapping and uses read/write buffers."; disableMapping_option->_type = streamsdk::CA_NO_ARGUMENT; disableMapping_option->_value = &disableMapping; sampleArgs->AddOption(disableMapping_option); delete disableMapping_option; streamsdk::Option* disableAsync_option = new streamsdk::Option; CHECK_ALLOCATION(disableAsync_option, "Failed to allocate memory (disableAsync_option)\n"); disableAsync_option->_sVersion = ""; disableAsync_option->_lVersion = "dAsync"; disableAsync_option->_description = "Disables Asynchronous."; disableAsync_option->_type = streamsdk::CA_NO_ARGUMENT; disableAsync_option->_value = &disableAsync; sampleArgs->AddOption(disableAsync_option); delete disableAsync_option; return SDK_SUCCESS; } int MonteCarloAsian::setup() { if(iterations < 1) { std::cout<<"Error, iterations cannot be 0 or negative. Exiting..\n"; exit(0); } if(setupMonteCarloAsian() != SDK_SUCCESS) return SDK_FAILURE; // create and initialize timers int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); if(setupCL() != SDK_SUCCESS) return SDK_FAILURE; sampleCommon->stopTimer(timer); // Compute setup time setupTime = (double)(sampleCommon->readTimer(timer)); return SDK_SUCCESS; } int MonteCarloAsian::run() { // Warm up for(int i = 0; i < 2 && iterations != 1; i++) { // Arguments are set and execution call is enqueued on command buffer if(runCLKernels() != SDK_SUCCESS) return SDK_FAILURE; } std::cout<<"Executing kernel for " << iterations << " iterations" << std::endl; std::cout<<"-------------------------------------------" << std::endl; // create and initialize timers int timer = sampleCommon->createTimer(); sampleCommon->resetTimer(timer); sampleCommon->startTimer(timer); for(int i = 0; i < iterations; i++) { // Arguments are set and execution call is enqueued on command buffer if(runCLKernels() != SDK_SUCCESS) return SDK_FAILURE; } sampleCommon->stopTimer(timer); // Compute average kernel time kernelTime = (double)(sampleCommon->readTimer(timer)) / iterations; if(!quiet) { sampleCommon->printArray("price", price, steps, 1); sampleCommon->printArray("vega", vega, steps, 1); } return SDK_SUCCESS; } void MonteCarloAsian::printStats() { std::string strArray[4] = { "Steps", "Time(sec)", "[Transfer+Kernel](sec)", "Samples used /sec" }; std::string stats[4]; totalTime = setupTime + kernelTime; stats[0] = sampleCommon->toString(steps, std::dec); stats[1] = sampleCommon->toString(totalTime, std::dec); stats[2] = sampleCommon->toString(kernelTime, std::dec); stats[3] = sampleCommon->toString((noOfTraj * (noOfSum - 1) * steps) / kernelTime, std::dec); this->SDKSample::printStats(strArray, stats, 4); } void MonteCarloAsian::lshift128(unsigned int* input, unsigned int shift, unsigned int * output) { unsigned int invshift = 32u - shift; output[0] = input[0] << shift; output[1] = (input[1] << shift) | (input[0] >> invshift); output[2] = (input[2] << shift) | (input[1] >> invshift); output[3] = (input[3] << shift) | (input[2] >> invshift); } void MonteCarloAsian::rshift128(unsigned int* input, unsigned int shift, unsigned int* output) { unsigned int invshift = 32u - shift; output[3]= input[3] >> shift; output[2] = (input[2] >> shift) | (input[0] >> invshift); output[1] = (input[1] >> shift) | (input[1] >> invshift); output[0] = (input[0] >> shift) | (input[2] >> invshift); } void MonteCarloAsian::generateRand(unsigned int* seed, float *gaussianRand1, float *gaussianRand2, unsigned int* nextRand) { unsigned int mulFactor = 4; unsigned int temp[8][4]; unsigned int state1[4] = {seed[0], seed[1], seed[2], seed[3]}; unsigned int state2[4] = {0u, 0u, 0u, 0u}; unsigned int state3[4] = {0u, 0u, 0u, 0u}; unsigned int state4[4] = {0u, 0u, 0u, 0u}; unsigned int state5[4] = {0u, 0u, 0u, 0u}; unsigned int stateMask = 1812433253u; unsigned int thirty = 30u; unsigned int mask4[4] = {stateMask, stateMask, stateMask, stateMask}; unsigned int thirty4[4] = {thirty, thirty, thirty, thirty}; unsigned int one4[4] = {1u, 1u, 1u, 1u}; unsigned int two4[4] = {2u, 2u, 2u, 2u}; unsigned int three4[4] = {3u, 3u, 3u, 3u}; unsigned int four4[4] = {4u, 4u, 4u, 4u}; unsigned int r1[4] = {0u, 0u, 0u, 0u}; unsigned int r2[4] = {0u, 0u, 0u, 0u}; unsigned int a[4] = {0u, 0u, 0u, 0u}; unsigned int b[4] = {0u, 0u, 0u, 0u}; unsigned int e[4] = {0u, 0u, 0u, 0u}; unsigned int f[4] = {0u, 0u, 0u, 0u}; unsigned int thirteen = 13u; unsigned int fifteen = 15u; unsigned int shift = 8u * 3u; unsigned int mask11 = 0xfdff37ffu; unsigned int mask12 = 0xef7f3f7du; unsigned int mask13 = 0xff777b7du; unsigned int mask14 = 0x7ff7fb2fu; const float one = 1.0f; const float intMax = 4294967296.0f; const float PI = 3.14159265358979f; const float two = 2.0f; float r[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float phi[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float temp1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float temp2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; //Initializing states. for(int c = 0; c < 4; ++c) { state2[c] = mask4[c] * (state1[c] ^ (state1[c] >> thirty4[c])) + one4[c]; state3[c] = mask4[c] * (state2[c] ^ (state2[c] >> thirty4[c])) + two4[c]; state4[c] = mask4[c] * (state3[c] ^ (state3[c] >> thirty4[c])) + three4[c]; state5[c] = mask4[c] * (state4[c] ^ (state4[c] >> thirty4[c])) + four4[c]; } unsigned int i = 0; for(i = 0; i < mulFactor; ++i) { switch(i) { case 0: for(int c = 0; c < 4; ++c) { r1[c] = state4[c]; r2[c] = state5[c]; a[c] = state1[c]; b[c] = state3[c]; } break; case 1: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[0][c]; a[c] = state2[c]; b[c] = state4[c]; } break; case 2: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[1][c]; a[c] = state3[c]; b[c] = state5[c]; } break; case 3: for(int c = 0; c < 4; ++c) { r1[c] = r2[c]; r2[c] = temp[2][c]; a[c] = state4[c]; b[c] = state1[c]; } break; default: break; } lshift128(a, shift, e); rshift128(r1, shift, f); temp[i][0] = a[0] ^ e[0] ^ ((b[0] >> thirteen) & mask11) ^ f[0] ^ (r2[0] << fifteen); temp[i][1] = a[1] ^ e[1] ^ ((b[1] >> thirteen) & mask12) ^ f[1] ^ (r2[1] << fifteen); temp[i][2] = a[2] ^ e[2] ^ ((b[2] >> thirteen) & mask13) ^ f[2] ^ (r2[2] << fifteen); temp[i][3] = a[3] ^ e[3] ^ ((b[3] >> thirteen) & mask14) ^ f[3] ^ (r2[3] << fifteen); } for(int c = 0; c < 4; ++c) { temp1[c] = temp[0][c] * one / intMax; temp2[c] = temp[1][c] * one / intMax; } for(int c = 0; c < 4; ++c) { // Applying Box Mullar Transformations. r[c] = sqrt((-two) * log(temp1[c])); phi[c] = two * PI * temp2[c]; gaussianRand1[c] = r[c] * cos(phi[c]); gaussianRand2[c] = r[c] * sin(phi[c]); nextRand[c] = temp[2][c]; } } void MonteCarloAsian::calOutputs(float strikePrice, float* meanDeriv1, float* meanDeriv2, float* meanPrice1, float* meanPrice2, float* pathDeriv1, float* pathDeriv2, float* priceVec1, float* priceVec2) { float temp1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float temp2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float temp3[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float temp4[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float tempDiff1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float tempDiff2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; for(int c = 0; c < 4; ++c) { tempDiff1[c] = meanPrice1[c] - strikePrice; tempDiff2[c] = meanPrice2[c] - strikePrice; } if(tempDiff1[0] > 0.0f) { temp1[0] = 1.0f; temp3[0] = tempDiff1[0]; } if(tempDiff1[1] > 0.0f) { temp1[1] = 1.0f; temp3[1] = tempDiff1[1]; } if(tempDiff1[2] > 0.0f) { temp1[2] = 1.0f; temp3[2] = tempDiff1[2]; } if(tempDiff1[3] > 0.0f) { temp1[3] = 1.0f; temp3[3] = tempDiff1[3]; } if(tempDiff2[0] > 0.0f) { temp2[0] = 1.0f; temp4[0] = tempDiff2[0]; } if(tempDiff2[1] > 0.0f) { temp2[1] = 1.0f; temp4[1] = tempDiff2[1]; } if(tempDiff2[2] > 0.0f) { temp2[2] = 1.0f; temp4[2] = tempDiff2[2]; } if(tempDiff2[3] > 0.0f) { temp2[3] = 1.0f; temp4[3] = tempDiff2[3]; } for(int c = 0; c < 4; ++c) { pathDeriv1[c] = meanDeriv1[c] * temp1[c]; pathDeriv2[c] = meanDeriv2[c] * temp2[c]; priceVec1[c] = temp3[c]; priceVec2[c] = temp4[c]; } } void MonteCarloAsian::cpuReferenceImpl() { float timeStep = maturity / (noOfSum - 1); // Initialize random number generator srand(1); for(int k = 0; k < steps; k++) { float c1 = (interest - 0.5f * sigma[k] * sigma[k]) * timeStep; float c2 = sigma[k] * sqrt(timeStep); float c3 = (interest + 0.5f * sigma[k] * sigma[k]); for(int j = 0; j < (width * height); j++) { unsigned int nextRand[4] = {0u, 0u, 0u, 0u}; for(int c = 0; c < 4; ++c) nextRand[c] = (cl_uint)rand(); float trajPrice1[4] = {initPrice, initPrice, initPrice, initPrice}; float sumPrice1[4] = {initPrice, initPrice, initPrice, initPrice}; float sumDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float meanPrice1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float meanDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float price1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float pathDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float trajPrice2[4] = {initPrice, initPrice, initPrice, initPrice}; float sumPrice2[4] = {initPrice, initPrice, initPrice, initPrice}; float sumDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float meanPrice2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float meanDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float price2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float pathDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; //Run the Monte Carlo simulation a total of Num_Sum - 1 times for(int i = 1; i < noOfSum; i++) { unsigned int tempRand[4] = {0u, 0u, 0u, 0u}; for(int c = 0; c < 4; ++c) tempRand[c] = nextRand[c]; float gaussian1[4] = {0.0f, 0.0f, 0.0f, 0.0f}; float gaussian2[4] = {0.0f, 0.0f, 0.0f, 0.0f}; generateRand(tempRand, gaussian1, gaussian2, nextRand); //Calculate the trajectory price and sum price for all trajectories for(int c = 0; c < 4; ++c) { trajPrice1[c] = trajPrice1[c] * exp(c1 + c2 * gaussian1[c]); trajPrice2[c] = trajPrice2[c] * exp(c1 + c2 * gaussian2[c]); sumPrice1[c] = sumPrice1[c] + trajPrice1[c]; sumPrice2[c] = sumPrice2[c] + trajPrice2[c]; float temp = c3 * timeStep * i; // Calculate the derivative price for all trajectories sumDeriv1[c] = sumDeriv1[c] + trajPrice1[c] * ((log(trajPrice1[c] / initPrice) - temp) / sigma[k]); sumDeriv2[c] = sumDeriv2[c] + trajPrice2[c] * ((log(trajPrice2[c] / initPrice) - temp) / sigma[k]); } } //Calculate the average price and average derivative of each simulated path for(int c = 0; c < 4; ++c) { meanPrice1[c] = sumPrice1[c] / noOfSum; meanPrice2[c] = sumPrice2[c] / noOfSum; meanDeriv1[c] = sumDeriv1[c] / noOfSum; meanDeriv2[c] = sumDeriv2[c] / noOfSum; } calOutputs(strikePrice, meanDeriv1, meanDeriv2, meanPrice1, meanPrice2, pathDeriv1, pathDeriv2, price1, price2); for(int c = 0; c < 4; ++c) { priceVals[j * 8 + c] = price1[c]; priceVals[j * 8 + 1 * 4 + c] = price2[c]; priceDeriv[j * 8 + c] = pathDeriv1[c]; priceDeriv[j * 8 + 1 * 4 + c] = pathDeriv2[c]; } } // Replace Following "for" loop with reduction kernel for(int i = 0; i < noOfTraj * noOfTraj; i++) { refPrice[k] += priceVals[i]; refVega[k] += priceDeriv[i]; } refPrice[k] /= (noOfTraj * noOfTraj); refVega[k] /= (noOfTraj * noOfTraj); refPrice[k] = exp(-interest * maturity) * refPrice[k]; refVega[k] = exp(-interest * maturity) * refVega[k]; } } int MonteCarloAsian::verifyResults() { if(verify) { /* reference implementation * it overwrites the input array with the output */ cpuReferenceImpl(); // compare the results and see if they match for(int i = 0; i < steps; ++i) { if(fabs(price[i] - refPrice[i]) > 0.2f) { std::cout << "Failed\n"; return SDK_FAILURE; } if(fabs(vega[i] - refVega[i]) > 0.2f) { std::cout << "Failed\n" << std::endl; return SDK_FAILURE; } } std::cout << "Passed!\n" << std::endl; } return SDK_SUCCESS; } int MonteCarloAsian::cleanup() { // Releases OpenCL resources (Context, Memory etc.) cl_int status; status = clReleaseMemObject(priceBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(priceBuf) failed."); status = clReleaseMemObject(priceDerivBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(priceDerivBuf) failed."); status = clReleaseMemObject(randBuf); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(randBuf) failed."); if(!disableAsync) { status = clReleaseMemObject(priceBufAsync); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(priceBufAsync) failed."); status = clReleaseMemObject(priceDerivBufAsync); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(priceDerivBufAsync) failed."); status = clReleaseMemObject(randBufAsync); CHECK_OPENCL_ERROR(status, "clReleaseMemObject(randBufAsync) failed."); } status = clReleaseKernel(kernel); CHECK_OPENCL_ERROR(status, "clReleaseKernel(kernel) failed."); status = clReleaseProgram(program); CHECK_OPENCL_ERROR(status, "clReleaseProgram(program) failed."); status = clReleaseCommandQueue(commandQueue); CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue(readKernel) failed."); status = clReleaseContext(context); CHECK_OPENCL_ERROR(status, "clReleaseContext(context) failed."); // Release program resources (input memory etc.) FREE(sigma); FREE(price); FREE(vega); FREE(refPrice); FREE(refVega); FREE(priceVals); FREE(priceDeriv); if(!disableAsync && disableMapping) { FREE(priceValsAsync); FREE(priceDerivAsync); } FREE(devices); return SDK_SUCCESS; } int main(int argc, char * argv[]) { // Create MonteCalroAsian object MonteCarloAsian clMonteCarloAsian( "OpenCL Monte Carlo simulation for Asian Option Pricing"); // Initialization if(clMonteCarloAsian.initialize()!=SDK_SUCCESS) return SDK_FAILURE; // Parse command line options if(clMonteCarloAsian.parseCommandLine(argc, argv)) return SDK_FAILURE; if(clMonteCarloAsian.isDumpBinaryEnabled()) return clMonteCarloAsian.genBinaryImage(); else { // Setup if(clMonteCarloAsian.setup() != SDK_SUCCESS) return SDK_FAILURE; // Run if(clMonteCarloAsian.run() != SDK_SUCCESS) return SDK_FAILURE; //VerifyResults. if(clMonteCarloAsian.verifyResults() != SDK_SUCCESS) return SDK_FAILURE; // Cleanup resources created if(clMonteCarloAsian.cleanup() != SDK_SUCCESS) return SDK_FAILURE; // Print performance statistics clMonteCarloAsian.printStats(); } return SDK_SUCCESS; }