--- libreoffice-6.4.2.2-orig/sc/source/core/opencl/formulagroupcl.cxx 2020-03-12 03:18:35.000000000 +1100 +++ libreoffice-6.4.2.2/sc/source/core/opencl/formulagroupcl.cxx 2020-03-24 16:14:55.930166474 +1100 @@ -1026,9 +1026,6 @@ /// Handling a Double Vector that is used as a sliding window input /// to either a sliding window average or sum-of-products /// Generate a sequential loop for reductions -class OpAverage; -class OpCount; - template class DynamicKernelSlidingArgument : public Base { @@ -1335,185 +1332,7 @@ } /// Emit the definition for the auxiliary reduction kernel - virtual void GenSlidingWindowFunction( std::stringstream& ss ) - { - if (!dynamic_cast(mpCodeGen.get())) - { - std::string name = Base::GetName(); - ss << "__kernel void " << name; - ss << "_reduction(__global double* A, " - "__global double *result,int arrayLength,int windowSize){\n"; - ss << " double tmp, current_result =" << - mpCodeGen->GetBottom(); - ss << ";\n"; - ss << " int writePos = get_group_id(1);\n"; - ss << " int lidx = get_local_id(0);\n"; - ss << " __local double shm_buf[256];\n"; - if (mpDVR->IsStartFixed()) - ss << " int offset = 0;\n"; - else // if (!mpDVR->IsStartFixed()) - ss << " int offset = get_group_id(1);\n"; - if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = offset + windowSize;\n"; - else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = windowSize + get_group_id(1);\n"; - else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - ss << " end = min(end, arrayLength);\n"; - - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " int loop = arrayLength/512 + 1;\n"; - ss << " for (int l=0; lGen2( - "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; - ss << " tmp = legalize(" << mpCodeGen->Gen2( - "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n"; - ss << " } else if ((loopOffset + lidx + offset) < end)\n"; - ss << " tmp = legalize(" << mpCodeGen->Gen2( - "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; - ss << " shm_buf[lidx] = tmp;\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " for (int i = 128; i >0; i/=2) {\n"; - ss << " if (lidx < i)\n"; - ss << " shm_buf[lidx] = "; - // Special case count - if (dynamic_cast(mpCodeGen.get())) - ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; - else - ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " current_result ="; - if (dynamic_cast(mpCodeGen.get())) - ss << "current_result + shm_buf[0]"; - else - ss << mpCodeGen->Gen2("current_result", "shm_buf[0]"); - ss << ";\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " result[writePos] = current_result;\n"; - ss << "}\n"; - } - else - { - std::string name = Base::GetName(); - /*sum reduction*/ - ss << "__kernel void " << name << "_sum"; - ss << "_reduction(__global double* A, " - "__global double *result,int arrayLength,int windowSize){\n"; - ss << " double tmp, current_result =" << - mpCodeGen->GetBottom(); - ss << ";\n"; - ss << " int writePos = get_group_id(1);\n"; - ss << " int lidx = get_local_id(0);\n"; - ss << " __local double shm_buf[256];\n"; - if (mpDVR->IsStartFixed()) - ss << " int offset = 0;\n"; - else // if (!mpDVR->IsStartFixed()) - ss << " int offset = get_group_id(1);\n"; - if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = offset + windowSize;\n"; - else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = windowSize + get_group_id(1);\n"; - else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - ss << " end = min(end, arrayLength);\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " int loop = arrayLength/512 + 1;\n"; - ss << " for (int l=0; l0; i/=2) {\n"; - ss << " if (lidx < i)\n"; - ss << " shm_buf[lidx] = "; - ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " current_result ="; - ss << "current_result + shm_buf[0]"; - ss << ";\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " result[writePos] = current_result;\n"; - ss << "}\n"; - /*count reduction*/ - ss << "__kernel void " << name << "_count"; - ss << "_reduction(__global double* A, " - "__global double *result,int arrayLength,int windowSize){\n"; - ss << " double tmp, current_result =" << - mpCodeGen->GetBottom(); - ss << ";\n"; - ss << " int writePos = get_group_id(1);\n"; - ss << " int lidx = get_local_id(0);\n"; - ss << " __local double shm_buf[256];\n"; - if (mpDVR->IsStartFixed()) - ss << " int offset = 0;\n"; - else // if (!mpDVR->IsStartFixed()) - ss << " int offset = get_group_id(1);\n"; - if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = offset + windowSize;\n"; - else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) - ss << " int end = windowSize + get_group_id(1);\n"; - else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) - ss << " int end = windowSize;\n"; - ss << " end = min(end, arrayLength);\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " int loop = arrayLength/512 + 1;\n"; - ss << " for (int l=0; l0; i/=2) {\n"; - ss << " if (lidx < i)\n"; - ss << " shm_buf[lidx] = "; - ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " current_result ="; - ss << "current_result + shm_buf[0];"; - ss << ";\n"; - ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; - ss << " }\n"; - ss << " if (lidx == 0)\n"; - ss << " result[writePos] = current_result;\n"; - ss << "}\n"; - } - - } + virtual void GenSlidingWindowFunction( std::stringstream& ss ); virtual std::string GenSlidingWindowDeclRef( bool ) const { @@ -1527,194 +1346,9 @@ /// Controls how the elements in the DoubleVectorRef are traversed size_t GenReductionLoopHeader( - std::stringstream& ss, int nResultSize, bool& needBody ) - { - assert(mpDVR); - size_t nCurWindowSize = mpDVR->GetRefRowSize(); - std::string temp = Base::GetName() + "[gid0]"; - ss << "tmp = "; - // Special case count - if (dynamic_cast(mpCodeGen.get())) - { - ss << mpCodeGen->Gen2(temp, "tmp") << ";\n"; - ss << "nCount = nCount-1;\n"; - ss << "nCount = nCount +"; /*re-assign nCount from count reduction*/ - ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n"; - } - else if (dynamic_cast(mpCodeGen.get())) - ss << temp << "+ tmp"; - else - ss << mpCodeGen->Gen2(temp, "tmp"); - ss << ";\n\t"; - needBody = false; - return nCurWindowSize; - } + std::stringstream& ss, int nResultSize, bool& needBody ); - virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ) - { - assert(Base::mpClmem == nullptr); - - openclwrapper::KernelEnv kEnv; - openclwrapper::setKernelEnv(&kEnv); - cl_int err; - size_t nInput = mpDVR->GetArrayLength(); - size_t nCurWindowSize = mpDVR->GetRefRowSize(); - // create clmem buffer - if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr) - throw Unhandled(__FILE__, __LINE__); - double* pHostBuffer = const_cast( - mpDVR->GetArrays()[Base::mnIndex].mpNumericArray); - size_t szHostBuffer = nInput * sizeof(double); - Base::mpClmem = clCreateBuffer(kEnv.mpkContext, - cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR, - szHostBuffer, - pHostBuffer, &err); - SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer); - - mpClmem2 = clCreateBuffer(kEnv.mpkContext, - CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, - sizeof(double) * w, nullptr, nullptr); - if (CL_SUCCESS != err) - throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); - SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w)); - - // reproduce the reduction function name - std::string kernelName; - if (!dynamic_cast(mpCodeGen.get())) - kernelName = Base::GetName() + "_reduction"; - else - kernelName = Base::GetName() + "_sum_reduction"; - cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); - if (err != CL_SUCCESS) - throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); - SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); - - // set kernel arg of reduction kernel - // TODO(Wei Wei): use unique name for kernel - cl_mem buf = Base::GetCLBuffer(); - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); - err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), - static_cast(&buf)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); - err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); - err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast(&nInput)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); - err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast(&nCurWindowSize)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - // set work group size and execute - size_t global_work_size[] = { 256, static_cast(w) }; - size_t const local_work_size[] = { 256, 1 }; - SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); - err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, - global_work_size, local_work_size, 0, nullptr, nullptr); - if (CL_SUCCESS != err) - throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); - err = clFinish(kEnv.mpkCmdQueue); - if (CL_SUCCESS != err) - throw OpenCLError("clFinish", err, __FILE__, __LINE__); - if (dynamic_cast(mpCodeGen.get())) - { - /*average need more reduction kernel for count computing*/ - std::unique_ptr pAllBuffer(new double[2 * w]); - double* resbuf = static_cast(clEnqueueMapBuffer(kEnv.mpkCmdQueue, - mpClmem2, - CL_TRUE, CL_MAP_READ, 0, - sizeof(double) * w, 0, nullptr, nullptr, - &err)); - if (err != CL_SUCCESS) - throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); - - for (int i = 0; i < w; i++) - pAllBuffer[i] = resbuf[i]; - err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); - if (err != CL_SUCCESS) - throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__); - - kernelName = Base::GetName() + "_count_reduction"; - redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); - if (err != CL_SUCCESS) - throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); - SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); - - // set kernel arg of reduction kernel - buf = Base::GetCLBuffer(); - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); - err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), - static_cast(&buf)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); - err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); - err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast(&nInput)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); - err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast(&nCurWindowSize)); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - - // set work group size and execute - size_t global_work_size1[] = { 256, static_cast(w) }; - size_t const local_work_size1[] = { 256, 1 }; - SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); - err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, - global_work_size1, local_work_size1, 0, nullptr, nullptr); - if (CL_SUCCESS != err) - throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); - err = clFinish(kEnv.mpkCmdQueue); - if (CL_SUCCESS != err) - throw OpenCLError("clFinish", err, __FILE__, __LINE__); - resbuf = static_cast(clEnqueueMapBuffer(kEnv.mpkCmdQueue, - mpClmem2, - CL_TRUE, CL_MAP_READ, 0, - sizeof(double) * w, 0, nullptr, nullptr, - &err)); - if (err != CL_SUCCESS) - throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); - for (int i = 0; i < w; i++) - pAllBuffer[i + w] = resbuf[i]; - err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); - // FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails? - if (CL_SUCCESS != err) - SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err)); - if (mpClmem2) - { - err = clReleaseMemObject(mpClmem2); - SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); - mpClmem2 = nullptr; - } - mpClmem2 = clCreateBuffer(kEnv.mpkContext, - cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR, - w * sizeof(double) * 2, pAllBuffer.get(), &err); - if (CL_SUCCESS != err) - throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); - SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get()); - } - // set kernel arg - SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2); - err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2); - if (CL_SUCCESS != err) - throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); - return 1; - } + virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ); ~ParallelReductionVectorRef() { @@ -2326,6 +1960,378 @@ }; } +template +void ParallelReductionVectorRef::GenSlidingWindowFunction( std::stringstream& ss ) +{ + if (!dynamic_cast(mpCodeGen.get())) + { + std::string name = Base::GetName(); + ss << "__kernel void " << name; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; lGen2( + "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; + ss << " tmp = legalize(" << mpCodeGen->Gen2( + "A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n"; + ss << " } else if ((loopOffset + lidx + offset) < end)\n"; + ss << " tmp = legalize(" << mpCodeGen->Gen2( + "A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; + ss << " shm_buf[lidx] = tmp;\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " for (int i = 128; i >0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + // Special case count + if (dynamic_cast(mpCodeGen.get())) + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + else + ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + if (dynamic_cast(mpCodeGen.get())) + ss << "current_result + shm_buf[0]"; + else + ss << mpCodeGen->Gen2("current_result", "shm_buf[0]"); + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } + else + { + std::string name = Base::GetName(); + /*sum reduction*/ + ss << "__kernel void " << name << "_sum"; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + ss << "current_result + shm_buf[0]"; + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + /*count reduction*/ + ss << "__kernel void " << name << "_count"; + ss << "_reduction(__global double* A, " + "__global double *result,int arrayLength,int windowSize){\n"; + ss << " double tmp, current_result =" << + mpCodeGen->GetBottom(); + ss << ";\n"; + ss << " int writePos = get_group_id(1);\n"; + ss << " int lidx = get_local_id(0);\n"; + ss << " __local double shm_buf[256];\n"; + if (mpDVR->IsStartFixed()) + ss << " int offset = 0;\n"; + else // if (!mpDVR->IsStartFixed()) + ss << " int offset = get_group_id(1);\n"; + if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = offset + windowSize;\n"; + else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) + ss << " int end = windowSize + get_group_id(1);\n"; + else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) + ss << " int end = windowSize;\n"; + ss << " end = min(end, arrayLength);\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " int loop = arrayLength/512 + 1;\n"; + ss << " for (int l=0; l0; i/=2) {\n"; + ss << " if (lidx < i)\n"; + ss << " shm_buf[lidx] = "; + ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " current_result ="; + ss << "current_result + shm_buf[0];"; + ss << ";\n"; + ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; + ss << " }\n"; + ss << " if (lidx == 0)\n"; + ss << " result[writePos] = current_result;\n"; + ss << "}\n"; + } +} + +template +size_t ParallelReductionVectorRef::GenReductionLoopHeader( + std::stringstream& ss, int nResultSize, bool& needBody ) +{ + assert(mpDVR); + size_t nCurWindowSize = mpDVR->GetRefRowSize(); + std::string temp = Base::GetName() + "[gid0]"; + ss << "tmp = "; + // Special case count + if (dynamic_cast(mpCodeGen.get())) + { + ss << mpCodeGen->Gen2(temp, "tmp") << ";\n"; + ss << "nCount = nCount-1;\n"; + ss << "nCount = nCount +"; /*re-assign nCount from count reduction*/ + ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n"; + } + else if (dynamic_cast(mpCodeGen.get())) + ss << temp << "+ tmp"; + else + ss << mpCodeGen->Gen2(temp, "tmp"); + ss << ";\n\t"; + needBody = false; + return nCurWindowSize; +} + +template +size_t ParallelReductionVectorRef::Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ) +{ + assert(Base::mpClmem == nullptr); + + openclwrapper::KernelEnv kEnv; + openclwrapper::setKernelEnv(&kEnv); + cl_int err; + size_t nInput = mpDVR->GetArrayLength(); + size_t nCurWindowSize = mpDVR->GetRefRowSize(); + // create clmem buffer + if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr) + throw Unhandled(__FILE__, __LINE__); + double* pHostBuffer = const_cast( + mpDVR->GetArrays()[Base::mnIndex].mpNumericArray); + size_t szHostBuffer = nInput * sizeof(double); + Base::mpClmem = clCreateBuffer(kEnv.mpkContext, + cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR, + szHostBuffer, + pHostBuffer, &err); + SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer); + + mpClmem2 = clCreateBuffer(kEnv.mpkContext, + CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, + sizeof(double) * w, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w)); + + // reproduce the reduction function name + std::string kernelName; + if (!dynamic_cast(mpCodeGen.get())) + kernelName = Base::GetName() + "_reduction"; + else + kernelName = Base::GetName() + "_sum_reduction"; + cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); + + // set kernel arg of reduction kernel + // TODO(Wei Wei): use unique name for kernel + cl_mem buf = Base::GetCLBuffer(); + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); + err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), + static_cast(&buf)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); + err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast(&nInput)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); + err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast(&nCurWindowSize)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + // set work group size and execute + size_t global_work_size[] = { 256, static_cast(w) }; + size_t const local_work_size[] = { 256, 1 }; + SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, + global_work_size, local_work_size, 0, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError("clFinish", err, __FILE__, __LINE__); + if (dynamic_cast(mpCodeGen.get())) + { + /*average need more reduction kernel for count computing*/ + std::unique_ptr pAllBuffer(new double[2 * w]); + double* resbuf = static_cast(clEnqueueMapBuffer(kEnv.mpkCmdQueue, + mpClmem2, + CL_TRUE, CL_MAP_READ, 0, + sizeof(double) * w, 0, nullptr, nullptr, + &err)); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); + + for (int i = 0; i < w; i++) + pAllBuffer[i] = resbuf[i]; + err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__); + + kernelName = Base::GetName() + "_count_reduction"; + redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); + if (err != CL_SUCCESS) + throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); + + // set kernel arg of reduction kernel + buf = Base::GetCLBuffer(); + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); + err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), + static_cast(&buf)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); + err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast(&nInput)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); + err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast(&nCurWindowSize)); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + + // set work group size and execute + size_t global_work_size1[] = { 256, static_cast(w) }; + size_t const local_work_size1[] = { 256, 1 }; + SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); + err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, + global_work_size1, local_work_size1, 0, nullptr, nullptr); + if (CL_SUCCESS != err) + throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); + err = clFinish(kEnv.mpkCmdQueue); + if (CL_SUCCESS != err) + throw OpenCLError("clFinish", err, __FILE__, __LINE__); + resbuf = static_cast(clEnqueueMapBuffer(kEnv.mpkCmdQueue, + mpClmem2, + CL_TRUE, CL_MAP_READ, 0, + sizeof(double) * w, 0, nullptr, nullptr, + &err)); + if (err != CL_SUCCESS) + throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); + for (int i = 0; i < w; i++) + pAllBuffer[i + w] = resbuf[i]; + err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); + // FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails? + if (CL_SUCCESS != err) + SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err)); + if (mpClmem2) + { + err = clReleaseMemObject(mpClmem2); + SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); + mpClmem2 = nullptr; + } + mpClmem2 = clCreateBuffer(kEnv.mpkContext, + cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR, + w * sizeof(double) * 2, pAllBuffer.get(), &err); + if (CL_SUCCESS != err) + throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); + SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get()); + } + // set kernel arg + SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2); + err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2); + if (CL_SUCCESS != err) + throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); + return 1; +} + /// Helper functions that have multiple buffers class DynamicKernelSoPArguments : public DynamicKernelArgument {