Lines 1026-1034
Link Here
|
1026 |
/// Handling a Double Vector that is used as a sliding window input |
1026 |
/// Handling a Double Vector that is used as a sliding window input |
1027 |
/// to either a sliding window average or sum-of-products |
1027 |
/// to either a sliding window average or sum-of-products |
1028 |
/// Generate a sequential loop for reductions |
1028 |
/// Generate a sequential loop for reductions |
1029 |
class OpAverage; |
|
|
1030 |
class OpCount; |
1031 |
|
1032 |
template<class Base> |
1029 |
template<class Base> |
1033 |
class DynamicKernelSlidingArgument : public Base |
1030 |
class DynamicKernelSlidingArgument : public Base |
1034 |
{ |
1031 |
{ |
Lines 1335-1519
Link Here
|
1335 |
} |
1332 |
} |
1336 |
|
1333 |
|
1337 |
/// Emit the definition for the auxiliary reduction kernel |
1334 |
/// Emit the definition for the auxiliary reduction kernel |
1338 |
virtual void GenSlidingWindowFunction( std::stringstream& ss ) |
1335 |
virtual void GenSlidingWindowFunction( std::stringstream& ss ); |
1339 |
{ |
|
|
1340 |
if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) |
1341 |
{ |
1342 |
std::string name = Base::GetName(); |
1343 |
ss << "__kernel void " << name; |
1344 |
ss << "_reduction(__global double* A, " |
1345 |
"__global double *result,int arrayLength,int windowSize){\n"; |
1346 |
ss << " double tmp, current_result =" << |
1347 |
mpCodeGen->GetBottom(); |
1348 |
ss << ";\n"; |
1349 |
ss << " int writePos = get_group_id(1);\n"; |
1350 |
ss << " int lidx = get_local_id(0);\n"; |
1351 |
ss << " __local double shm_buf[256];\n"; |
1352 |
if (mpDVR->IsStartFixed()) |
1353 |
ss << " int offset = 0;\n"; |
1354 |
else // if (!mpDVR->IsStartFixed()) |
1355 |
ss << " int offset = get_group_id(1);\n"; |
1356 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1357 |
ss << " int end = windowSize;\n"; |
1358 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1359 |
ss << " int end = offset + windowSize;\n"; |
1360 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1361 |
ss << " int end = windowSize + get_group_id(1);\n"; |
1362 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1363 |
ss << " int end = windowSize;\n"; |
1364 |
ss << " end = min(end, arrayLength);\n"; |
1365 |
|
1366 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1367 |
ss << " int loop = arrayLength/512 + 1;\n"; |
1368 |
ss << " for (int l=0; l<loop; l++){\n"; |
1369 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
1370 |
ss << " int loopOffset = l*512;\n"; |
1371 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
1372 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
1373 |
"A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; |
1374 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
1375 |
"A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n"; |
1376 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
1377 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
1378 |
"A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; |
1379 |
ss << " shm_buf[lidx] = tmp;\n"; |
1380 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1381 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
1382 |
ss << " if (lidx < i)\n"; |
1383 |
ss << " shm_buf[lidx] = "; |
1384 |
// Special case count |
1385 |
if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
1386 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
1387 |
else |
1388 |
ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n"; |
1389 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1390 |
ss << " }\n"; |
1391 |
ss << " if (lidx == 0)\n"; |
1392 |
ss << " current_result ="; |
1393 |
if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
1394 |
ss << "current_result + shm_buf[0]"; |
1395 |
else |
1396 |
ss << mpCodeGen->Gen2("current_result", "shm_buf[0]"); |
1397 |
ss << ";\n"; |
1398 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1399 |
ss << " }\n"; |
1400 |
ss << " if (lidx == 0)\n"; |
1401 |
ss << " result[writePos] = current_result;\n"; |
1402 |
ss << "}\n"; |
1403 |
} |
1404 |
else |
1405 |
{ |
1406 |
std::string name = Base::GetName(); |
1407 |
/*sum reduction*/ |
1408 |
ss << "__kernel void " << name << "_sum"; |
1409 |
ss << "_reduction(__global double* A, " |
1410 |
"__global double *result,int arrayLength,int windowSize){\n"; |
1411 |
ss << " double tmp, current_result =" << |
1412 |
mpCodeGen->GetBottom(); |
1413 |
ss << ";\n"; |
1414 |
ss << " int writePos = get_group_id(1);\n"; |
1415 |
ss << " int lidx = get_local_id(0);\n"; |
1416 |
ss << " __local double shm_buf[256];\n"; |
1417 |
if (mpDVR->IsStartFixed()) |
1418 |
ss << " int offset = 0;\n"; |
1419 |
else // if (!mpDVR->IsStartFixed()) |
1420 |
ss << " int offset = get_group_id(1);\n"; |
1421 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1422 |
ss << " int end = windowSize;\n"; |
1423 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1424 |
ss << " int end = offset + windowSize;\n"; |
1425 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1426 |
ss << " int end = windowSize + get_group_id(1);\n"; |
1427 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1428 |
ss << " int end = windowSize;\n"; |
1429 |
ss << " end = min(end, arrayLength);\n"; |
1430 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1431 |
ss << " int loop = arrayLength/512 + 1;\n"; |
1432 |
ss << " for (int l=0; l<loop; l++){\n"; |
1433 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
1434 |
ss << " int loopOffset = l*512;\n"; |
1435 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
1436 |
ss << " tmp = legalize("; |
1437 |
ss << "(A[loopOffset + lidx + offset]+ tmp)"; |
1438 |
ss << ", tmp);\n"; |
1439 |
ss << " tmp = legalize((A[loopOffset + lidx + offset + 256]+ tmp)"; |
1440 |
ss << ", tmp);\n"; |
1441 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
1442 |
ss << " tmp = legalize((A[loopOffset + lidx + offset] + tmp)"; |
1443 |
ss << ", tmp);\n"; |
1444 |
ss << " shm_buf[lidx] = tmp;\n"; |
1445 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1446 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
1447 |
ss << " if (lidx < i)\n"; |
1448 |
ss << " shm_buf[lidx] = "; |
1449 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
1450 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1451 |
ss << " }\n"; |
1452 |
ss << " if (lidx == 0)\n"; |
1453 |
ss << " current_result ="; |
1454 |
ss << "current_result + shm_buf[0]"; |
1455 |
ss << ";\n"; |
1456 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1457 |
ss << " }\n"; |
1458 |
ss << " if (lidx == 0)\n"; |
1459 |
ss << " result[writePos] = current_result;\n"; |
1460 |
ss << "}\n"; |
1461 |
/*count reduction*/ |
1462 |
ss << "__kernel void " << name << "_count"; |
1463 |
ss << "_reduction(__global double* A, " |
1464 |
"__global double *result,int arrayLength,int windowSize){\n"; |
1465 |
ss << " double tmp, current_result =" << |
1466 |
mpCodeGen->GetBottom(); |
1467 |
ss << ";\n"; |
1468 |
ss << " int writePos = get_group_id(1);\n"; |
1469 |
ss << " int lidx = get_local_id(0);\n"; |
1470 |
ss << " __local double shm_buf[256];\n"; |
1471 |
if (mpDVR->IsStartFixed()) |
1472 |
ss << " int offset = 0;\n"; |
1473 |
else // if (!mpDVR->IsStartFixed()) |
1474 |
ss << " int offset = get_group_id(1);\n"; |
1475 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1476 |
ss << " int end = windowSize;\n"; |
1477 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1478 |
ss << " int end = offset + windowSize;\n"; |
1479 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1480 |
ss << " int end = windowSize + get_group_id(1);\n"; |
1481 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1482 |
ss << " int end = windowSize;\n"; |
1483 |
ss << " end = min(end, arrayLength);\n"; |
1484 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1485 |
ss << " int loop = arrayLength/512 + 1;\n"; |
1486 |
ss << " for (int l=0; l<loop; l++){\n"; |
1487 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
1488 |
ss << " int loopOffset = l*512;\n"; |
1489 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
1490 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; |
1491 |
ss << ", tmp);\n"; |
1492 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset+256])?tmp:tmp+1.0)"; |
1493 |
ss << ", tmp);\n"; |
1494 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
1495 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; |
1496 |
ss << ", tmp);\n"; |
1497 |
ss << " shm_buf[lidx] = tmp;\n"; |
1498 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1499 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
1500 |
ss << " if (lidx < i)\n"; |
1501 |
ss << " shm_buf[lidx] = "; |
1502 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
1503 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1504 |
ss << " }\n"; |
1505 |
ss << " if (lidx == 0)\n"; |
1506 |
ss << " current_result ="; |
1507 |
ss << "current_result + shm_buf[0];"; |
1508 |
ss << ";\n"; |
1509 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1510 |
ss << " }\n"; |
1511 |
ss << " if (lidx == 0)\n"; |
1512 |
ss << " result[writePos] = current_result;\n"; |
1513 |
ss << "}\n"; |
1514 |
} |
1515 |
|
1516 |
} |
1517 |
|
1336 |
|
1518 |
virtual std::string GenSlidingWindowDeclRef( bool ) const |
1337 |
virtual std::string GenSlidingWindowDeclRef( bool ) const |
1519 |
{ |
1338 |
{ |
Lines 1527-1720
Link Here
|
1527 |
|
1346 |
|
1528 |
/// Controls how the elements in the DoubleVectorRef are traversed |
1347 |
/// Controls how the elements in the DoubleVectorRef are traversed |
1529 |
size_t GenReductionLoopHeader( |
1348 |
size_t GenReductionLoopHeader( |
1530 |
std::stringstream& ss, int nResultSize, bool& needBody ) |
1349 |
std::stringstream& ss, int nResultSize, bool& needBody ); |
1531 |
{ |
|
|
1532 |
assert(mpDVR); |
1533 |
size_t nCurWindowSize = mpDVR->GetRefRowSize(); |
1534 |
std::string temp = Base::GetName() + "[gid0]"; |
1535 |
ss << "tmp = "; |
1536 |
// Special case count |
1537 |
if (dynamic_cast<OpAverage*>(mpCodeGen.get())) |
1538 |
{ |
1539 |
ss << mpCodeGen->Gen2(temp, "tmp") << ";\n"; |
1540 |
ss << "nCount = nCount-1;\n"; |
1541 |
ss << "nCount = nCount +"; /*re-assign nCount from count reduction*/ |
1542 |
ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n"; |
1543 |
} |
1544 |
else if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
1545 |
ss << temp << "+ tmp"; |
1546 |
else |
1547 |
ss << mpCodeGen->Gen2(temp, "tmp"); |
1548 |
ss << ";\n\t"; |
1549 |
needBody = false; |
1550 |
return nCurWindowSize; |
1551 |
} |
1552 |
|
1350 |
|
1553 |
virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ) |
1351 |
virtual size_t Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ); |
1554 |
{ |
|
|
1555 |
assert(Base::mpClmem == nullptr); |
1556 |
|
1557 |
openclwrapper::KernelEnv kEnv; |
1558 |
openclwrapper::setKernelEnv(&kEnv); |
1559 |
cl_int err; |
1560 |
size_t nInput = mpDVR->GetArrayLength(); |
1561 |
size_t nCurWindowSize = mpDVR->GetRefRowSize(); |
1562 |
// create clmem buffer |
1563 |
if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr) |
1564 |
throw Unhandled(__FILE__, __LINE__); |
1565 |
double* pHostBuffer = const_cast<double*>( |
1566 |
mpDVR->GetArrays()[Base::mnIndex].mpNumericArray); |
1567 |
size_t szHostBuffer = nInput * sizeof(double); |
1568 |
Base::mpClmem = clCreateBuffer(kEnv.mpkContext, |
1569 |
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR, |
1570 |
szHostBuffer, |
1571 |
pHostBuffer, &err); |
1572 |
SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer); |
1573 |
|
1574 |
mpClmem2 = clCreateBuffer(kEnv.mpkContext, |
1575 |
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, |
1576 |
sizeof(double) * w, nullptr, nullptr); |
1577 |
if (CL_SUCCESS != err) |
1578 |
throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); |
1579 |
SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w)); |
1580 |
|
1581 |
// reproduce the reduction function name |
1582 |
std::string kernelName; |
1583 |
if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) |
1584 |
kernelName = Base::GetName() + "_reduction"; |
1585 |
else |
1586 |
kernelName = Base::GetName() + "_sum_reduction"; |
1587 |
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); |
1588 |
if (err != CL_SUCCESS) |
1589 |
throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); |
1590 |
SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); |
1591 |
|
1592 |
// set kernel arg of reduction kernel |
1593 |
// TODO(Wei Wei): use unique name for kernel |
1594 |
cl_mem buf = Base::GetCLBuffer(); |
1595 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); |
1596 |
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), |
1597 |
static_cast<void*>(&buf)); |
1598 |
if (CL_SUCCESS != err) |
1599 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1600 |
|
1601 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); |
1602 |
err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); |
1603 |
if (CL_SUCCESS != err) |
1604 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1605 |
|
1606 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); |
1607 |
err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); |
1608 |
if (CL_SUCCESS != err) |
1609 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1610 |
|
1611 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); |
1612 |
err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); |
1613 |
if (CL_SUCCESS != err) |
1614 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1615 |
|
1616 |
// set work group size and execute |
1617 |
size_t global_work_size[] = { 256, static_cast<size_t>(w) }; |
1618 |
size_t const local_work_size[] = { 256, 1 }; |
1619 |
SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); |
1620 |
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, |
1621 |
global_work_size, local_work_size, 0, nullptr, nullptr); |
1622 |
if (CL_SUCCESS != err) |
1623 |
throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); |
1624 |
err = clFinish(kEnv.mpkCmdQueue); |
1625 |
if (CL_SUCCESS != err) |
1626 |
throw OpenCLError("clFinish", err, __FILE__, __LINE__); |
1627 |
if (dynamic_cast<OpAverage*>(mpCodeGen.get())) |
1628 |
{ |
1629 |
/*average need more reduction kernel for count computing*/ |
1630 |
std::unique_ptr<double[]> pAllBuffer(new double[2 * w]); |
1631 |
double* resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, |
1632 |
mpClmem2, |
1633 |
CL_TRUE, CL_MAP_READ, 0, |
1634 |
sizeof(double) * w, 0, nullptr, nullptr, |
1635 |
&err)); |
1636 |
if (err != CL_SUCCESS) |
1637 |
throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); |
1638 |
|
1639 |
for (int i = 0; i < w; i++) |
1640 |
pAllBuffer[i] = resbuf[i]; |
1641 |
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); |
1642 |
if (err != CL_SUCCESS) |
1643 |
throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__); |
1644 |
|
1645 |
kernelName = Base::GetName() + "_count_reduction"; |
1646 |
redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); |
1647 |
if (err != CL_SUCCESS) |
1648 |
throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); |
1649 |
SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); |
1650 |
|
1651 |
// set kernel arg of reduction kernel |
1652 |
buf = Base::GetCLBuffer(); |
1653 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); |
1654 |
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), |
1655 |
static_cast<void*>(&buf)); |
1656 |
if (CL_SUCCESS != err) |
1657 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1658 |
|
1659 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); |
1660 |
err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); |
1661 |
if (CL_SUCCESS != err) |
1662 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1663 |
|
1664 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); |
1665 |
err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); |
1666 |
if (CL_SUCCESS != err) |
1667 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1668 |
|
1669 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); |
1670 |
err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); |
1671 |
if (CL_SUCCESS != err) |
1672 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1673 |
|
1674 |
// set work group size and execute |
1675 |
size_t global_work_size1[] = { 256, static_cast<size_t>(w) }; |
1676 |
size_t const local_work_size1[] = { 256, 1 }; |
1677 |
SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); |
1678 |
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, |
1679 |
global_work_size1, local_work_size1, 0, nullptr, nullptr); |
1680 |
if (CL_SUCCESS != err) |
1681 |
throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); |
1682 |
err = clFinish(kEnv.mpkCmdQueue); |
1683 |
if (CL_SUCCESS != err) |
1684 |
throw OpenCLError("clFinish", err, __FILE__, __LINE__); |
1685 |
resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, |
1686 |
mpClmem2, |
1687 |
CL_TRUE, CL_MAP_READ, 0, |
1688 |
sizeof(double) * w, 0, nullptr, nullptr, |
1689 |
&err)); |
1690 |
if (err != CL_SUCCESS) |
1691 |
throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); |
1692 |
for (int i = 0; i < w; i++) |
1693 |
pAllBuffer[i + w] = resbuf[i]; |
1694 |
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); |
1695 |
// FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails? |
1696 |
if (CL_SUCCESS != err) |
1697 |
SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err)); |
1698 |
if (mpClmem2) |
1699 |
{ |
1700 |
err = clReleaseMemObject(mpClmem2); |
1701 |
SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); |
1702 |
mpClmem2 = nullptr; |
1703 |
} |
1704 |
mpClmem2 = clCreateBuffer(kEnv.mpkContext, |
1705 |
cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR, |
1706 |
w * sizeof(double) * 2, pAllBuffer.get(), &err); |
1707 |
if (CL_SUCCESS != err) |
1708 |
throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); |
1709 |
SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get()); |
1710 |
} |
1711 |
// set kernel arg |
1712 |
SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2); |
1713 |
err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2); |
1714 |
if (CL_SUCCESS != err) |
1715 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
1716 |
return 1; |
1717 |
} |
1718 |
|
1352 |
|
1719 |
~ParallelReductionVectorRef() |
1353 |
~ParallelReductionVectorRef() |
1720 |
{ |
1354 |
{ |
Lines 2326-2331
Link Here
|
2326 |
}; |
1960 |
}; |
2327 |
} |
1961 |
} |
2328 |
|
1962 |
|
|
|
1963 |
template<class Base> |
1964 |
void ParallelReductionVectorRef<Base>::GenSlidingWindowFunction( std::stringstream& ss ) |
1965 |
{ |
1966 |
if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) |
1967 |
{ |
1968 |
std::string name = Base::GetName(); |
1969 |
ss << "__kernel void " << name; |
1970 |
ss << "_reduction(__global double* A, " |
1971 |
"__global double *result,int arrayLength,int windowSize){\n"; |
1972 |
ss << " double tmp, current_result =" << |
1973 |
mpCodeGen->GetBottom(); |
1974 |
ss << ";\n"; |
1975 |
ss << " int writePos = get_group_id(1);\n"; |
1976 |
ss << " int lidx = get_local_id(0);\n"; |
1977 |
ss << " __local double shm_buf[256];\n"; |
1978 |
if (mpDVR->IsStartFixed()) |
1979 |
ss << " int offset = 0;\n"; |
1980 |
else // if (!mpDVR->IsStartFixed()) |
1981 |
ss << " int offset = get_group_id(1);\n"; |
1982 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1983 |
ss << " int end = windowSize;\n"; |
1984 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1985 |
ss << " int end = offset + windowSize;\n"; |
1986 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
1987 |
ss << " int end = windowSize + get_group_id(1);\n"; |
1988 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
1989 |
ss << " int end = windowSize;\n"; |
1990 |
ss << " end = min(end, arrayLength);\n"; |
1991 |
|
1992 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
1993 |
ss << " int loop = arrayLength/512 + 1;\n"; |
1994 |
ss << " for (int l=0; l<loop; l++){\n"; |
1995 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
1996 |
ss << " int loopOffset = l*512;\n"; |
1997 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
1998 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
1999 |
"A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; |
2000 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
2001 |
"A[loopOffset + lidx + offset + 256]", "tmp") << ", tmp);\n"; |
2002 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
2003 |
ss << " tmp = legalize(" << mpCodeGen->Gen2( |
2004 |
"A[loopOffset + lidx + offset]", "tmp") << ", tmp);\n"; |
2005 |
ss << " shm_buf[lidx] = tmp;\n"; |
2006 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2007 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
2008 |
ss << " if (lidx < i)\n"; |
2009 |
ss << " shm_buf[lidx] = "; |
2010 |
// Special case count |
2011 |
if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
2012 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
2013 |
else |
2014 |
ss << mpCodeGen->Gen2("shm_buf[lidx]", "shm_buf[lidx + i]") << ";\n"; |
2015 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2016 |
ss << " }\n"; |
2017 |
ss << " if (lidx == 0)\n"; |
2018 |
ss << " current_result ="; |
2019 |
if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
2020 |
ss << "current_result + shm_buf[0]"; |
2021 |
else |
2022 |
ss << mpCodeGen->Gen2("current_result", "shm_buf[0]"); |
2023 |
ss << ";\n"; |
2024 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2025 |
ss << " }\n"; |
2026 |
ss << " if (lidx == 0)\n"; |
2027 |
ss << " result[writePos] = current_result;\n"; |
2028 |
ss << "}\n"; |
2029 |
} |
2030 |
else |
2031 |
{ |
2032 |
std::string name = Base::GetName(); |
2033 |
/*sum reduction*/ |
2034 |
ss << "__kernel void " << name << "_sum"; |
2035 |
ss << "_reduction(__global double* A, " |
2036 |
"__global double *result,int arrayLength,int windowSize){\n"; |
2037 |
ss << " double tmp, current_result =" << |
2038 |
mpCodeGen->GetBottom(); |
2039 |
ss << ";\n"; |
2040 |
ss << " int writePos = get_group_id(1);\n"; |
2041 |
ss << " int lidx = get_local_id(0);\n"; |
2042 |
ss << " __local double shm_buf[256];\n"; |
2043 |
if (mpDVR->IsStartFixed()) |
2044 |
ss << " int offset = 0;\n"; |
2045 |
else // if (!mpDVR->IsStartFixed()) |
2046 |
ss << " int offset = get_group_id(1);\n"; |
2047 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
2048 |
ss << " int end = windowSize;\n"; |
2049 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
2050 |
ss << " int end = offset + windowSize;\n"; |
2051 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
2052 |
ss << " int end = windowSize + get_group_id(1);\n"; |
2053 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
2054 |
ss << " int end = windowSize;\n"; |
2055 |
ss << " end = min(end, arrayLength);\n"; |
2056 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2057 |
ss << " int loop = arrayLength/512 + 1;\n"; |
2058 |
ss << " for (int l=0; l<loop; l++){\n"; |
2059 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
2060 |
ss << " int loopOffset = l*512;\n"; |
2061 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
2062 |
ss << " tmp = legalize("; |
2063 |
ss << "(A[loopOffset + lidx + offset]+ tmp)"; |
2064 |
ss << ", tmp);\n"; |
2065 |
ss << " tmp = legalize((A[loopOffset + lidx + offset + 256]+ tmp)"; |
2066 |
ss << ", tmp);\n"; |
2067 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
2068 |
ss << " tmp = legalize((A[loopOffset + lidx + offset] + tmp)"; |
2069 |
ss << ", tmp);\n"; |
2070 |
ss << " shm_buf[lidx] = tmp;\n"; |
2071 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2072 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
2073 |
ss << " if (lidx < i)\n"; |
2074 |
ss << " shm_buf[lidx] = "; |
2075 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
2076 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2077 |
ss << " }\n"; |
2078 |
ss << " if (lidx == 0)\n"; |
2079 |
ss << " current_result ="; |
2080 |
ss << "current_result + shm_buf[0]"; |
2081 |
ss << ";\n"; |
2082 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2083 |
ss << " }\n"; |
2084 |
ss << " if (lidx == 0)\n"; |
2085 |
ss << " result[writePos] = current_result;\n"; |
2086 |
ss << "}\n"; |
2087 |
/*count reduction*/ |
2088 |
ss << "__kernel void " << name << "_count"; |
2089 |
ss << "_reduction(__global double* A, " |
2090 |
"__global double *result,int arrayLength,int windowSize){\n"; |
2091 |
ss << " double tmp, current_result =" << |
2092 |
mpCodeGen->GetBottom(); |
2093 |
ss << ";\n"; |
2094 |
ss << " int writePos = get_group_id(1);\n"; |
2095 |
ss << " int lidx = get_local_id(0);\n"; |
2096 |
ss << " __local double shm_buf[256];\n"; |
2097 |
if (mpDVR->IsStartFixed()) |
2098 |
ss << " int offset = 0;\n"; |
2099 |
else // if (!mpDVR->IsStartFixed()) |
2100 |
ss << " int offset = get_group_id(1);\n"; |
2101 |
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
2102 |
ss << " int end = windowSize;\n"; |
2103 |
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
2104 |
ss << " int end = offset + windowSize;\n"; |
2105 |
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed()) |
2106 |
ss << " int end = windowSize + get_group_id(1);\n"; |
2107 |
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed()) |
2108 |
ss << " int end = windowSize;\n"; |
2109 |
ss << " end = min(end, arrayLength);\n"; |
2110 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2111 |
ss << " int loop = arrayLength/512 + 1;\n"; |
2112 |
ss << " for (int l=0; l<loop; l++){\n"; |
2113 |
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n"; |
2114 |
ss << " int loopOffset = l*512;\n"; |
2115 |
ss << " if((loopOffset + lidx + offset + 256) < end) {\n"; |
2116 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; |
2117 |
ss << ", tmp);\n"; |
2118 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset+256])?tmp:tmp+1.0)"; |
2119 |
ss << ", tmp);\n"; |
2120 |
ss << " } else if ((loopOffset + lidx + offset) < end)\n"; |
2121 |
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)"; |
2122 |
ss << ", tmp);\n"; |
2123 |
ss << " shm_buf[lidx] = tmp;\n"; |
2124 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2125 |
ss << " for (int i = 128; i >0; i/=2) {\n"; |
2126 |
ss << " if (lidx < i)\n"; |
2127 |
ss << " shm_buf[lidx] = "; |
2128 |
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n"; |
2129 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2130 |
ss << " }\n"; |
2131 |
ss << " if (lidx == 0)\n"; |
2132 |
ss << " current_result ="; |
2133 |
ss << "current_result + shm_buf[0];"; |
2134 |
ss << ";\n"; |
2135 |
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n"; |
2136 |
ss << " }\n"; |
2137 |
ss << " if (lidx == 0)\n"; |
2138 |
ss << " result[writePos] = current_result;\n"; |
2139 |
ss << "}\n"; |
2140 |
} |
2141 |
} |
2142 |
|
2143 |
template<class Base> |
2144 |
size_t ParallelReductionVectorRef<Base>::GenReductionLoopHeader( |
2145 |
std::stringstream& ss, int nResultSize, bool& needBody ) |
2146 |
{ |
2147 |
assert(mpDVR); |
2148 |
size_t nCurWindowSize = mpDVR->GetRefRowSize(); |
2149 |
std::string temp = Base::GetName() + "[gid0]"; |
2150 |
ss << "tmp = "; |
2151 |
// Special case count |
2152 |
if (dynamic_cast<OpAverage*>(mpCodeGen.get())) |
2153 |
{ |
2154 |
ss << mpCodeGen->Gen2(temp, "tmp") << ";\n"; |
2155 |
ss << "nCount = nCount-1;\n"; |
2156 |
ss << "nCount = nCount +"; /*re-assign nCount from count reduction*/ |
2157 |
ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n"; |
2158 |
} |
2159 |
else if (dynamic_cast<OpCount*>(mpCodeGen.get())) |
2160 |
ss << temp << "+ tmp"; |
2161 |
else |
2162 |
ss << mpCodeGen->Gen2(temp, "tmp"); |
2163 |
ss << ";\n\t"; |
2164 |
needBody = false; |
2165 |
return nCurWindowSize; |
2166 |
} |
2167 |
|
2168 |
template<class Base> |
2169 |
size_t ParallelReductionVectorRef<Base>::Marshal( cl_kernel k, int argno, int w, cl_program mpProgram ) |
2170 |
{ |
2171 |
assert(Base::mpClmem == nullptr); |
2172 |
|
2173 |
openclwrapper::KernelEnv kEnv; |
2174 |
openclwrapper::setKernelEnv(&kEnv); |
2175 |
cl_int err; |
2176 |
size_t nInput = mpDVR->GetArrayLength(); |
2177 |
size_t nCurWindowSize = mpDVR->GetRefRowSize(); |
2178 |
// create clmem buffer |
2179 |
if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr) |
2180 |
throw Unhandled(__FILE__, __LINE__); |
2181 |
double* pHostBuffer = const_cast<double*>( |
2182 |
mpDVR->GetArrays()[Base::mnIndex].mpNumericArray); |
2183 |
size_t szHostBuffer = nInput * sizeof(double); |
2184 |
Base::mpClmem = clCreateBuffer(kEnv.mpkContext, |
2185 |
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR, |
2186 |
szHostBuffer, |
2187 |
pHostBuffer, &err); |
2188 |
SAL_INFO("sc.opencl", "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof(double) << "=" << szHostBuffer << " using host buffer " << pHostBuffer); |
2189 |
|
2190 |
mpClmem2 = clCreateBuffer(kEnv.mpkContext, |
2191 |
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, |
2192 |
sizeof(double) * w, nullptr, nullptr); |
2193 |
if (CL_SUCCESS != err) |
2194 |
throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); |
2195 |
SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << sizeof(double) << "*" << w << "=" << (sizeof(double)*w)); |
2196 |
|
2197 |
// reproduce the reduction function name |
2198 |
std::string kernelName; |
2199 |
if (!dynamic_cast<OpAverage*>(mpCodeGen.get())) |
2200 |
kernelName = Base::GetName() + "_reduction"; |
2201 |
else |
2202 |
kernelName = Base::GetName() + "_sum_reduction"; |
2203 |
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); |
2204 |
if (err != CL_SUCCESS) |
2205 |
throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); |
2206 |
SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); |
2207 |
|
2208 |
// set kernel arg of reduction kernel |
2209 |
// TODO(Wei Wei): use unique name for kernel |
2210 |
cl_mem buf = Base::GetCLBuffer(); |
2211 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); |
2212 |
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), |
2213 |
static_cast<void*>(&buf)); |
2214 |
if (CL_SUCCESS != err) |
2215 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2216 |
|
2217 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); |
2218 |
err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); |
2219 |
if (CL_SUCCESS != err) |
2220 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2221 |
|
2222 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); |
2223 |
err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); |
2224 |
if (CL_SUCCESS != err) |
2225 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2226 |
|
2227 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); |
2228 |
err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); |
2229 |
if (CL_SUCCESS != err) |
2230 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2231 |
|
2232 |
// set work group size and execute |
2233 |
size_t global_work_size[] = { 256, static_cast<size_t>(w) }; |
2234 |
size_t const local_work_size[] = { 256, 1 }; |
2235 |
SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); |
2236 |
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, |
2237 |
global_work_size, local_work_size, 0, nullptr, nullptr); |
2238 |
if (CL_SUCCESS != err) |
2239 |
throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); |
2240 |
err = clFinish(kEnv.mpkCmdQueue); |
2241 |
if (CL_SUCCESS != err) |
2242 |
throw OpenCLError("clFinish", err, __FILE__, __LINE__); |
2243 |
if (dynamic_cast<OpAverage*>(mpCodeGen.get())) |
2244 |
{ |
2245 |
/*average need more reduction kernel for count computing*/ |
2246 |
std::unique_ptr<double[]> pAllBuffer(new double[2 * w]); |
2247 |
double* resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, |
2248 |
mpClmem2, |
2249 |
CL_TRUE, CL_MAP_READ, 0, |
2250 |
sizeof(double) * w, 0, nullptr, nullptr, |
2251 |
&err)); |
2252 |
if (err != CL_SUCCESS) |
2253 |
throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); |
2254 |
|
2255 |
for (int i = 0; i < w; i++) |
2256 |
pAllBuffer[i] = resbuf[i]; |
2257 |
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); |
2258 |
if (err != CL_SUCCESS) |
2259 |
throw OpenCLError("clEnqueueUnmapMemObject", err, __FILE__, __LINE__); |
2260 |
|
2261 |
kernelName = Base::GetName() + "_count_reduction"; |
2262 |
redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err); |
2263 |
if (err != CL_SUCCESS) |
2264 |
throw OpenCLError("clCreateKernel", err, __FILE__, __LINE__); |
2265 |
SAL_INFO("sc.opencl", "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram); |
2266 |
|
2267 |
// set kernel arg of reduction kernel |
2268 |
buf = Base::GetCLBuffer(); |
2269 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf); |
2270 |
err = clSetKernelArg(redKernel, 0, sizeof(cl_mem), |
2271 |
static_cast<void*>(&buf)); |
2272 |
if (CL_SUCCESS != err) |
2273 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2274 |
|
2275 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2); |
2276 |
err = clSetKernelArg(redKernel, 1, sizeof(cl_mem), &mpClmem2); |
2277 |
if (CL_SUCCESS != err) |
2278 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2279 |
|
2280 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput); |
2281 |
err = clSetKernelArg(redKernel, 2, sizeof(cl_int), static_cast<void*>(&nInput)); |
2282 |
if (CL_SUCCESS != err) |
2283 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2284 |
|
2285 |
SAL_INFO("sc.opencl", "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize); |
2286 |
err = clSetKernelArg(redKernel, 3, sizeof(cl_int), static_cast<void*>(&nCurWindowSize)); |
2287 |
if (CL_SUCCESS != err) |
2288 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2289 |
|
2290 |
// set work group size and execute |
2291 |
size_t global_work_size1[] = { 256, static_cast<size_t>(w) }; |
2292 |
size_t const local_work_size1[] = { 256, 1 }; |
2293 |
SAL_INFO("sc.opencl", "Enqueing kernel " << redKernel); |
2294 |
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr, |
2295 |
global_work_size1, local_work_size1, 0, nullptr, nullptr); |
2296 |
if (CL_SUCCESS != err) |
2297 |
throw OpenCLError("clEnqueueNDRangeKernel", err, __FILE__, __LINE__); |
2298 |
err = clFinish(kEnv.mpkCmdQueue); |
2299 |
if (CL_SUCCESS != err) |
2300 |
throw OpenCLError("clFinish", err, __FILE__, __LINE__); |
2301 |
resbuf = static_cast<double*>(clEnqueueMapBuffer(kEnv.mpkCmdQueue, |
2302 |
mpClmem2, |
2303 |
CL_TRUE, CL_MAP_READ, 0, |
2304 |
sizeof(double) * w, 0, nullptr, nullptr, |
2305 |
&err)); |
2306 |
if (err != CL_SUCCESS) |
2307 |
throw OpenCLError("clEnqueueMapBuffer", err, __FILE__, __LINE__); |
2308 |
for (int i = 0; i < w; i++) |
2309 |
pAllBuffer[i + w] = resbuf[i]; |
2310 |
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr); |
2311 |
// FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails? |
2312 |
if (CL_SUCCESS != err) |
2313 |
SAL_WARN("sc.opencl", "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err)); |
2314 |
if (mpClmem2) |
2315 |
{ |
2316 |
err = clReleaseMemObject(mpClmem2); |
2317 |
SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl", "clReleaseMemObject failed: " << openclwrapper::errorString(err)); |
2318 |
mpClmem2 = nullptr; |
2319 |
} |
2320 |
mpClmem2 = clCreateBuffer(kEnv.mpkContext, |
2321 |
cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR, |
2322 |
w * sizeof(double) * 2, pAllBuffer.get(), &err); |
2323 |
if (CL_SUCCESS != err) |
2324 |
throw OpenCLError("clCreateBuffer", err, __FILE__, __LINE__); |
2325 |
SAL_INFO("sc.opencl", "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof(double) << "=" << (w*sizeof(double)) << " copying host buffer " << pAllBuffer.get()); |
2326 |
} |
2327 |
// set kernel arg |
2328 |
SAL_INFO("sc.opencl", "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2); |
2329 |
err = clSetKernelArg(k, argno, sizeof(cl_mem), &mpClmem2); |
2330 |
if (CL_SUCCESS != err) |
2331 |
throw OpenCLError("clSetKernelArg", err, __FILE__, __LINE__); |
2332 |
return 1; |
2333 |
} |
2334 |
|
2329 |
/// Helper functions that have multiple buffers |
2335 |
/// Helper functions that have multiple buffers |
2330 |
class DynamicKernelSoPArguments : public DynamicKernelArgument |
2336 |
class DynamicKernelSoPArguments : public DynamicKernelArgument |
2331 |
{ |
2337 |
{ |