/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; fill-column: 100 -*- */
/*
* This file is part of the LibreOffice project.
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/
#include <formulagroup.hxx>
#include <formulagroupcl.hxx>
#include <document.hxx>
#include <formulacell.hxx>
#include <tokenarray.hxx>
#include <compiler.hxx>
#include <comphelper/random.hxx>
#include <scmatrix.hxx>
#include <sal/log.hxx>
#include <opencl/openclwrapper.hxx>
#include <opencl/OpenCLZone.hxx>
#include "op_financial.hxx"
#include "op_math.hxx"
#include "op_logical.hxx"
#include "op_statistical.hxx"
#include "op_array.hxx"
#include "op_spreadsheet.hxx"
#include "op_addin.hxx"
#include <limits>
#include <com/sun/star/sheet/FormulaLanguage.hpp>
const char *
const publicFunc =
"\n"
"#define IllegalArgument 502\n"
"#define IllegalFPOperation 503 // #NUM!\n"
"#define NoValue 519 // #VALUE!\n"
"#define NoConvergence 523\n"
"#define DivisionByZero 532 // #DIV/0!\n"
"#define NOTAVAILABLE 0x7fff // #N/A\n"
"\n"
"double CreateDoubleError(ulong nErr)\n"
"{\n"
// At least nVidia on Linux and Intel on Windows seem to ignore the argument to nan(),
// so using that would not propagate the type of error, work that around
// by directly constructing the proper IEEE double NaN value
// TODO: maybe use a better way to detect such systems?
" return as_double(0x7FF8000000000000+nErr);\n"
// " return nan(nErr);\n"
"}\n"
"\n"
"double fsum(double a, double b) { return isnan(a)?b:a+b; }\n"
"double legalize(double a, double b) { return isnan(a)?b:a;}\n"
;
#include <utility>
#include <vector>
#include <map>
#include <iostream>
#include <algorithm>
#include <rtl/digest.h>
#include <memory>
using namespace formula;
namespace sc::opencl {
namespace {
std::string linenumberify(
const std::string& s)
{
outputstream ss;
int linenumber = 1;
size_t start = 0;
size_t newline;
while ((newline = s.find(
'\n' , start)) != std::string::npos)
{
ss <<
"/*" << std::setw(4) << linenumber++ << "*/ " << s.substr(start, newline-start+1);
start = newline + 1;
}
if (start < s.size())
ss <<
"/*" << std::setw(4) << linenumber++ << "*/ " << s.substr(start, std::string::npos);
return ss.str();
}
bool AllStringsAreNull(
const rtl_uString*
const * pStringArray, size_t nLength)
{
if (pStringArray == nullptr)
return true ;
for (size_t i = 0; i < nLength; i++)
if (pStringArray[i] != nullptr)
return false ;
return true ;
}
OUString LimitedString( std::u16string_view str )
{
if ( str.size() < 20 )
return OUString::Concat(
"\" ") + str + " \
"" ;
else
return OUString::Concat(
"\" ") + str.substr( 0, 20 ) + " \
"..." ;
}
const int MAX_PEEK_ELEMENTS = 5;
// Returns formatted contents of the data (possibly shortened), to be used in debug output.
std::string DebugPeekData(
const FormulaToken* ref,
int doubleRefIndex = 0)
{
if (ref->GetType() == formula::svSingleVectorRef)
{
const formula::SingleVectorRefToken* pSVR =
static_cast <
const formula::SingleVectorRefToken*>(ref);
outputstream buf;
buf <<
"SingleRef {" ;
for ( size_t i = 0; i < std::min< size_t >( MAX_PEEK_ELEMENTS, pSVR->GetArrayLength()); ++i )
{
if ( i != 0 )
buf <<
"," ;
if ( pSVR->GetArray().mpStringArray != nullptr
&& pSVR->GetArray().mpStringArray[ i ] != nullptr )
{
buf << LimitedString( OUString( pSVR->GetArray().mpStringArray[ i ] ));
}
else if ( pSVR->GetArray().mpNumericArray != nullptr )
buf << pSVR->GetArray().mpNumericArray[ i ];
}
if ( pSVR->GetArrayLength() > MAX_PEEK_ELEMENTS )
buf <<
",..." ;
buf <<
"}" ;
return buf.str();
}
else if (ref->GetType() == formula::svDoubleVectorRef)
{
const formula::DoubleVectorRefToken* pDVR =
static_cast <
const formula::DoubleVectorRefToken*>(ref);
outputstream buf;
buf <<
"DoubleRef {" ;
for ( size_t i = 0; i < std::min< size_t >( MAX_PEEK_ELEMENTS, pDVR->GetArrayLength()); ++i )
{
if ( i != 0 )
buf <<
"," ;
if ( pDVR->GetArrays()[doubleRefIndex].mpStringArray != nullptr
&& pDVR->GetArrays()[doubleRefIndex].mpStringArray[ i ] != nullptr )
{
buf << LimitedString( OUString( pDVR->GetArrays()[doubleRefIndex].mpStringArray[ i ] ));
}
else if ( pDVR->GetArrays()[doubleRefIndex].mpNumericArray != nullptr )
buf << pDVR->GetArrays()[doubleRefIndex].mpNumericArray[ i ];
}
if ( pDVR->GetArrayLength() > MAX_PEEK_ELEMENTS )
buf <<
",..." ;
buf <<
"}" ;
return buf.str();
}
else if (ref->GetType() == formula::svString)
{
outputstream buf;
buf <<
"String " << LimitedString( ref->GetString().getString());
return buf.str();
}
else if (ref->GetType() == formula::svDouble)
{
return preciseFloat(ref->GetDouble());
}
else
{
return "?" ;
}
}
// Returns formatted contents of a doubles buffer, to be used in debug output.
std::string DebugPeekDoubles(
const double * data,
int size)
{
outputstream buf;
buf <<
"{" ;
for (
int i = 0; i < std::min( MAX_PEEK_ELEMENTS, size ); ++i )
{
if ( i != 0 )
buf <<
"," ;
buf << data[ i ];
}
if ( size > MAX_PEEK_ELEMENTS )
buf <<
",..." ;
buf <<
"}" ;
return buf.str();
}
}
// anonymous namespace
/// Map the buffer used by an argument and do necessary argument setting
size_t VectorRef::Marshal( cl_kernel k,
int argno,
int , cl_program )
{
OpenCLZone zone;
FormulaToken* ref = mFormulaTree->GetFormulaToken();
double * pHostBuffer = nullptr;
size_t szHostBuffer = 0;
if (ref->GetType() == formula::svSingleVectorRef)
{
const formula::SingleVectorRefToken* pSVR =
static_cast <
const formula::SingleVectorRefToken*>(ref);
SAL_INFO(
"sc.opencl" ,
"SingleVectorRef len=" << pSVR->GetArrayLength() <<
" mpNumericArray=" << pSVR->GetArray().mpNumericArray <<
" (mpStringArray=" << pSVR->GetArray().mpStringArray <<
")" );
if ( forceStringsToZero && pSVR->GetArray().mpStringArray != nullptr )
{
dataBuffer.resize( pSVR->GetArrayLength());
for ( size_t i = 0; i < pSVR->GetArrayLength(); ++i )
if ( pSVR->GetArray().mpStringArray[ i ] != nullptr )
dataBuffer[ i ] = 0;
else
dataBuffer[ i ] = pSVR->GetArray().mpNumericArray[ i ];
pHostBuffer = dataBuffer.data();
SAL_INFO(
"sc.opencl" ,
"Forced strings to zero : " << DebugPeekDoubles( pHostBuffer, pSV
R->GetArrayLength()));
}
else
{
pHostBuffer = const_cast <double *>(pSVR->GetArray().mpNumericArray);
}
szHostBuffer = pSVR->GetArrayLength() * sizeof (double );
}
else if (ref->GetType() == formula::svDoubleVectorRef)
{
const formula::DoubleVectorRefToken* pDVR =
static_cast <const formula::DoubleVectorRefToken*>(ref);
SAL_INFO("sc.opencl" , "DoubleVectorRef index=" << mnIndex << " len=" << pDVR->GetArrayLength() << " mpNumericArray=" << pDVR->GetArrays()[mnIndex].mpNumericArray << " (mpStringArray=" << pDVR->GetArrays()[mnIndex].mpStringArray << ")" );
if ( forceStringsToZero && pDVR->GetArrays()[mnIndex].mpStringArray != nullptr )
{
dataBuffer.resize( pDVR->GetArrayLength());
for ( size_t i = 0; i < pDVR->GetArrayLength(); ++i )
if ( pDVR->GetArrays()[mnIndex].mpStringArray[ i ] != nullptr )
dataBuffer[ i ] = 0;
else
dataBuffer[ i ] = pDVR->GetArrays()[mnIndex].mpNumericArray[ i ];
pHostBuffer = dataBuffer.data();
SAL_INFO("sc.opencl" , "Forced strings to zero : " << DebugPeekDoubles( pHostBuffer, pDVR->GetArrayLength()));
}
else
{
pHostBuffer = const_cast <double *>(pDVR->GetArrays()[mnIndex].mpNumericArray);
}
szHostBuffer = pDVR->GetArrayLength() * sizeof (double );
}
else
{
throw Unhandled(__FILE__, __LINE__);
}
openclwrapper::KernelEnv kEnv;
openclwrapper::setKernelEnv(&kEnv);
cl_int err;
if (pHostBuffer)
{
mpClmem = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR,
szHostBuffer,
pHostBuffer, &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem << " size " << szHostBuffer << " using host buffer " << pHostBuffer);
}
else
{
if (szHostBuffer == 0)
szHostBuffer = sizeof (double ); // a dummy small value
// Marshal as a buffer of NANs
mpClmem = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_ALLOC_HOST_PTR,
szHostBuffer, nullptr, &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem << " size " << szHostBuffer);
double * pNanBuffer = static_cast <double *>(clEnqueueMapBuffer(
kEnv.mpkCmdQueue, mpClmem, CL_TRUE, CL_MAP_WRITE, 0,
szHostBuffer, 0, nullptr, nullptr, &err));
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueMapBuffer" , err, __FILE__, __LINE__);
for (size_t i = 0; i < szHostBuffer / sizeof (double ); i++)
pNanBuffer[i] = std::numeric_limits<double >::quiet_NaN();
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem,
pNanBuffer, 0, nullptr, nullptr);
// FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails?
if (CL_SUCCESS != err)
SAL_WARN("sc.opencl" , "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err));
}
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem << " (" << DebugPeekData(ref, mnIndex) << ")" );
err = clSetKernelArg(k, argno, sizeof (cl_mem), static_cast <void *>(&mpClmem));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
namespace {
class DynamicKernelPiArgument : public DynamicKernelArgument
{
public :
DynamicKernelPiArgument( const ScCalcConfig& config, const std::string& s,
const FormulaTreeNodeRef& ft ) :
DynamicKernelArgument(config, s, ft) { }
/// Generate declaration
virtual void GenDecl( outputstream& ss ) const override
{
ss << "double " << mSymName;
}
virtual void GenDeclRef( outputstream& ss ) const override
{
ss << "M_PI" ;
}
virtual void GenSlidingWindowDecl( outputstream& ss ) const override
{
GenDecl(ss);
}
virtual std::string GenSlidingWindowDeclRef( bool = false ) const override
{
return mSymName;
}
virtual size_t GetWindowSize() const override
{
return 1;
}
/// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal( cl_kernel k, int argno, int , cl_program ) override
{
OpenCLZone zone;
double tmp = 0.0;
// Pass the scalar result back to the rest of the formula kernel
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno << ": double: " << preciseFloat( tmp ) << " (PI)" );
cl_int err = clSetKernelArg(k, argno, sizeof (double ), static_cast <void *>(&tmp));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
};
class DynamicKernelRandomArgument : public DynamicKernelArgument
{
public :
DynamicKernelRandomArgument( const ScCalcConfig& config, const std::string& s,
const FormulaTreeNodeRef& ft ) :
DynamicKernelArgument(config, s, ft) { }
/// Generate declaration
virtual void GenDecl( outputstream& ss ) const override
{
ss << "double " << mSymName;
}
virtual void GenDeclRef( outputstream& ss ) const override
{
ss << mSymName;
}
virtual void GenSlidingWindowDecl( outputstream& ss ) const override
{
ss << "int " << mSymName;
}
virtual std::string GenSlidingWindowDeclRef( bool = false ) const override
{
return mSymName + "_Random(" + mSymName + ")" ;
}
virtual void GenSlidingWindowFunction( outputstream& ss ) override
{
// This string is from the pi_opencl_kernel.i file as
// generated when building the Random123 examples. Unused
// stuff has been removed, and the actual kernel is not the
// same as in the totally different use case of that example,
// of course. Only the code that calculates the counter-based
// random number and what it needs is left.
ss << "\
\n\
#ifndef DEFINED_RANDOM123_STUFF\n\
#define DEFINED_RANDOM123_STUFF\n\
\n\
/*\n\
Copyright 2010-2011, D. E. Shaw Research.\n\
All rights reserved.\n\
\n\
Redistribution and use in source and binary forms, with or without\n\
modification, are permitted provided that the following conditions are\n\
met:\n\
\n\
* Redistributions of source code must retain the above copyright\n\
notice, this list of conditions, and the following disclaimer.\n\
\n\
* Redistributions in binary form must reproduce the above copyright\n\
notice, this list of conditions, and the following disclaimer in the\n\
documentation and/or other materials provided with the distribution.\n\
\n\
* Neither the name of D. E. Shaw Research nor the names of its\n\
contributors may be used to endorse or promote products derived from\n\
this software without specific prior written permission.\n\
\n\
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\
\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n\
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n\
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\n\
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\n\
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT\n\
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,\n\
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY\n\
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n\
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n\
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\
*/
\n\
typedef uint uint32_t;\n\
struct r123array2x32\n\
{\n\
uint32_t v[2];\n\
};\n\
enum r123_enum_threefry32x2\n\
{\n\
R_32x2_0_0 = 13,\n\
R_32x2_1_0 = 15,\n\
R_32x2_2_0 = 26,\n\
R_32x2_3_0 = 6,\n\
R_32x2_4_0 = 17,\n\
R_32x2_5_0 = 29,\n\
R_32x2_6_0 = 16,\n\
R_32x2_7_0 = 24\n\
};\n\
inline uint32_t RotL_32 (uint32_t x, unsigned int N)\n\
__attribute__ ((always_inline));\n\
inline uint32_t\n\
RotL_32 (uint32_t x, unsigned int N)\n\
{\n\
return (x << (N & 31)) | (x >> ((32 - N) & 31));\n\
}\n\
\n\
typedef struct r123array2x32 threefry2x32_ctr_t;\n\
typedef struct r123array2x32 threefry2x32_key_t;\n\
typedef struct r123array2x32 threefry2x32_ukey_t;\n\
inline threefry2x32_key_t\n\
threefry2x32keyinit (threefry2x32_ukey_t uk)\n\
{\n\
return uk;\n\
}\n\
\n\
inline threefry2x32_ctr_t threefry2x32_R (unsigned int Nrounds,\n\
threefry2x32_ctr_t in,\n\
threefry2x32_key_t k)\n\
__attribute__ ((always_inline));\n\
inline threefry2x32_ctr_t\n\
threefry2x32_R (unsigned int Nrounds, threefry2x32_ctr_t in,\n\
threefry2x32_key_t k)\n\
{\n\
threefry2x32_ctr_t X;\n\
uint32_t ks[2 + 1];\n\
int i;\n\
ks[2] = 0x1BD11BDA;\n\
for (i = 0; i < 2; i++) {\n\
ks[i] = k.v[i];\n\
X.v[i] = in.v[i];\n\
ks[2] ^= k.v[i];\n\
}\n\
X.v[0] += ks[0];\n\
X.v[1] += ks[1];\n\
if (Nrounds > 0) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_0_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 1) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_1_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 2) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_2_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 3) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_3_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 3) {\n\
X.v[0] += ks[1];\n\
X.v[1] += ks[2];\n\
X.v[1] += 1;\n\
}\n\
if (Nrounds > 4) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_4_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 5) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_5_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 6) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_6_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 7) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_7_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 7) {\n\
X.v[0] += ks[2];\n\
X.v[1] += ks[0];\n\
X.v[1] += 2;\n\
}\n\
if (Nrounds > 8) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_0_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 9) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_1_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 10) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_2_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 11) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_3_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 11) {\n\
X.v[0] += ks[0];\n\
X.v[1] += ks[1];\n\
X.v[1] += 3;\n\
}\n\
if (Nrounds > 12) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_4_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 13) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_5_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 14) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_6_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 15) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_7_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 15) {\n\
X.v[0] += ks[1];\n\
X.v[1] += ks[2];\n\
X.v[1] += 4;\n\
}\n\
if (Nrounds > 16) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_0_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 17) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_1_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 18) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_2_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 19) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_3_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 19) {\n\
X.v[0] += ks[2];\n\
X.v[1] += ks[0];\n\
X.v[1] += 5;\n\
}\n\
if (Nrounds > 20) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_4_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 21) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_5_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 22) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_6_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 23) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_7_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 23) {\n\
X.v[0] += ks[0];\n\
X.v[1] += ks[1];\n\
X.v[1] += 6;\n\
}\n\
if (Nrounds > 24) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_0_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 25) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_1_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 26) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_2_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 27) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_3_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 27) {\n\
X.v[0] += ks[1];\n\
X.v[1] += ks[2];\n\
X.v[1] += 7;\n\
}\n\
if (Nrounds > 28) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_4_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 29) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_5_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 30) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_6_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 31) {\n\
X.v[0] += X.v[1];\n\
X.v[1] = RotL_32 (X.v[1], R_32x2_7_0);\n\
X.v[1] ^= X.v[0];\n\
}\n\
if (Nrounds > 31) {\n\
X.v[0] += ks[2];\n\
X.v[1] += ks[0];\n\
X.v[1] += 8;\n\
}\n\
return X;\n\
}\n\
\n\
enum r123_enum_threefry2x32\n\
{ threefry2x32_rounds = 20 };\n\
inline threefry2x32_ctr_t threefry2x32 (threefry2x32_ctr_t in,\n\
threefry2x32_key_t k)\n\
__attribute__ ((always_inline));\n\
inline threefry2x32_ctr_t\n\
threefry2x32 (threefry2x32_ctr_t in, threefry2x32_key_t k)\n\
{\n\
return threefry2x32_R (threefry2x32_rounds, in, k);\n\
}\n\
#endif \n\
\n\
";
ss << "double " << mSymName << "_Random (int seed)\n\
{\n\
unsigned tid = get_global_id(0);\n\
threefry2x32_key_t k = { {tid, 0xdecafbad} };\n\
threefry2x32_ctr_t c = { {seed, 0xf00dcafe} };\n\
c = threefry2x32_R(threefry2x32_rounds, c, k);\n\
const double factor = 1./(" << SAL_MAX_UINT32 << " .0 + 1.0);\n\
const double halffactor = 0.5*factor;\n\
return c.v[0] * factor + halffactor;\n\
}\n\
";
}
virtual size_t GetWindowSize() const override
{
return 1;
}
/// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal( cl_kernel k, int argno, int , cl_program ) override
{
OpenCLZone zone;
cl_int seed = comphelper::rng::uniform_int_distribution(0, SAL_MAX_INT32);
// Pass the scalar result back to the rest of the formula kernel
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno << ": cl_int: " << seed << "(RANDOM)" );
cl_int err = clSetKernelArg(k, argno, sizeof (cl_int), static_cast <void *>(&seed));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
};
// Arguments that are actually compile-time constant string
class ConstStringArgument : public DynamicKernelArgument
{
public :
ConstStringArgument( const ScCalcConfig& config, const std::string& s,
const FormulaTreeNodeRef& ft ) :
DynamicKernelArgument(config, s, ft) { }
/// Generate declaration
virtual void GenDecl( outputstream& ss ) const override
{
ss << "double " << mSymName;
}
virtual void GenDeclRef( outputstream& ss ) const override
{
ss << GenSlidingWindowDeclRef();
}
virtual void GenSlidingWindowDecl( outputstream& ss ) const override
{
GenDecl(ss);
}
virtual std::string GenSlidingWindowDeclRef( bool = false ) const override
{
outputstream ss;
if (GetFormulaToken()->GetType() != formula::svString)
throw Unhandled(__FILE__, __LINE__);
FormulaToken* Tok = GetFormulaToken();
ss << GetStringId(Tok->GetString().getData());
return ss.str();
}
virtual std::string GenIsString( bool = false ) const override
{
return "true" ;
}
virtual size_t GetWindowSize() const override
{
return 1;
}
virtual size_t Marshal( cl_kernel k, int argno, int , cl_program ) override
{
FormulaToken* ref = mFormulaTree->GetFormulaToken();
if (ref->GetType() != formula::svString)
{
throw Unhandled(__FILE__, __LINE__);
}
cl_double stringId = GetStringId(ref->GetString().getData());
// Pass the scalar result back to the rest of the formula kernel
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno
<< ": stringId: " << stringId << " (" << DebugPeekData(ref) << ")" );
cl_int err = clSetKernelArg(k, argno, sizeof (cl_double), static_cast <void *>(&stringId));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
};
} // namespace
// Marshal a string vector reference
size_t DynamicKernelStringArgument::Marshal( cl_kernel k, int argno, int , cl_program )
{
OpenCLZone zone;
FormulaToken* ref = mFormulaTree->GetFormulaToken();
openclwrapper::KernelEnv kEnv;
openclwrapper::setKernelEnv(&kEnv);
cl_int err;
formula::VectorRefArray vRef;
size_t nStrings = 0;
if (ref->GetType() == formula::svSingleVectorRef)
{
const formula::SingleVectorRefToken* pSVR =
static_cast <const formula::SingleVectorRefToken*>(ref);
nStrings = pSVR->GetArrayLength();
vRef = pSVR->GetArray();
}
else if (ref->GetType() == formula::svDoubleVectorRef)
{
const formula::DoubleVectorRefToken* pDVR =
static_cast <const formula::DoubleVectorRefToken*>(ref);
nStrings = pDVR->GetArrayLength();
vRef = pDVR->GetArrays()[mnIndex];
}
size_t szHostBuffer = nStrings * sizeof (cl_double);
cl_double* pStringIdsBuffer = nullptr;
if (vRef.mpStringArray != nullptr)
{
// Marshal strings. See GetStringId().
mpClmem = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_ALLOC_HOST_PTR,
szHostBuffer, nullptr, &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem << " size " << szHostBuffer);
pStringIdsBuffer = static_cast <cl_double*>(clEnqueueMapBuffer(
kEnv.mpkCmdQueue, mpClmem, CL_TRUE, CL_MAP_WRITE, 0,
szHostBuffer, 0, nullptr, nullptr, &err));
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueMapBuffer" , err, __FILE__, __LINE__);
for (size_t i = 0; i < nStrings; i++)
{
if (vRef.mpStringArray[i])
pStringIdsBuffer[i] = GetStringId(vRef.mpStringArray[i]);
else
rtl::math::setNan(&pStringIdsBuffer[i]);
}
}
else
{
if (nStrings == 0)
szHostBuffer = sizeof (cl_double); // a dummy small value
// Marshal as a buffer of NANs
mpClmem = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_ALLOC_HOST_PTR,
szHostBuffer, nullptr, &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem << " size " << szHostBuffer);
pStringIdsBuffer = static_cast <cl_double*>(clEnqueueMapBuffer(
kEnv.mpkCmdQueue, mpClmem, CL_TRUE, CL_MAP_WRITE, 0,
szHostBuffer, 0, nullptr, nullptr, &err));
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueMapBuffer" , err, __FILE__, __LINE__);
for (size_t i = 0; i < szHostBuffer / sizeof (cl_double); i++)
rtl::math::setNan(&pStringIdsBuffer[i]);
}
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem,
pStringIdsBuffer, 0, nullptr, nullptr);
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueUnmapMemObject" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem
<< " (stringIds: " << DebugPeekDoubles(pStringIdsBuffer, nStrings) << " "
<< DebugPeekData(ref,mnIndex) << ")" );
err = clSetKernelArg(k, argno, sizeof (cl_mem), static_cast <void *>(&mpClmem));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
std::string DynamicKernelStringArgument::GenIsString( bool nested ) const
{
if ( nested )
return "!isnan(" + mSymName + "[gid0])" ;
FormulaToken* ref = mFormulaTree->GetFormulaToken();
size_t nStrings = 0;
if (ref->GetType() == formula::svSingleVectorRef)
{
const formula::SingleVectorRefToken* pSVR =
static_cast <const formula::SingleVectorRefToken*>(ref);
nStrings = pSVR->GetArrayLength();
}
else if (ref->GetType() == formula::svDoubleVectorRef)
{
const formula::DoubleVectorRefToken* pDVR =
static_cast <const formula::DoubleVectorRefToken*>(ref);
nStrings = pDVR->GetArrayLength();
}
else
return "!isnan(" + mSymName + "[gid0])" ;
outputstream ss;
ss << "(gid0 < " << nStrings << "? !isnan(" << mSymName << "[gid0]):NAN)" ;
return ss.str();
}
namespace {
/// A mixed string/numeric vector
class DynamicKernelMixedArgument : public VectorRef
{
public :
DynamicKernelMixedArgument( const ScCalcConfig& config, const std::string& s,
const FormulaTreeNodeRef& ft ) :
VectorRef(config, s, ft), mStringArgument(config, s + "s" , ft) { }
virtual void GenSlidingWindowDecl( outputstream& ss ) const override
{
VectorRef::GenSlidingWindowDecl(ss);
ss << ", " ;
mStringArgument.GenSlidingWindowDecl(ss);
}
virtual void GenSlidingWindowFunction( outputstream& ) override { }
/// Generate declaration
virtual void GenDecl( outputstream& ss ) const override
{
VectorRef::GenDecl(ss);
ss << ", " ;
mStringArgument.GenDecl(ss);
}
virtual void GenDeclRef( outputstream& ss ) const override
{
VectorRef::GenDeclRef(ss);
ss << "," ;
mStringArgument.GenDeclRef(ss);
}
virtual std::string GenSlidingWindowDeclRef( bool nested ) const override
{
outputstream ss;
ss << "(!isnan(" << VectorRef::GenSlidingWindowDeclRef(nested);
ss << ")?" << VectorRef::GenSlidingWindowDeclRef(nested);
ss << ":" << mStringArgument.GenSlidingWindowDeclRef(nested);
ss << ")" ;
return ss.str();
}
virtual std::string GenDoubleSlidingWindowDeclRef( bool nested = false ) const override
{
outputstream ss;
ss << VectorRef::GenSlidingWindowDeclRef( nested );
return ss.str();
}
virtual std::string GenStringSlidingWindowDeclRef( bool nested = false ) const override
{
outputstream ss;
ss << mStringArgument.GenSlidingWindowDeclRef( nested );
return ss.str();
}
virtual std::string GenIsString( bool nested = false ) const override
{
return mStringArgument.GenIsString( nested );
}
virtual size_t Marshal( cl_kernel k, int argno, int vw, cl_program p ) override
{
int i = VectorRef::Marshal(k, argno, vw, p);
i += mStringArgument.Marshal(k, argno + i, vw, p);
return i;
}
protected :
DynamicKernelStringArgument mStringArgument;
};
}
template <class Base>
DynamicKernelSlidingArgument<Base>::DynamicKernelSlidingArgument(
const ScCalcConfig& config, const std::string& s, const FormulaTreeNodeRef& ft,
std::shared_ptr<SlidingFunctionBase> CodeGen, int index)
: Base(config, s, ft, index)
, mpCodeGen(std::move(CodeGen))
{
FormulaToken* t = ft->GetFormulaToken();
if (t->GetType() != formula::svDoubleVectorRef)
throw Unhandled(__FILE__, __LINE__);
mpDVR = static_cast <const formula::DoubleVectorRefToken*>(t);
bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed();
}
template <class Base>
bool DynamicKernelSlidingArgument<Base>::NeedParallelReduction() const
{
assert(dynamic_cast <OpSumIfs*>(mpCodeGen.get()));
return GetWindowSize() > 100 &&
((GetStartFixed() && GetEndFixed()) ||
(!GetStartFixed() && !GetEndFixed()));
}
template <class Base>
std::string DynamicKernelSlidingArgument<Base>::GenSlidingWindowDeclRef( bool nested ) const
{
size_t nArrayLength = mpDVR->GetArrayLength();
outputstream ss;
if (!bIsStartFixed && !bIsEndFixed)
{
if (!nested)
ss << "((i+gid0) <" << nArrayLength << "?" ;
ss << Base::GetName() << "[i + gid0]" ;
if (!nested)
ss << ":NAN)" ;
}
else
{
if (!nested)
ss << "(i <" << nArrayLength << "?" ;
ss << Base::GetName() << "[i]" ;
if (!nested)
ss << ":NAN)" ;
}
return ss.str();
}
template <class Base>
size_t DynamicKernelSlidingArgument<Base>::GenReductionLoopHeader( outputstream& ss, bool & needBody )
{
assert(mpDVR);
size_t nCurWindowSize = mpDVR->GetRefRowSize();
if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
{
ss << "for (int i = " ;
ss << "gid0; i < " << mpDVR->GetArrayLength();
ss << " && i < " << nCurWindowSize << "; i++){\n\t\t" ;
needBody = true ;
return nCurWindowSize;
}
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
{
ss << "for (int i = " ;
ss << "0; i < " << mpDVR->GetArrayLength();
ss << " && i < gid0+" << nCurWindowSize << "; i++){\n\t\t" ;
needBody = true ;
return nCurWindowSize;
}
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
{
ss << "tmpBottom = " << mpCodeGen->GetBottom() << ";\n\t" ;
ss << "{int i;\n\t" ;
outputstream temp1, temp2;
int outLoopSize = UNROLLING_FACTOR;
if (nCurWindowSize / outLoopSize != 0)
{
ss << "for(int outLoop=0; outLoop<" << nCurWindowSize / outLoopSize << "; outLoop++){\n\t" ;
for (int count = 0; count < outLoopSize; count++)
{
ss << "i = outLoop*" << outLoopSize << "+" << count << ";\n\t" ;
if (count == 0)
{
temp1 << "if(i + gid0 < " << mpDVR->GetArrayLength();
temp1 << "){\n\t\t" ;
temp1 << "tmp = legalize(" ;
temp1 << mpCodeGen->Gen2(GenSlidingWindowDeclRef(), "tmp" );
temp1 << ", tmp);\n\t\t\t" ;
temp1 << "}\n\t" ;
}
ss << temp1.str();
}
ss << "}\n\t" ;
}
// The residual of mod outLoopSize
for (size_t count = nCurWindowSize / outLoopSize * outLoopSize; count < nCurWindowSize; count++)
{
ss << "i = " << count << ";\n\t" ;
if (count == nCurWindowSize / outLoopSize * outLoopSize)
{
temp2 << "if(i + gid0 < " << mpDVR->GetArrayLength();
temp2 << "){\n\t\t" ;
temp2 << "tmp = legalize(" ;
temp2 << mpCodeGen->Gen2(GenSlidingWindowDeclRef(), "tmp" );
temp2 << ", tmp);\n\t\t\t" ;
temp2 << "}\n\t" ;
}
ss << temp2.str();
}
ss << "}\n" ;
needBody = false ;
return nCurWindowSize;
}
// (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
else
{
ss << "\n\t" ;
ss << "tmpBottom = " << mpCodeGen->GetBottom() << ";\n\t" ;
ss << "{int i;\n\t" ;
outputstream temp1, temp2;
int outLoopSize = UNROLLING_FACTOR;
if (nCurWindowSize / outLoopSize != 0)
{
ss << "for(int outLoop=0; outLoop<" << nCurWindowSize / outLoopSize << "; outLoop++){\n\t" ;
for (int count = 0; count < outLoopSize; count++)
{
ss << "i = outLoop*" << outLoopSize << "+" << count << ";\n\t" ;
if (count == 0)
{
temp1 << "if(i < " << mpDVR->GetArrayLength();
temp1 << "){\n\t\t" ;
temp1 << "tmp = legalize(" ;
temp1 << mpCodeGen->Gen2(GenSlidingWindowDeclRef(), "tmp" );
temp1 << ", tmp);\n\t\t\t" ;
temp1 << "}\n\t" ;
}
ss << temp1.str();
}
ss << "}\n\t" ;
}
// The residual of mod outLoopSize
for (size_t count = nCurWindowSize / outLoopSize * outLoopSize; count < nCurWindowSize; count++)
{
ss << "i = " << count << ";\n\t" ;
if (count == nCurWindowSize / outLoopSize * outLoopSize)
{
temp2 << "if(i < " << mpDVR->GetArrayLength();
temp2 << "){\n\t\t" ;
temp2 << "tmp = legalize(" ;
temp2 << mpCodeGen->Gen2(GenSlidingWindowDeclRef(), "tmp" );
temp2 << ", tmp);\n\t\t\t" ;
temp2 << "}\n\t" ;
}
ss << temp2.str();
}
ss << "}\n" ;
needBody = false ;
return nCurWindowSize;
}
}
template class DynamicKernelSlidingArgument<VectorRef>;
template class DynamicKernelSlidingArgument<VectorRefStringsToZero>;
template class DynamicKernelSlidingArgument<DynamicKernelStringArgument>;
namespace {
/// A mixed string/numeric vector
class DynamicKernelMixedSlidingArgument : public VectorRef
{
public :
DynamicKernelMixedSlidingArgument( const ScCalcConfig& config, const std::string& s,
const FormulaTreeNodeRef& ft, const std::shared_ptr<SlidingFunctionBase>& CodeGen,
int index ) :
VectorRef(config, s, ft),
mDoubleArgument(mCalcConfig, s, ft, CodeGen, index),
mStringArgument(mCalcConfig, s + "s" , ft, CodeGen, index) { }
virtual void GenSlidingWindowDecl( outputstream& ss ) const override
{
mDoubleArgument.GenSlidingWindowDecl(ss);
ss << ", " ;
mStringArgument.GenSlidingWindowDecl(ss);
}
virtual void GenSlidingWindowFunction( outputstream& ) override { }
/// Generate declaration
virtual void GenDecl( outputstream& ss ) const override
{
mDoubleArgument.GenDecl(ss);
ss << ", " ;
mStringArgument.GenDecl(ss);
}
virtual void GenDeclRef( outputstream& ss ) const override
{
mDoubleArgument.GenDeclRef(ss);
ss << "," ;
mStringArgument.GenDeclRef(ss);
}
virtual std::string GenSlidingWindowDeclRef( bool nested ) const override
{
outputstream ss;
ss << "(!isnan(" << mDoubleArgument.GenSlidingWindowDeclRef(nested);
ss << ")?" << mDoubleArgument.GenSlidingWindowDeclRef(nested);
ss << ":" << mStringArgument.GenSlidingWindowDeclRef(nested);
ss << ")" ;
return ss.str();
}
virtual std::string GenDoubleSlidingWindowDeclRef( bool = false ) const override
{
outputstream ss;
ss << mDoubleArgument.GenSlidingWindowDeclRef();
return ss.str();
}
virtual std::string GenStringSlidingWindowDeclRef( bool = false ) const override
{
outputstream ss;
ss << mStringArgument.GenSlidingWindowDeclRef();
return ss.str();
}
virtual size_t Marshal( cl_kernel k, int argno, int vw, cl_program p ) override
{
int i = mDoubleArgument.Marshal(k, argno, vw, p);
i += mStringArgument.Marshal(k, argno + i, vw, p);
return i;
}
protected :
DynamicKernelSlidingArgument<VectorRef> mDoubleArgument;
DynamicKernelSlidingArgument<DynamicKernelStringArgument> mStringArgument;
};
/// Holds the symbol table for a given dynamic kernel
class SymbolTable
{
public :
typedef std::map<const formula::FormulaToken*, DynamicKernelArgumentRef> ArgumentMap;
// This avoids instability caused by using pointer as the key type
SymbolTable() : mCurId(0) { }
template <class T>
const DynamicKernelArgument* DeclRefArg(const ScCalcConfig& config, const FormulaTreeNodeRef&,
std::shared_ptr<SlidingFunctionBase> pCodeGen, int nResultSize);
/// Used to generate sliding window helpers
void DumpSlidingWindowFunctions( outputstream& ss )
{
for (auto const & argument : mParams)
{
argument->GenSlidingWindowFunction(ss);
ss << "\n" ;
}
}
/// Memory mapping from host to device and pass buffers to the given kernel as
/// arguments
void Marshal( cl_kernel, int , cl_program );
private :
unsigned int mCurId;
ArgumentMap mSymbols;
std::vector<DynamicKernelArgumentRef> mParams;
};
void SymbolTable::Marshal( cl_kernel k, int nVectorWidth, cl_program pProgram )
{
int i = 1; //The first argument is reserved for results
for (auto const & argument : mParams)
{
i += argument->Marshal(k, i, nVectorWidth, pProgram);
}
}
}
template <class Base>
ParallelReductionVectorRef<Base>::ParallelReductionVectorRef(
const ScCalcConfig& config, const std::string& s, const FormulaTreeNodeRef& ft,
std::shared_ptr<SlidingFunctionBase> CodeGen, int index)
: Base(config, s, ft, index)
, mpCodeGen(std::move(CodeGen))
, mpClmem2(nullptr)
{
FormulaToken* t = ft->GetFormulaToken();
if (t->GetType() != formula::svDoubleVectorRef)
throw Unhandled(__FILE__, __LINE__);
mpDVR = static_cast <const formula::DoubleVectorRefToken*>(t);
bIsStartFixed = mpDVR->IsStartFixed();
bIsEndFixed = mpDVR->IsEndFixed();
}
template <class Base>
void ParallelReductionVectorRef<Base>::GenSlidingWindowFunction( outputstream& ss )
{
if (!dynamic_cast <OpAverage*>(mpCodeGen.get()))
{
std::string name = Base::GetName();
ss << "__kernel void " << name;
ss << "_reduction(__global double* A, "
"__global double *result,int arrayLength,int windowSize){\n" ;
ss << " double tmp, current_result =" <<
mpCodeGen->GetBottom();
ss << ";\n" ;
ss << " int writePos = get_group_id(1);\n" ;
ss << " int lidx = get_local_id(0);\n" ;
ss << " __local double shm_buf[256];\n" ;
if (mpDVR->IsStartFixed())
ss << " int offset = 0;\n" ;
else // if (!mpDVR->IsStartFixed())
ss << " int offset = get_group_id(1);\n" ;
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = offset + windowSize;\n" ;
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = windowSize + get_group_id(1);\n" ;
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
ss << " end = min(end, arrayLength);\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " int loop = arrayLength/512 + 1;\n" ;
ss << " for (int l=0; l;
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n" ;
ss << " int loopOffset = l*512;\n" ;
ss << " if((loopOffset + lidx + offset + 256) < end) {\n" ;
ss << " tmp = legalize(" << mpCodeGen->Gen2(
"A[loopOffset + lidx + offset]" , "tmp" ) << ", tmp);\n" ;
ss << " tmp = legalize(" << mpCodeGen->Gen2(
"A[loopOffset + lidx + offset + 256]" , "tmp" ) << ", tmp);\n" ;
ss << " } else if ((loopOffset + lidx + offset) < end)\n" ;
ss << " tmp = legalize(" << mpCodeGen->Gen2(
"A[loopOffset + lidx + offset]" , "tmp" ) << ", tmp);\n" ;
ss << " shm_buf[lidx] = tmp;\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " for (int i = 128; i >0; i/=2) {\n" ;
ss << " if (lidx < i)\n" ;
ss << " shm_buf[lidx] = " ;
// Special case count
if (dynamic_cast <OpCount*>(mpCodeGen.get()))
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n" ;
else
ss << mpCodeGen->Gen2("shm_buf[lidx]" , "shm_buf[lidx + i]" ) << ";\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " current_result =" ;
if (dynamic_cast <OpCount*>(mpCodeGen.get()))
ss << "current_result + shm_buf[0]" ;
else
ss << mpCodeGen->Gen2("current_result" , "shm_buf[0]" );
ss << ";\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " result[writePos] = current_result;\n" ;
ss << "}\n" ;
}
else
{
std::string name = Base::GetName();
/*sum reduction*/
ss << "__kernel void " << name << "_sum" ;
ss << "_reduction(__global double* A, "
"__global double *result,int arrayLength,int windowSize){\n" ;
ss << " double tmp, current_result =" <<
mpCodeGen->GetBottom();
ss << ";\n" ;
ss << " int writePos = get_group_id(1);\n" ;
ss << " int lidx = get_local_id(0);\n" ;
ss << " __local double shm_buf[256];\n" ;
if (mpDVR->IsStartFixed())
ss << " int offset = 0;\n" ;
else // if (!mpDVR->IsStartFixed())
ss << " int offset = get_group_id(1);\n" ;
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = offset + windowSize;\n" ;
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = windowSize + get_group_id(1);\n" ;
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
ss << " end = min(end, arrayLength);\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " int loop = arrayLength/512 + 1;\n" ;
ss << " for (int l=0; l;
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n" ;
ss << " int loopOffset = l*512;\n" ;
ss << " if((loopOffset + lidx + offset + 256) < end) {\n" ;
ss << " tmp = legalize(" ;
ss << "(A[loopOffset + lidx + offset]+ tmp)" ;
ss << ", tmp);\n" ;
ss << " tmp = legalize((A[loopOffset + lidx + offset + 256]+ tmp)" ;
ss << ", tmp);\n" ;
ss << " } else if ((loopOffset + lidx + offset) < end)\n" ;
ss << " tmp = legalize((A[loopOffset + lidx + offset] + tmp)" ;
ss << ", tmp);\n" ;
ss << " shm_buf[lidx] = tmp;\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " for (int i = 128; i >0; i/=2) {\n" ;
ss << " if (lidx < i)\n" ;
ss << " shm_buf[lidx] = " ;
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " current_result =" ;
ss << "current_result + shm_buf[0]" ;
ss << ";\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " result[writePos] = current_result;\n" ;
ss << "}\n" ;
/*count reduction*/
ss << "__kernel void " << name << "_count" ;
ss << "_reduction(__global double* A, "
"__global double *result,int arrayLength,int windowSize){\n" ;
ss << " double tmp, current_result =" <<
mpCodeGen->GetBottom();
ss << ";\n" ;
ss << " int writePos = get_group_id(1);\n" ;
ss << " int lidx = get_local_id(0);\n" ;
ss << " __local double shm_buf[256];\n" ;
if (mpDVR->IsStartFixed())
ss << " int offset = 0;\n" ;
else // if (!mpDVR->IsStartFixed())
ss << " int offset = get_group_id(1);\n" ;
if (mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
else if (!mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = offset + windowSize;\n" ;
else if (mpDVR->IsStartFixed() && !mpDVR->IsEndFixed())
ss << " int end = windowSize + get_group_id(1);\n" ;
else if (!mpDVR->IsStartFixed() && mpDVR->IsEndFixed())
ss << " int end = windowSize;\n" ;
ss << " end = min(end, arrayLength);\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " int loop = arrayLength/512 + 1;\n" ;
ss << " for (int l=0; l;
ss << " tmp = " << mpCodeGen->GetBottom() << ";\n" ;
ss << " int loopOffset = l*512;\n" ;
ss << " if((loopOffset + lidx + offset + 256) < end) {\n" ;
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)" ;
ss << ", tmp);\n" ;
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset+256])?tmp:tmp+1.0)" ;
ss << ", tmp);\n" ;
ss << " } else if ((loopOffset + lidx + offset) < end)\n" ;
ss << " tmp = legalize((isnan(A[loopOffset + lidx + offset])?tmp:tmp+1.0)" ;
ss << ", tmp);\n" ;
ss << " shm_buf[lidx] = tmp;\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " for (int i = 128; i >0; i/=2) {\n" ;
ss << " if (lidx < i)\n" ;
ss << " shm_buf[lidx] = " ;
ss << "shm_buf[lidx] + shm_buf[lidx + i];\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " current_result =" ;
ss << "current_result + shm_buf[0];" ;
ss << ";\n" ;
ss << " barrier(CLK_LOCAL_MEM_FENCE);\n" ;
ss << " }\n" ;
ss << " if (lidx == 0)\n" ;
ss << " result[writePos] = current_result;\n" ;
ss << "}\n" ;
}
}
template <class Base>
std::string ParallelReductionVectorRef<Base>::GenSlidingWindowDeclRef( bool ) const
{
outputstream ss;
if (!bIsStartFixed && !bIsEndFixed)
ss << Base::GetName() << "[i + gid0]" ;
else
ss << Base::GetName() << "[i]" ;
return ss.str();
}
template <class Base>
size_t ParallelReductionVectorRef<Base>::GenReductionLoopHeader(
outputstream& ss, int nResultSize, bool & needBody )
{
assert(mpDVR);
size_t nCurWindowSize = mpDVR->GetRefRowSize();
std::string temp = Base::GetName() + "[gid0]" ;
ss << "tmp = " ;
// Special case count
if (dynamic_cast <OpAverage*>(mpCodeGen.get()))
{
ss << mpCodeGen->Gen2(temp, "tmp" ) << ";\n" ;
ss << "nCount = nCount-1;\n" ;
ss << "nCount = nCount +" ; /*re-assign nCount from count reduction*/
ss << Base::GetName() << "[gid0+" << nResultSize << "]" << ";\n" ;
}
else if (dynamic_cast <OpCount*>(mpCodeGen.get()))
ss << temp << "+ tmp" ;
else
ss << mpCodeGen->Gen2(temp, "tmp" );
ss << ";\n\t" ;
needBody = false ;
return nCurWindowSize;
}
template <class Base>
size_t ParallelReductionVectorRef<Base>::Marshal( cl_kernel k, int argno, int w, cl_program mpProgram )
{
assert(Base::mpClmem == nullptr);
OpenCLZone zone;
openclwrapper::KernelEnv kEnv;
openclwrapper::setKernelEnv(&kEnv);
cl_int err;
size_t nInput = mpDVR->GetArrayLength();
size_t nCurWindowSize = mpDVR->GetRefRowSize();
// create clmem buffer
if (mpDVR->GetArrays()[Base::mnIndex].mpNumericArray == nullptr)
throw Unhandled(__FILE__, __LINE__);
double * pHostBuffer = const_cast <double *>(
mpDVR->GetArrays()[Base::mnIndex].mpNumericArray);
size_t szHostBuffer = nInput * sizeof (double );
Base::mpClmem = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_ONLY) | CL_MEM_USE_HOST_PTR,
szHostBuffer,
pHostBuffer, &err);
SAL_INFO("sc.opencl" , "Created buffer " << Base::mpClmem << " size " << nInput << "*" << sizeof (double ) << "=" << szHostBuffer << " using host buffer " << pHostBuffer);
mpClmem2 = clCreateBuffer(kEnv.mpkContext,
CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
sizeof (double ) * w, nullptr, nullptr);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem2 << " size " << sizeof (double ) << "*" << w << "=" << (sizeof (double )*w));
// reproduce the reduction function name
std::string kernelName;
if (!dynamic_cast <OpAverage*>(mpCodeGen.get()))
kernelName = Base::GetName() + "_reduction" ;
else
kernelName = Base::GetName() + "_sum_reduction" ;
cl_kernel redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError("clCreateKernel" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram);
// set kernel arg of reduction kernel
// TODO(Wei Wei): use unique name for kernel
cl_mem buf = Base::GetCLBuffer();
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf);
err = clSetKernelArg(redKernel, 0, sizeof (cl_mem),
static_cast <void *>(&buf));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2);
err = clSetKernelArg(redKernel, 1, sizeof (cl_mem), &mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput);
err = clSetKernelArg(redKernel, 2, sizeof (cl_int), static_cast <void *>(&nInput));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize);
err = clSetKernelArg(redKernel, 3, sizeof (cl_int), static_cast <void *>(&nCurWindowSize));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
// set work group size and execute
size_t global_work_size[] = { 256, static_cast <size_t>(w) };
size_t const local_work_size[] = { 256, 1 };
SAL_INFO("sc.opencl" , "Enqueuing kernel " << redKernel);
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
global_work_size, local_work_size, 0, nullptr, nullptr);
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueNDRangeKernel" , err, __FILE__, __LINE__);
err = clFinish(kEnv.mpkCmdQueue);
if (CL_SUCCESS != err)
throw OpenCLError("clFinish" , err, __FILE__, __LINE__);
if (dynamic_cast <OpAverage*>(mpCodeGen.get()))
{
/*average need more reduction kernel for count computing*/
std::unique_ptr<double []> pAllBuffer(new double [2 * w]);
double * resbuf = static_cast <double *>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
mpClmem2,
CL_TRUE, CL_MAP_READ, 0,
sizeof (double ) * w, 0, nullptr, nullptr,
&err));
if (err != CL_SUCCESS)
throw OpenCLError("clEnqueueMapBuffer" , err, __FILE__, __LINE__);
for (int i = 0; i < w; i++)
pAllBuffer[i] = resbuf[i];
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr);
if (err != CL_SUCCESS)
throw OpenCLError("clEnqueueUnmapMemObject" , err, __FILE__, __LINE__);
kernelName = Base::GetName() + "_count_reduction" ;
redKernel = clCreateKernel(mpProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError("clCreateKernel" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created kernel " << redKernel << " with name " << kernelName << " in program " << mpProgram);
// set kernel arg of reduction kernel
buf = Base::GetCLBuffer();
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 0 << ": cl_mem: " << buf);
err = clSetKernelArg(redKernel, 0, sizeof (cl_mem),
static_cast <void *>(&buf));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 1 << ": cl_mem: " << mpClmem2);
err = clSetKernelArg(redKernel, 1, sizeof (cl_mem), &mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 2 << ": cl_int: " << nInput);
err = clSetKernelArg(redKernel, 2, sizeof (cl_int), static_cast <void *>(&nInput));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << 3 << ": cl_int: " << nCurWindowSize);
err = clSetKernelArg(redKernel, 3, sizeof (cl_int), static_cast <void *>(&nCurWindowSize));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
// set work group size and execute
size_t global_work_size1[] = { 256, static_cast <size_t>(w) };
size_t const local_work_size1[] = { 256, 1 };
SAL_INFO("sc.opencl" , "Enqueuing kernel " << redKernel);
err = clEnqueueNDRangeKernel(kEnv.mpkCmdQueue, redKernel, 2, nullptr,
global_work_size1, local_work_size1, 0, nullptr, nullptr);
if (CL_SUCCESS != err)
throw OpenCLError("clEnqueueNDRangeKernel" , err, __FILE__, __LINE__);
err = clFinish(kEnv.mpkCmdQueue);
if (CL_SUCCESS != err)
throw OpenCLError("clFinish" , err, __FILE__, __LINE__);
resbuf = static_cast <double *>(clEnqueueMapBuffer(kEnv.mpkCmdQueue,
mpClmem2,
CL_TRUE, CL_MAP_READ, 0,
sizeof (double ) * w, 0, nullptr, nullptr,
&err));
if (err != CL_SUCCESS)
throw OpenCLError("clEnqueueMapBuffer" , err, __FILE__, __LINE__);
for (int i = 0; i < w; i++)
pAllBuffer[i + w] = resbuf[i];
err = clEnqueueUnmapMemObject(kEnv.mpkCmdQueue, mpClmem2, resbuf, 0, nullptr, nullptr);
// FIXME: Is it intentional to not throw an OpenCLError even if the clEnqueueUnmapMemObject() fails?
if (CL_SUCCESS != err)
SAL_WARN("sc.opencl" , "clEnqueueUnmapMemObject failed: " << openclwrapper::errorString(err));
if (mpClmem2)
{
err = clReleaseMemObject(mpClmem2);
SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl" , "clReleaseMemObject failed: " << openclwrapper::errorString(err));
mpClmem2 = nullptr;
}
mpClmem2 = clCreateBuffer(kEnv.mpkContext,
cl_mem_flags(CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
w * sizeof (double ) * 2, pAllBuffer.get(), &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem2 << " size " << w << "*" << sizeof (double ) << "=" << (w*sizeof (double )) << " copying host buffer " << pAllBuffer.get());
}
// set kernel arg
SAL_INFO("sc.opencl" , "Kernel " << k << " arg " << argno << ": cl_mem: " << mpClmem2);
err = clSetKernelArg(k, argno, sizeof (cl_mem), &mpClmem2);
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
return 1;
}
template <class Base>
ParallelReductionVectorRef<Base>::~ParallelReductionVectorRef()
{
if (mpClmem2)
{
cl_int err;
err = clReleaseMemObject(mpClmem2);
SAL_WARN_IF(err != CL_SUCCESS, "sc.opencl" , "clReleaseMemObject failed: " << openclwrapper::errorString(err));
mpClmem2 = nullptr;
}
}
template class ParallelReductionVectorRef<VectorRef>;
namespace {
struct SumIfsArgs
{
explicit SumIfsArgs(cl_mem x) : mCLMem(x), mConst(0.0) { }
explicit SumIfsArgs(double x) : mCLMem(nullptr), mConst(x) { }
cl_mem mCLMem;
double mConst;
};
/// Helper functions that have multiple buffers
class DynamicKernelSoPArguments : public DynamicKernelArgument
{
public :
typedef std::vector<DynamicKernelArgumentRef> SubArgumentsType;
DynamicKernelSoPArguments( const ScCalcConfig& config,
const std::string& s, const FormulaTreeNodeRef& ft,
std::shared_ptr<SlidingFunctionBase> pCodeGen, int nResultSize );
/// Create buffer and pass the buffer to a given kernel
virtual size_t Marshal( cl_kernel k, int argno, int nVectorWidth, cl_program pProgram ) override
{
OpenCLZone zone;
unsigned i = 0;
for (const auto & rxSubArgument : mvSubArguments)
{
i += rxSubArgument->Marshal(k, argno + i, nVectorWidth, pProgram);
}
if (OpSumIfs* OpSumCodeGen = dynamic_cast <OpSumIfs*>(mpCodeGen.get()))
{
openclwrapper::KernelEnv kEnv;
openclwrapper::setKernelEnv(&kEnv);
cl_int err;
DynamicKernelArgument* Arg = mvSubArguments[0].get();
DynamicKernelSlidingArgument<VectorRef>* slidingArgPtr =
static_cast <DynamicKernelSlidingArgument<VectorRef>*>(Arg);
mpClmem2 = nullptr;
if (OpSumCodeGen->NeedReductionKernel())
{
size_t nInput = slidingArgPtr->GetArrayLength();
size_t nCurWindowSize = slidingArgPtr->GetWindowSize();
std::vector<SumIfsArgs> vclmem;
for (const auto & rxSubArgument : mvSubArguments)
{
if (VectorRef* VR = dynamic_cast <VectorRef*>(rxSubArgument.get()))
vclmem.emplace_back(VR->GetCLBuffer());
else if (DynamicKernelConstantArgument* CA = dynamic_cast <DynamicKernelConstantArgument*>(rxSubArgument.get()))
vclmem.emplace_back(CA->GetDouble());
else
vclmem.emplace_back(nullptr);
}
mpClmem2 = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE,
sizeof (double ) * nVectorWidth, nullptr, &err);
if (CL_SUCCESS != err)
throw OpenCLError("clCreateBuffer" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created buffer " << mpClmem2 << " size " << sizeof (double ) << "*" << nVectorWidth << "=" << (sizeof (double )*nVectorWidth));
std::string kernelName = mvSubArguments[0]->GetName() + "_SumIfs_reduction" ;
cl_kernel redKernel = clCreateKernel(pProgram, kernelName.c_str(), &err);
if (err != CL_SUCCESS)
throw OpenCLError("clCreateKernel" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Created kernel " << redKernel << " with name " << kernelName << " in program " << pProgram);
// set kernel arg of reduction kernel
for (size_t j = 0; j < vclmem.size(); j++)
{
if (vclmem[j].mCLMem)
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << j << ": cl_mem: " << vclmem[j].mCLMem);
else
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << j << ": double: " << preciseFloat( vclmem[j].mConst ));
err = clSetKernelArg(redKernel, j,
vclmem[j].mCLMem ? sizeof (cl_mem) : sizeof (double ),
vclmem[j].mCLMem ? static_cast <void *>(&vclmem[j].mCLMem) :
static_cast <void *>(&vclmem[j].mConst));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
}
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << vclmem.size() << ": cl_mem: " << mpClmem2);
err = clSetKernelArg(redKernel, vclmem.size(), sizeof (cl_mem), static_cast <void *>(&mpClmem2));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << (vclmem.size() + 1) << ": cl_int: " << nInput);
err = clSetKernelArg(redKernel, vclmem.size() + 1, sizeof (cl_int), static_cast <void *>(&nInput));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
SAL_INFO("sc.opencl" , "Kernel " << redKernel << " arg " << (vclmem.size() + 2) << ": cl_int: " << nCurWindowSize);
err = clSetKernelArg(redKernel, vclmem.size() + 2, sizeof (cl_int), static_cast <void *>(&nCurWindowSize));
if (CL_SUCCESS != err)
throw OpenCLError("clSetKernelArg" , err, __FILE__, __LINE__);
// set work group size and execute
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5 C=97 H=94 G=95
¤ Dauer der Verarbeitung: 0.20 Sekunden
¤
*© Formatika GbR, Deutschland