Processing Device Arrays with C++ Metaprogramming
Transcription
Processing Device Arrays with C++ Metaprogramming
Jonathan Cohen (NVIDIA Research)
GTC, San Jose Convention Center, CA | Sept. 20–23, 2010

Motivating Example

struct DArray1D {
  int _size;
  float *_ptr; // device pointer
  DArray1D(int size) : _size(size), _ptr(0) {…}
  ~DArray1D() {…}
};

__global__ void addition_kernel(
    int n, float *result, float *a, float *b)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = a[i] + b[i];
}

void addition(
    DArray1D &result, DArray1D &a, DArray1D &b)
{
  int n = result._size;
  addition_kernel<<<(n+255)/256, 256>>>(
      n, result._ptr, a._ptr, b._ptr);
}

API Design – Take 1: Small building blocks

void shift(DArray1D &output, DArray1D &input, int amount);
void scale(DArray1D &output, DArray1D &input, float scale);
void add(DArray1D &output, DArray1D &input_a, DArray1D &input_b);

Combine these to build complex expressions:

shift(a_min1, a, -1);
shift(a_plu1, a, 1);
scale(a_cent, a, -2.0);
add(result, a_min1, a_plu1);
add(result, result, a_cent);
// result = Laplacian(a)

Problem: intermediate storage + bandwidth

Shift then add using intermediate storage:
  Shift kernel: R1 = b[i+1]; b'[i] = R1
  Add kernel:   R1 = a[i]; R2 = b'[i]; R3 = R1+R2; c[i] = R3
Total traffic: 3 reads, 2 writes, and 1 extra array (b') of intermediate storage.

Better: store intermediate results in registers

Fused shift then add:
  Fused kernel: R1 = a[i]; R2 = b[i+1]; R3 = R1+R2; c[i] = R3
Total traffic: 2 reads, 1 write; the intermediate value stays in registers.
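As a minimal illustration of what such a hand-fused kernel looks like (this sketch is mine, not from the deck; the names fused_shift_add_kernel and fused_shift_add are hypothetical):

// Hedged sketch: a hand-fused "shift b by +1, then add to a" kernel matching
// the register traffic described above (2 reads, 1 write, sum kept in a register).
// Assumes b[i+1] is a valid read, e.g. b is padded or allocated with n+1 elements.
__global__ void fused_shift_add_kernel(
    int n, float *c, const float *a, const float *b)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    c[i] = a[i] + b[i + 1];  // shifted read of b; no temporary array is ever written
}

void fused_shift_add(DArray1D &c, DArray1D &a, DArray1D &b)
{
  int n = c._size;
  fused_shift_add_kernel<<<(n + 255) / 256, 256>>>(n, c._ptr, a._ptr, b._ptr);
}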
API Design – Take 2: Fused kernels

Let's turn this into an API: fuse all possible combinations together.
Benefit: efficient execution, minimal storage.

__global__ void addition3_kernel(
    int n, float *result, float *a, float *b, float *c)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = a[i] + b[i] + c[i];
}

__global__ void addition3_sc_kernel(
    int n, float *result, float *a, float *b, float *c,
    float as, float bs, float cs)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = as*a[i] + bs*b[i] + cs*c[i];
}

__global__ void addition3_sc_sh_kernel(
    int n, float *result, float *a, float *b, float *c,
    float as, float bs, float cs,
    int ash, int bsh, int csh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = as*a[i+ash] + bs*b[i+bsh] + cs*c[i+csh];
}

__global__ void addition2_sc_sh_mult_kernel(
    int n, float *result, float *a, float *b, float *d,
    float as, float bs,
    int ash, int bsh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = (as*a[i+ash] + bs*b[i+bsh]) * d[i];
}

__global__ void addition3_sc_sh_mult_kernel(
    int n, float *result, float *a, float *b, float *c, float *d,
    float as, float bs, float cs,
    int ash, int bsh, int csh)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = (as*a[i+ash] + bs*b[i+bsh] + cs*c[i+csh]) * d[i];
}

// ... and each kernel needs its own host-side wrapper, all nearly identical:

void addition(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c)
{
  int n = result._size;
  addition3_kernel<<<(n+255)/256, 256>>>(
      n, result._ptr, a._ptr, b._ptr, c._ptr);
}

void addition(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c,
    float as, float bs, float cs,
    int ash, int bsh, int csh)
{
  int n = result._size;
  addition3_sc_sh_kernel<<<(n+255)/256, 256>>>(
      n, result._ptr, a._ptr, b._ptr, c._ptr,
      as, bs, cs, ash, bsh, csh);
}

// ... plus matching wrappers for addition3_sc_kernel,
// addition2_sc_sh_mult_kernel, and addition3_sc_sh_mult_kernel.

All routines do basically the same thing! And that's just for addition:

void addition(
    DArray1D &result, DArray1D &a, DArray1D &b);
void addition(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c);
void addition_pt_wise_scale(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &scale);
void addition_pt_wise_scale(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c, DArray1D &scale);
void addition_pt_wise_scale_shift(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &scale,
    int ash, int bsh, int ssh);
void addition_ptwise_scale_scale_shift(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c, DArray1D &d,
    float as, float bs, float cs, int ash, int bsh, int csh);
void addition_scale_shift(
    DArray1D &result, DArray1D &a, DArray1D &b, DArray1D &c,
    float as, float bs, float cs, int ash, int bsh, int csh);
void addition_scale_shift(
    DArray1D &result, DArray1D &a, DArray1D &b,
    float as, float bs, int ash, int bsh);

API Design – Take 3: Emit fused kernels on-demand

Expression Templates: a C++ technique for performing compile-time calculations based on expressions. nvcc includes robust C++ template support.

Client code:

DArray1D a, result;
result = a[-1] + a[1] - constant(2.0) * a[0];

Generated kernel:

__global__ void auto_generated(
    int n, float *result, const float *a)
{
  int i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < n)
    result[i] = (a[i-1] + a[i+1] - 2.0f * a[i]);
}

Expression Templates Approach

1. Build Abstract Syntax Tree (AST) using C++ types
2. Generate AST nodes automatically using templated functions
3. Provide framework code for calling emitted kernels
4. "Walk" AST to emit kernel

1. Build AST Using C++ Types

Abstract operator: provides the implementation as an exec() routine whose inputs are an index and abstract parameters.

template<class PARM>
struct Op {
  __device__ static float exec(int i, const PARM &p) {
    /* return some function of p and i */
  }
};

Abstract closure: binds an operator to specific parameter state.

template<class OP, class PARM>
struct OpWithParm {
  OpWithParm(const PARM &p) : parm(p) { }
  PARM parm;
  __device__ float exec(int i) const { return OP::exec(i, parm); }
};

Leaf nodes: for the constant 5.0, OP = LeafOp<ConstantParm> and PARM = ConstantParm = {5.0}; for the array read a[i], OP = LeafOp<ArrayLookupParm> and PARM = ArrayLookupParm = {a, 0}.

template<class PARM>
struct LeafOp {
  __device__ static float exec(int i, const PARM &p) {
    return p.value(i);
  }
};

struct ConstantParm {
  float _value;
  __device__ float value(int i) const { return _value; }
};

struct ArrayLookupParm {
  const float *_ptr;
  int _shift;
  __device__ float value(int i) const { return _ptr[i + _shift]; }
};

Interior node: for the expression 5.0 + a[i],
OP = PlusOp<LeafOp<…>, LeafOp<…>, ConstantParm, ArrayLookupParm> and
PARM = ParmPair<ConstantParm, ArrayLookupParm> = {{5.0}, {a, 0}}.

template<typename LPARM, typename RPARM>
struct ParmPair {
  LPARM left;
  RPARM right;
  ParmPair(const LPARM &l, const RPARM &r) : left(l), right(r) { }
};

template<class LOP, class ROP, class LPARM, class RPARM>
struct PlusOp {
  __device__ static float exec(
      int i, const ParmPair<LPARM, RPARM> &p) {
    return LOP::exec(i, p.left) + ROP::exec(i, p.right);
  }
};
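To make the encoding concrete, here is an illustrative spelling-out, not taken from the slides, of the complete type that represents 5.0 + a[i] under the definitions above; the name PlusNode is hypothetical:

// Illustrative only (hypothetical typedef): the whole AST for "5.0 + a[i]"
// is carried by this single type, composed from the pieces defined above.
typedef OpWithParm<
            PlusOp<LeafOp<ConstantParm>, LeafOp<ArrayLookupParm>,
                   ConstantParm, ArrayLookupParm>,
            ParmPair<ConstantParm, ArrayLookupParm> >
        PlusNode;

// The only run-time state a PlusNode carries is its parm member, a ParmPair
// holding {{5.0f}, {a._ptr, 0}}; the operator structure lives entirely in the
// type, which is what lets the compiler inline the whole expression into one
// fused kernel.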
2. Generate AST nodes from templated functions

Expression templates use templated functions to compute output types as well as output values (T. Veldhuizen, "Expression Templates," C++ Report, pp. 26–31, June 1995).

template<typename A>
computed_type<A> my_function(A) { return computed_type<A>(…); }

my_class a;
my_function(a); // <= return type is computed_type<my_class>

For the constant 5.0 (OP = LeafOp<ConstantParm>, PARM = ConstantParm = {5.0}):

OpWithParm<LeafOp<ConstantParm>, ConstantParm>
constant(float value)
{
  return OpWithParm<LeafOp<ConstantParm>, ConstantParm>(
      ConstantParm(value));
}

node = constant(5.0);

For the array read a[i] (OP = LeafOp<ArrayLookupParm>, PARM = ArrayLookupParm = {a, 0}):

OpWithParm<LeafOp<ArrayLookupParm>, ArrayLookupParm>
DArray1D::operator[](int shift)
{
  return OpWithParm<LeafOp<ArrayLookupParm>, ArrayLookupParm>(
      ArrayLookupParm(_ptr, shift));
}

node = a[0];

For the + node (OP = PlusOp<LeafOp<…>, LeafOp<…>, ConstantParm, ArrayLookupParm>, PARM = ParmPair<ConstantParm, ArrayLookupParm> = {{5.0}, {a, 0}}):

template<class LOP, class LPARM, class ROP, class RPARM>
OpWithParm<PlusOp<LOP, ROP, LPARM, RPARM>, ParmPair<LPARM, RPARM> >
operator+(
    const OpWithParm<LOP, LPARM> &left,
    const OpWithParm<ROP, RPARM> &right)
{
  return OpWithParm<PlusOp<LOP, ROP, LPARM, RPARM>, ParmPair<LPARM, RPARM> >(
      ParmPair<LPARM, RPARM>(left.parm, right.parm));
}

node = constant(5.0) + a[0];

3. Framework for calling kernels

The OP::exec(index, parm) routine provides per-index evaluation. OpWithParm binds the parm instance: OP::exec(…, …) => OP::exec(…, parm) (aka "currying"). Calling OP::exec once per array index is the CUDA kernel; assigning the result to each entry of the result array is an overload of DArray1D::operator=(OpWithParm).

template <typename OP, typename PARM>
__global__ void kernel_assign(
    const OpWithParm<OP,PARM> functor, float *result, int size)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    result[i] = functor.exec(i);
  }
}

template<typename OP, typename PARM>
DArray1D &
DArray1D::operator=(const OpWithParm<OP,PARM> &func)
{
  kernel_assign<<<(_size+255)/256, 256>>>(func, _ptr, _size);
  return *this;
}

4. Walk AST to emit kernel

Invocation of the top-level OpWithParm::exec triggers recursive expansion. For 5.0 + a[i]:

OpWithParm::exec(i)
  => PlusOp::exec(i, parm)
    => LeafOp<ConstantParm>::exec(i, parm.left)     => parm.left.value(i)  => returns 5.0
    => LeafOp<ArrayLookupParm>::exec(i, parm.right) => parm.right.value(i) => returns a[i]

For example, the assignment below builds a multiply node whose children are the constant 0.5 and the sum B[i] + C[i]:

DeviceArray1D A(100), B(100), C(100);
A = constant(0.5f) * (B[0] + C[0]);

__global__ void generated_kernel_assign(
    generated_OpWithParm ftor, float *dst, int nx)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nx) {
    dst[i] = ftor.parm.left._value * (
        ftor.parm.right.left._ptr[i + ftor.parm.right.left._shift] +
        ftor.parm.right.right._ptr[i + ftor.parm.right.right._shift]);
  }
}

Generated kernel: properly fused add + scale:
  R1 = b[i]; R2 = c[i]; R3 = R1+R2; R4 = 0.5*R3; a[i] = R4
Total traffic: 2 reads, 1 write; the intermediate values stay in registers.

One AST => Many emitted functions

So far, the code generator builds a per-index exec(int, parm) kernel. Other useful things the same AST can drive:
- Validate that all array reads are in-bounds
- Count the number of FLOPs (a hedged sketch follows below)
- Count the number of bytes read
- Etc.
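The talk shows code only for the bounds-checking walker (next section). As a hypothetical sketch of the FLOP-counting idea, not part of the original framework, one could walk the same node types on the host; the name count_flops and the overload-based traversal are my own illustration, and only the leaf and plus nodes defined above are covered:

// Hypothetical sketch, not from the talk: count floating-point operations per
// output element by walking the AST types on the host. Only LeafOp and PlusOp
// are handled; other operators would need their own overloads.
template<class PARM>
int count_flops(const OpWithParm<LeafOp<PARM>, PARM> &)
{
  return 0;  // a leaf is just a read (constant or shifted array access)
}

template<class LOP, class ROP, class LPARM, class RPARM>
int count_flops(const OpWithParm<PlusOp<LOP, ROP, LPARM, RPARM>,
                                 ParmPair<LPARM, RPARM> > &node)
{
  // one add for this node, plus the cost of the two subexpressions
  return 1 + count_flops(OpWithParm<LOP, LPARM>(node.parm.left))
           + count_flops(OpWithParm<ROP, RPARM>(node.parm.right));
}

// e.g. count_flops(a[-1] + a[1] + a[0]) would return 2; a byte-read counter
// could follow the same pattern with a non-zero cost at ArrayLookupParm leaves.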
Example: bounds checking

Add array padding to DArray1D. With

DArray1D a(8, 1);

the array has 8 interior elements plus one element of padding on each side, so a[-1] through a[8] are all valid accesses and shifted reads stay in bounds.

Now we can implement the 1D Laplacian:

DArray1D a(n,1), result(n,1);
… // initialize a
result = a[-1] - constant(2.0) * a[0] + a[1];

BUT, improper padding will generate an out-of-bounds access:

DArray1D a(n,0), result(n,1);

We would like to catch these errors before kernel launch.

Solution: generate a range checker from the AST. Each OP validates an index range based on any array accesses it performs; OpWithParm validates the index range for the entire tree.

template<class PARM>
struct Op {
  ...
  static bool validate(const Range &rng, const PARM &p) { ... }
};

template<class OP, class PARM>
struct OpWithParm {
  ...
  bool validate(const Range &rng) const {
    return OP::validate(rng, parm);
  }
};

template<typename OP, typename PARM>
DArray1D &
DArray1D::operator=(const OpWithParm<OP,PARM> &func)
{
  if (!func.validate(this->range())) {
    // run-time error
  }
  kernel_assign<<<(_size+255)/256, 256>>>(func, _ptr, _size);
  return *this;
}

The output range is validated before kernel launch: illegal shifts generate a run-time error, so bad memory accesses are impossible.

Results – 1D Heat Equation – Explicit Euler

Implementation | Lines of code       | Time (speedup), Laptop (1) | Time (speedup), HPC Workstation (2)
SERIAL (CPU)   | 28                  | 5,760.5 ms (1x)            | 3,068.3 ms (1x)
METAPROG       | 18                  | 543.3 ms (11x)             | 36.5 ms (84x)
HAND           | 20 host + 34 device | 523.2 ms (11x)             | 29.5 ms (104x)
HAND-OPT       | 20 host + 54 device | 217.8 ms (26x)             | 30.2 ms (101x)

1 - Laptop: NVIDIA Quadro FX 570M + 2-core Intel T7300 Centrino @ 2 GHz
2 - Workstation: NVIDIA Tesla C2050 w/ ECC + 4-core Intel Core i7 @ 3.07 GHz

Getting Complicated…

Write routines:

my_array(0,127) = constant(1.0);

Array slices:

my_array(0,127,2) = my_array[-1] + my_array[+1];

Array reshaping (like Fortran):

DArray1D from(64);
from = constant(-1.0);
my_array(0,127,2) = from(0,63);

void restrict_residual(
    DArray1D &U_f, const DArray1D &B_f, DArray1D &R_f, DArray1D &B_c,
    FP h, int level)
{
  int n = R_f._size;
  update_bc(U_f, level);
  R_f = B_f[0] - constant(1.0/(h*h)) *
        (U_f[1] + U_f[-1] - constant(2) * U_f[0]);
  B_c = constant(.5) * (R_f.read(0, n-2, 2) + R_f.read(1, n-1, 2));
}

void prolong(
    DArray1D &U_c, DArray1D &U_f, int level)
{
  update_bc(U_c, level+1);
  int n_c = U_c._size, n_f = U_f._size;
  U_f(-1, n_f-1, 2) = U_f.read(-1, n_f-1, 2) +
      constant(.75) * U_c.read(-1, n_c-1) + constant(.25) * U_c.read(0, n_c);
  U_f( 0, n_f,   2) = U_f.read( 0, n_f,   2) +
      constant(.25) * U_c.read(-1, n_c-1) + constant(.75) * U_c.read(0, n_c);
  update_bc(U_f, level);
}

Results – 1D Poisson Equation – Multigrid

Implementation | Lines of code (excl. ws, //, /* */) | Time (speedup), Laptop (1), fp32 | Time (speedup), HPC Workstation (2)
SERIAL (CPU)   | 127                                 | 26,973 (5x)                      | 12,819 (26x)
METAPROG       | 119                                 | 5,291 (1x)                       | 501 (1x)

1 - Laptop: NVIDIA Quadro FX 570M + 2-core Intel T7300 Centrino @ 2 GHz
2 - Workstation: NVIDIA Tesla C2050 w/ ECC + 4-core Intel Core i7 @ 3.07 GHz

Future Work

- Multi-GPU backend: infer data movement from access patterns
- More backends: OpenMP, OpenCL, naïve single-threaded, etc.
- More parallel primitives + sophisticated fusion = Copperhead
- Use of shared memory: map work to parallel thread sets, rather than parallel threads

For more info

- Forthcoming chapter in GPU Computing Gems Volume 2
- Google Code: http://code.google.com/p/cuda-metaprog/
- jocohen@nvidia.com
- http://research.nvidia.com/users/jonathan-cohen