New hat/backends/ffi/cuda/cpp/squares.cpp

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include "cuda_backend.h"
 27 class KernelContextWithBufferState{
 28 public:
 29 int x;
 30 int maxX;
 31 BufferState bufferState;
 32 };
 33 struct ArgArray_2 {
 34     int argc;
 35     u8_t pad12[12];
 36     KernelArg argv[2];
 37 };
 38 
 39 
 40 struct S32Array1024WithBufferState {
 41     int length;
 42     int array[1024];
 43     BufferState bufferState;
 44 };
 45 int main(int argc, char **argv) {
 46     CudaBackend cudaBackend(0
 47             | Backend::Config::Config::INFO_BIT
 48             | Backend::Config::Config::TRACE_CALLS_BIT
 49             | Backend::Config::Config::TRACE_COPIES_BIT
 50     );
 51 
 52     //std::string cudaPath =  "/home/gfrost/github/grfrost/babylon-grfrost-fork/hat/squares.cuda";
 53     CudaSource cudaSource((char *) R"(
 54        #define NDRANGE_CUDA
 55        #define __global
 56        typedef char s8_t;
 57        typedef char byte;
 58        typedef char boolean;
 59        typedef unsigned char u8_t;
 60        typedef short s16_t;
 61        typedef unsigned short u16_t;
 62        typedef unsigned int u32_t;
 63        typedef int s32_t;
 64        typedef float f32_t;
 65        typedef long s64_t;
 66        typedef unsigned long u64_t;
 67        typedef struct KernelContext_s{
 68          int x;
 69          int maxX;
 70        }KernelContext_t;
 71        typedef struct S32Array_s{
 72           int length;
 73           int array[1];
 74        }S32Array_t;
 75 
 76        extern "C" __device__ inline int squareit(
 77            int v
 78        ){
 79           return v*v;
 80        }
 81 
 82        extern "C" __global__ void squareKernel(
 83            KernelContext_t *global_kc,  S32Array_t* s32Array
 84        ){
 85          KernelContext_t mine;
 86          KernelContext_t* kc=&mine;
 87          kc->x=blockIdx.x*blockDim.x+threadIdx.x;
 88          kc->maxX=global_kc->maxX;
 89          if(kc->x<kc->maxX){
 90              int value = s32Array->array[(long)kc->x];
 91              s32Array->array[(long)kc->x]=squareit(value);
 92          }
 93          return;
 94         }
 95     )");
 96 
 97     PtxSource ptxSource((char*)R"(
 98        // Generated by NVIDIA NVVM Compiler
 99        //
100        // Compiler Build ID: CL-33191640
101        // Cuda compilation tools, release 12.2, V12.2.140
102        // Based on NVVM 7.0.1
103        //
104 
105        .version 8.2
106        .target sm_52
107        .address_size 64
108 
109                // .globl        squareKernel
110 
111        .visible .entry squareKernel(
112                .param .u64 squareKernel_param_0,
113                .param .u64 squareKernel_param_1
114        )
115        {
116                .reg .pred           %p<2>;
117                .reg .b32            %r<8>;
118                .reg .b64            %rd<7>;
119 
120 
121                ld.param.u64         %rd2, [squareKernel_param_0];
122                ld.param.u64         %rd1, [squareKernel_param_1];
123                cvta.to.global.u64         %rd3, %rd2;
124                mov.u32              %r2, %ntid.x;
125                mov.u32              %r3, %ctaid.x;
126                mov.u32              %r4, %tid.x;
127                mad.lo.s32           %r1, %r3, %r2, %r4;
128                ld.global.u32        %r5, [%rd3+4];
129                setp.ge.s32          %p1, %r1, %r5;
130                @%p1 bra             $L__BB0_2;
131 
132                cvta.to.global.u64   %rd4, %rd1;
133                mul.wide.s32         %rd5, %r1, 4;
134                add.s64              %rd6, %rd4, %rd5;
135                ld.global.u32        %r6, [%rd6+4];
136                mul.lo.s32           %r7, %r6, %r6;
137                st.global.u32        [%rd6+4], %r7;
138 
139        $L__BB0_2:
140                ret;
141        }
142     )");
143 int maxX = 1024;
144     auto *module =cudaBackend.compile(cudaSource);
145      auto  *kernelContextWithBufferState = bufferOf<KernelContextWithBufferState>("kernelcontext");
146     kernelContextWithBufferState->x=0;
147     kernelContextWithBufferState->maxX=maxX;
148     auto *pS32Array1024WithBufferState = bufferOf<S32Array1024WithBufferState>("s32Arrayx1024");
149     pS32Array1024WithBufferState->length=maxX;
150     for (int i=0; i<pS32Array1024WithBufferState->length; i++){
151         pS32Array1024WithBufferState->array[i]=i;
152     }
153 
154     ArgArray_2 args2Array{.argc = 2, .argv={
155             {.idx = 0, .variant = '&',.value = {.buffer ={.memorySegment = (void *) kernelContextWithBufferState, .sizeInBytes = sizeof(KernelContextWithBufferState), .access = RO_BYTE}}},
156             {.idx = 1, .variant = '&',.value = {.buffer ={.memorySegment = (void *) pS32Array1024WithBufferState, .sizeInBytes = sizeof(S32Array1024WithBufferState), .access = RW_BYTE}}}
157     }};
158     auto kernel = module->getCudaKernel((char*)"squareKernel");
159     std::cout << kernel->name <<std::endl;
160     kernel->ndrange( reinterpret_cast<ArgArray_s *>(&args2Array));
161     for (int i=0; i<pS32Array1024WithBufferState->length; i++){
162         int sq = pS32Array1024WithBufferState->array[i];
163         std::cout << i << " sq="<<sq <<std::endl;
164     }
165 }
166