New hat/backends/ffi/cuda/src/main/native/cpp/squares.cpp

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include "cuda_backend.h"
 27 #include "../include/cuda_backend.h"
 28 
 29 class KernelContextWithBufferState{
 30 public:
 31 int x;
 32 int maxX;
 33 BufferState bufferState;
 34 };
 35 struct ArgArray_2 {
 36     int argc;
 37     u8_t pad12[12];
 38     KernelArg argv[2];
 39 };
 40 
 41 
 42 struct S32Array1024WithBufferState {
 43     int length;
 44     int array[1024];
 45     BufferState bufferState;
 46 };
 47 int main(int argc, char **argv) {
 48     CudaBackend cudaBackend(0
 49             | Backend::Config::Config::INFO_BIT
 50             | Backend::Config::Config::TRACE_CALLS_BIT
 51             | Backend::Config::Config::TRACE_COPIES_BIT
 52     );
 53     CudaSource* cudaSource = new CudaSource((char *) R"(
 54        #define NDRANGE_CUDA
 55        #define __global
 56        typedef char s8_t;
 57        typedef char byte;
 58        typedef char boolean;
 59        typedef unsigned char u8_t;
 60        typedef short s16_t;
 61        typedef unsigned short u16_t;
 62        typedef unsigned int u32_t;
 63        typedef int s32_t;
 64        typedef float f32_t;
 65        typedef long s64_t;
 66        typedef unsigned long u64_t;
 67        typedef struct KernelContext_s{
 68          int x;
 69          int maxX;
 70        }KernelContext_t;
 71        typedef struct S32Array_s{
 72           int length;
 73           int array[1];
 74        }S32Array_t;
 75 
 76        extern "C" __device__ int squareit(
 77            int v
 78        ){
 79           return v*v;
 80        }
 81 
 82        extern "C" __global__ void squareKernel(
 83            KernelContext_t *global_kc,  S32Array_t* s32Array
 84        ){
 85          KernelContext_t mine;
 86          KernelContext_t* kc=&mine;
 87          kc->x=blockIdx.x*blockDim.x+threadIdx.x;
 88          kc->maxX=global_kc->maxX;
 89          if(kc->x<kc->maxX){
 90              int value = s32Array->array[(long)kc->x];
 91              s32Array->array[(long)kc->x]=squareit(value);
 92          }
 93          return;
 94         }
 95     )");
 96 
 97     PtxSource *ptxSource = new PtxSource((char *)(R"(
 98        // Generated by NVIDIA NVVM Compiler
 99        //
100        // Compiler Build ID: CL-33191640
101        // Cuda compilation tools, release 12.2, V12.2.140
102        // Based on NVVM 7.0.1
103        //
104 
105        .version 8.2
106        .target sm_52
107        .address_size 64
108 
109                // .globl        squareKernel
110 
111        .visible .entry squareKernel(
112                .param .u64 squareKernel_param_0,
113                .param .u64 squareKernel_param_1
114        )
115        {
116                .reg .pred           %p<2>;
117                .reg .b32            %r<8>;
118                .reg .b64            %rd<7>;
119 
120 
121                ld.param.u64         %rd2, [squareKernel_param_0];
122                ld.param.u64         %rd1, [squareKernel_param_1];
123                cvta.to.global.u64         %rd3, %rd2;
124                mov.u32              %r2, %ntid.x;
125                mov.u32              %r3, %ctaid.x;
126                mov.u32              %r4, %tid.x;
127                mad.lo.s32           %r1, %r3, %r2, %r4;
128                ld.global.u32        %r5, [%rd3+4];
129                setp.ge.s32          %p1, %r1, %r5;
130                @%p1 bra             $L__BB0_2;
131 
132                cvta.to.global.u64   %rd4, %rd1;
133                mul.wide.s32         %rd5, %r1, 4;
134                add.s64              %rd6, %rd4, %rd5;
135                ld.global.u32        %r6, [%rd6+4];
136                mul.lo.s32           %r7, %r6, %r6;
137                st.global.u32        [%rd6+4], %r7;
138 
139        $L__BB0_2:
140                ret;
141        }
142     )"));
143     const int maxX = 32;
144 
145     bool useCuda=false;
146     std::cerr<<"using " << (useCuda?"CUDA":"PTX")<<std::endl;
147     auto *module =useCuda?cudaBackend.compile(cudaSource):cudaBackend.compile(ptxSource);
148    //auto *module =cudaBackend.compile(ptxSource);
149 
150      auto  *kernelContextWithBufferState = bufferOf<KernelContextWithBufferState>("kernelcontext");
151     kernelContextWithBufferState->x=0;
152     kernelContextWithBufferState->maxX=maxX;
153     auto *pS32Array1024WithBufferState = bufferOf<S32Array1024WithBufferState>("s32Arrayx1024");
154     pS32Array1024WithBufferState->length=maxX;
155     for (int i=0; i<pS32Array1024WithBufferState->length; i++){
156         pS32Array1024WithBufferState->array[i]=i;
157     }
158 
159     ArgArray_2 args2Array{.argc = 2, .argv={
160             {.idx = 0, .variant = '&',.value = {.buffer ={.memorySegment = static_cast<void *>(kernelContextWithBufferState), .sizeInBytes = sizeof(KernelContextWithBufferState), .access = RO_BYTE}}},
161             {.idx = 1, .variant = '&',.value = {.buffer ={.memorySegment = static_cast<void *>(pS32Array1024WithBufferState), .sizeInBytes = sizeof(S32Array1024WithBufferState), .access = RW_BYTE}}}
162     }};
163     const auto kernel = module->getCudaKernel((char*)"squareKernel");
164     std::cout << kernel->name <<std::endl;
165     kernel->ndrange( &args2Array);
166     for (int i=0; i<pS32Array1024WithBufferState->length; i++){
167         const int sq = pS32Array1024WithBufferState->array[i];
168         std::cout << i << " sq="<<sq <<std::endl;
169     }
170 }
171