1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include "cuda_backend.h" 27 class KernelContextWithBufferState{ 28 public: 29 int x; 30 int maxX; 31 BufferState bufferState; 32 }; 33 struct ArgArray_2 { 34 int argc; 35 u8_t pad12[12]; 36 KernelArg argv[2]; 37 }; 38 39 40 struct S32Array1024WithBufferState { 41 int length; 42 int array[1024]; 43 BufferState bufferState; 44 }; 45 int main(int argc, char **argv) { 46 CudaBackend cudaBackend(0 47 | Backend::Config::Config::INFO_BIT 48 | Backend::Config::Config::TRACE_CALLS_BIT 49 | Backend::Config::Config::TRACE_COPIES_BIT 50 ); 51 52 //std::string cudaPath = "/home/gfrost/github/grfrost/babylon-grfrost-fork/hat/squares.cuda"; 53 CudaSource cudaSource((char *) R"( 54 #define NDRANGE_CUDA 55 #define __global 56 typedef char s8_t; 57 typedef char byte; 58 typedef char boolean; 59 typedef unsigned char u8_t; 60 typedef short s16_t; 61 typedef unsigned short u16_t; 62 typedef unsigned int u32_t; 63 typedef int s32_t; 64 typedef float f32_t; 65 typedef long s64_t; 66 typedef unsigned long u64_t; 67 typedef struct KernelContext_s{ 68 int x; 69 int maxX; 70 }KernelContext_t; 71 typedef struct S32Array_s{ 72 int length; 73 int array[1]; 74 }S32Array_t; 75 76 extern "C" __device__ inline int squareit( 77 int v 78 ){ 79 return v*v; 80 } 81 82 extern "C" __global__ void squareKernel( 83 KernelContext_t *global_kc, S32Array_t* s32Array 84 ){ 85 KernelContext_t mine; 86 KernelContext_t* kc=&mine; 87 kc->x=blockIdx.x*blockDim.x+threadIdx.x; 88 kc->maxX=global_kc->maxX; 89 if(kc->x<kc->maxX){ 90 int value = s32Array->array[(long)kc->x]; 91 s32Array->array[(long)kc->x]=squareit(value); 92 } 93 return; 94 } 95 )"); 96 97 PtxSource ptxSource((char*)R"( 98 // Generated by NVIDIA NVVM Compiler 99 // 100 // Compiler Build ID: CL-33191640 101 // Cuda compilation tools, release 12.2, V12.2.140 102 // Based on NVVM 7.0.1 103 // 104 105 .version 8.2 106 .target sm_52 107 .address_size 64 108 109 // .globl squareKernel 110 111 .visible .entry squareKernel( 112 .param .u64 squareKernel_param_0, 113 .param .u64 squareKernel_param_1 114 ) 115 { 116 .reg .pred %p<2>; 117 .reg .b32 %r<8>; 118 .reg .b64 %rd<7>; 119 120 121 ld.param.u64 %rd2, [squareKernel_param_0]; 122 ld.param.u64 %rd1, [squareKernel_param_1]; 123 cvta.to.global.u64 %rd3, %rd2; 124 mov.u32 %r2, %ntid.x; 125 mov.u32 %r3, %ctaid.x; 126 mov.u32 %r4, %tid.x; 127 mad.lo.s32 %r1, %r3, %r2, %r4; 128 ld.global.u32 %r5, [%rd3+4]; 129 setp.ge.s32 %p1, %r1, %r5; 130 @%p1 bra $L__BB0_2; 131 132 cvta.to.global.u64 %rd4, %rd1; 133 mul.wide.s32 %rd5, %r1, 4; 134 add.s64 %rd6, %rd4, %rd5; 135 ld.global.u32 %r6, [%rd6+4]; 136 mul.lo.s32 %r7, %r6, %r6; 137 st.global.u32 [%rd6+4], %r7; 138 139 $L__BB0_2: 140 ret; 141 } 142 )"); 143 int maxX = 1024; 144 auto *module =cudaBackend.compile(cudaSource); 145 auto *kernelContextWithBufferState = bufferOf<KernelContextWithBufferState>("kernelcontext"); 146 kernelContextWithBufferState->x=0; 147 kernelContextWithBufferState->maxX=maxX; 148 auto *pS32Array1024WithBufferState = bufferOf<S32Array1024WithBufferState>("s32Arrayx1024"); 149 pS32Array1024WithBufferState->length=maxX; 150 for (int i=0; i<pS32Array1024WithBufferState->length; i++){ 151 pS32Array1024WithBufferState->array[i]=i; 152 } 153 154 ArgArray_2 args2Array{.argc = 2, .argv={ 155 {.idx = 0, .variant = '&',.value = {.buffer ={.memorySegment = (void *) kernelContextWithBufferState, .sizeInBytes = sizeof(KernelContextWithBufferState), .access = RO_BYTE}}}, 156 {.idx = 1, .variant = '&',.value = {.buffer ={.memorySegment = (void *) pS32Array1024WithBufferState, .sizeInBytes = sizeof(S32Array1024WithBufferState), .access = RW_BYTE}}} 157 }}; 158 auto kernel = module->getCudaKernel((char*)"squareKernel"); 159 std::cout << kernel->name <<std::endl; 160 kernel->ndrange( reinterpret_cast<ArgArray_s *>(&args2Array)); 161 for (int i=0; i<pS32Array1024WithBufferState->length; i++){ 162 int sq = pS32Array1024WithBufferState->array[i]; 163 std::cout << i << " sq="<<sq <<std::endl; 164 } 165 } 166