1 /*
2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Oracle designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Oracle in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 */
25
26 #include "cuda_backend.h"
27 #include "../include/cuda_backend.h"
28
29 class KernelContextWithBufferState{
30 public:
31 int x;
32 int maxX;
33 BufferState bufferState;
34 };
35 struct ArgArray_2 {
36 int argc;
37 u8_t pad12[12];
38 KernelArg argv[2];
39 };
40
41
42 struct S32Array1024WithBufferState {
43 int length;
44 int array[1024];
45 BufferState bufferState;
46 };
47 int main(int argc, char **argv) {
48 CudaBackend cudaBackend(0
49 | Backend::Config::Config::INFO_BIT
50 | Backend::Config::Config::TRACE_CALLS_BIT
51 | Backend::Config::Config::TRACE_COPIES_BIT
52 );
53 CudaSource* cudaSource = new CudaSource((char *) R"(
54 #define NDRANGE_CUDA
55 #define __global
56 typedef char s8_t;
57 typedef char byte;
58 typedef char boolean;
59 typedef unsigned char u8_t;
60 typedef short s16_t;
61 typedef unsigned short u16_t;
62 typedef unsigned int u32_t;
63 typedef int s32_t;
64 typedef float f32_t;
65 typedef long s64_t;
66 typedef unsigned long u64_t;
67 typedef struct KernelContext_s{
68 int x;
69 int maxX;
70 }KernelContext_t;
71 typedef struct S32Array_s{
72 int length;
73 int array[1];
74 }S32Array_t;
75
76 extern "C" __device__ inline int squareit(
77 int v
78 ){
79 return v*v;
80 }
81
82 extern "C" __global__ void squareKernel(
83 KernelContext_t *global_kc, S32Array_t* s32Array
84 ){
85 KernelContext_t mine;
86 KernelContext_t* kc=&mine;
87 kc->x=blockIdx.x*blockDim.x+threadIdx.x;
88 kc->maxX=global_kc->maxX;
89 if(kc->x<kc->maxX){
90 int value = s32Array->array[(long)kc->x];
91 s32Array->array[(long)kc->x]=squareit(value);
92 }
93 return;
94 }
95 )");
96
97 PtxSource *ptxSource = new PtxSource((char *)(R"(
98 // Generated by NVIDIA NVVM Compiler
99 //
100 // Compiler Build ID: CL-33191640
101 // Cuda compilation tools, release 12.2, V12.2.140
102 // Based on NVVM 7.0.1
103 //
104
105 .version 8.2
106 .target sm_52
107 .address_size 64
108
109 // .globl squareKernel
110
111 .visible .entry squareKernel(
112 .param .u64 squareKernel_param_0,
113 .param .u64 squareKernel_param_1
114 )
115 {
116 .reg .pred %p<2>;
117 .reg .b32 %r<8>;
118 .reg .b64 %rd<7>;
119
120
121 ld.param.u64 %rd2, [squareKernel_param_0];
122 ld.param.u64 %rd1, [squareKernel_param_1];
123 cvta.to.global.u64 %rd3, %rd2;
124 mov.u32 %r2, %ntid.x;
125 mov.u32 %r3, %ctaid.x;
126 mov.u32 %r4, %tid.x;
127 mad.lo.s32 %r1, %r3, %r2, %r4;
128 ld.global.u32 %r5, [%rd3+4];
129 setp.ge.s32 %p1, %r1, %r5;
130 @%p1 bra $L__BB0_2;
131
132 cvta.to.global.u64 %rd4, %rd1;
133 mul.wide.s32 %rd5, %r1, 4;
134 add.s64 %rd6, %rd4, %rd5;
135 ld.global.u32 %r6, [%rd6+4];
136 mul.lo.s32 %r7, %r6, %r6;
137 st.global.u32 [%rd6+4], %r7;
138
139 $L__BB0_2:
140 ret;
141 }
142 )"));
143 const int maxX = 32;
144
145 bool useCuda=false;
146 std::cerr<<"using " << (useCuda?"CUDA":"PTX")<<std::endl;
147 auto *module =useCuda?cudaBackend.compile(cudaSource):cudaBackend.compile(ptxSource);
148 //auto *module =cudaBackend.compile(ptxSource);
149
150 auto *kernelContextWithBufferState = bufferOf<KernelContextWithBufferState>("kernelcontext");
151 kernelContextWithBufferState->x=0;
152 kernelContextWithBufferState->maxX=maxX;
153 auto *pS32Array1024WithBufferState = bufferOf<S32Array1024WithBufferState>("s32Arrayx1024");
154 pS32Array1024WithBufferState->length=maxX;
155 for (int i=0; i<pS32Array1024WithBufferState->length; i++){
156 pS32Array1024WithBufferState->array[i]=i;
157 }
158
159 ArgArray_2 args2Array{.argc = 2, .argv={
160 {.idx = 0, .variant = '&',.value = {.buffer ={.memorySegment = static_cast<void *>(kernelContextWithBufferState), .sizeInBytes = sizeof(KernelContextWithBufferState), .access = RO_BYTE}}},
161 {.idx = 1, .variant = '&',.value = {.buffer ={.memorySegment = static_cast<void *>(pS32Array1024WithBufferState), .sizeInBytes = sizeof(S32Array1024WithBufferState), .access = RW_BYTE}}}
162 }};
163 const auto kernel = module->getCudaKernel((char*)"squareKernel");
164 std::cout << kernel->name <<std::endl;
165 kernel->ndrange( &args2Array);
166 for (int i=0; i<pS32Array1024WithBufferState->length; i++){
167 const int sq = pS32Array1024WithBufferState->array[i];
168 std::cout << i << " sq="<<sq <<std::endl;
169 }
170 }
171