1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include <cuda_runtime_api.h>
 29 #include "cuda_backend.h"
 30 
 31 Ptx::Ptx(size_t len)
 32         : len(len), text(len > 0 ? new char[len] : nullptr) {
 33     std::cout << "in Ptx with buffer allocated "<<len << std::endl;
 34 }
 35 
 36 Ptx::~Ptx() {
 37     if (len > 0 && text != nullptr) {
 38         std::cout << "in ~Ptx with deleting allocated "<<len << std::endl;
 39         delete[] text;
 40     }
 41 }
 42 
 43 uint64_t timeSinceEpochMillisec() {
 44     using namespace std::chrono;
 45     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 46 }
 47 
 48 Ptx *Ptx::nvcc(const char *cudaSource, size_t len) {
 49     Ptx *ptx = nullptr;
 50     uint64_t time = timeSinceEpochMillisec();
 51     std::stringstream timestampPtx;
 52     timestampPtx << "./tmp" << time << ".ptx";
 53     const char *ptxPath = strdup(timestampPtx.str().c_str());
 54    // std::cout << "ptx " << ptxPath << std::endl;
 55     // we are going to fork exec nvcc
 56     int pid;
 57     if ((pid = fork()) == 0) {
 58         std::ofstream cuda;
 59         std::stringstream timestampCuda;
 60         timestampCuda << "./tmp" << time << ".cu";
 61         const char *cudaPath = strdup(timestampCuda.str().c_str());
 62         std::cout << "cuda " << cudaPath << std::endl;
 63         cuda.open(cudaPath, std::ofstream::trunc);
 64         cuda.write(cudaSource, len);
 65         cuda.close();
 66         const char *path = "/usr/bin/nvcc";
 67         //const char *path = "/usr/local/cuda-12.2/bin/nvcc";
 68         const char *argv[]{"nvcc", "-ptx", cudaPath, "-o", ptxPath, nullptr};
 69         // we can't free cudaPath or ptxpath in child because we need them in exec, no prob through
 70         // because we get a new proc so they are released to os
 71         execvp(path, (char *const *) argv);
 72 
 73     } else if (pid < 0) {
 74         // fork failed.
 75         std::cerr << "fork of nvcc failed" << std::endl;
 76         std::exit(1);
 77     } else {
 78         int status;
 79      //   std::cerr << "fork suceeded waiting for child" << std::endl;
 80         pid_t result = wait(&status);
 81         std::cerr << "child finished" << std::endl;
 82         std::ifstream ptxStream;
 83         ptxStream.open(ptxPath);
 84       //  if (ptxStream.is_open()) {
 85             ptxStream.seekg(0, std::ios::end);
 86             size_t ptxLen = ptxStream.tellg();
 87             ptxStream.close();
 88             ptxStream.open(ptxPath);
 89             free((void *) ptxPath);
 90             ptxPath = nullptr;
 91             if (ptxLen > 0) {
 92                 std::cerr << "ptx len "<< ptxLen << std::endl;
 93                 ptx = new Ptx(ptxLen + 1);
 94                 std::cerr << "about to read  "<< ptx->len << std::endl;
 95                 ptxStream.read(ptx->text, ptx->len);
 96                 ptxStream.close();
 97                 std::cerr << "about to read  "<< ptx->len << std::endl;
 98                 ptx->text[ptx->len - 1] = '\0';
 99                 std::cerr << "read text "<< ptx->text << std::endl;
100 
101             } else {
102                 std::cerr << "no ptx! ptxLen == 0?";
103                 exit(1);
104             }
105       //  }else{
106         //    std::cerr << "no ptx!";
107        //     exit(1);
108       //  }
109     }
110     std::cout << "returning PTX" << std::endl;
111     return ptx;
112 }
113 
114 /*
115 //http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp
116  */
117 CudaBackend::CudaProgram::CudaKernel::CudaBuffer::CudaBuffer(Backend::Program::Kernel *kernel, Arg_s *arg)
118         : Buffer(kernel, arg), devicePtr() {
119     /*
120      *   (void *) arg->value.buffer.memorySegment,
121      *   (size_t) arg->value.buffer.sizeInBytes);
122      */
123   //  std::cout << "cuMemAlloc()" << std::endl;
124     CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes);
125     if (CUDA_SUCCESS != status) {
126         std::cerr << "cuMemFree() CUDA error = " << status
127                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
128                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
129         exit(-1);
130     }
131   //  std::cout << "devptr " << std::hex<<  (long)devicePtr <<std::dec <<std::endl;
132     arg->value.buffer.vendorPtr = static_cast<void *>(this);
133 }
134 
135 CudaBackend::CudaProgram::CudaKernel::CudaBuffer::~CudaBuffer() {
136 
137  //   std::cout << "cuMemFree()"
138   //          << "devptr " << std::hex<<  (long)devicePtr <<std::dec
139    //         << std::endl;
140     CUresult  status = cuMemFree(devicePtr);
141     if (CUDA_SUCCESS != status) {
142         std::cerr << "cuMemFree() CUDA error = " << status
143                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
144                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
145         exit(-1);
146     }
147     arg->value.buffer.vendorPtr = nullptr;
148 }
149 
150 void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyToDevice() {
151     auto cudaKernel = dynamic_cast<CudaKernel*>(kernel);
152  //   std::cout << "copyToDevice() 0x"   << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
153  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
154  //             << std::endl;
155     char *ptr = (char*)arg->value.buffer.memorySegment;
156 
157     CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment, arg->value.buffer.sizeInBytes,cudaKernel->cudaStream);
158     if (CUDA_SUCCESS != status) {
159         std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status
160                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
161                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
162         exit(-1);
163     }
164     status = static_cast<CUresult >(cudaStreamSynchronize(cudaKernel->cudaStream));
165     if (CUDA_SUCCESS != status) {
166         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
167                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
168                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
169         exit(-1);
170     }
171 }
172 
173 void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyFromDevice() {
174     auto cudaKernel = dynamic_cast<CudaKernel*>(kernel);
175  //   std::cout << "copyFromDevice() 0x" << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
176  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
177   //            << std::endl;
178     char *ptr = (char*)arg->value.buffer.memorySegment;
179 
180     CUresult status =cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr, arg->value.buffer.sizeInBytes,cudaKernel->cudaStream);
181     if (CUDA_SUCCESS != status) {
182         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
183                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
184                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
185         exit(-1);
186     }
187     cudaError_t t1 = cudaStreamSynchronize(cudaKernel->cudaStream);
188     if (static_cast<cudaError_t>(CUDA_SUCCESS) != t1) {
189         std::cerr << "CUDA error = " << t1
190                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(t1))
191                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
192         exit(-1);
193     }
194 
195 }
196 
197 CudaBackend::CudaProgram::CudaKernel::CudaKernel(Backend::Program *program,char * name, CUfunction function)
198         : Backend::Program::Kernel(program, name), function(function),cudaStream() {
199 }
200 
201 CudaBackend::CudaProgram::CudaKernel::~CudaKernel() = default;
202 
203 long CudaBackend::CudaProgram::CudaKernel::ndrange(void *argArray) {
204   //  std::cout << "ndrange(" << range << ") " << name << std::endl;
205 
206     cudaStreamCreate(&cudaStream);
207     ArgSled argSled(static_cast<ArgArray_s *>(argArray));
208  //   Schema::dumpSled(std::cout, argArray);
209     void *argslist[argSled.argc()];
210     NDRange *ndrange = nullptr;
211 #ifdef VERBOSE
212     std::cerr << "there are " << argSled.argc() << "args " << std::endl;
213 #endif
214     for (int i = 0; i < argSled.argc(); i++) {
215         Arg_s *arg = argSled.arg(i);
216         switch (arg->variant) {
217             case '&': {
218                 if (arg->idx == 0){
219                     ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
220                 }
221                 auto cudaBuffer = new CudaBuffer(this, arg);
222                 cudaBuffer->copyToDevice();
223                 argslist[arg->idx] = static_cast<void *>(&cudaBuffer->devicePtr);
224                 break;
225             }
226             case 'I':
227             case 'F':
228             case 'J':
229             case 'D':
230             case 'C':
231             case 'S': {
232                 argslist[arg->idx] = static_cast<void *>(&arg->value);
233                 break;
234             }
235             default: {
236                 std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
237                 break;
238             }
239         }
240     }
241 
242     int range = ndrange->maxX;
243     int rangediv1024 = range / 1024;
244     int rangemod1024 = range % 1024;
245     if (rangemod1024 > 0) {
246         rangediv1024++;
247     }
248    // std::cout << "Running the kernel..." << std::endl;
249   //  std::cout << "   Requested range   = " << range << std::endl;
250   //  std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
251    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
252     auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
253     if (CUDA_SUCCESS != status) {
254         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
255                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
256                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
257         exit(-1);
258     }
259 
260     status= cuLaunchKernel(function,
261                                    rangediv1024, 1, 1,
262                                    1024, 1, 1,
263                                    0, cudaStream,
264                     argslist, 0);
265     if (CUDA_SUCCESS != status) {
266         std::cerr << "cuLaunchKernel() CUDA error = " << status
267                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
268                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
269         exit(-1);
270     }
271     status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
272     if (CUDA_SUCCESS != status) {
273         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
274                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
275                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
276         exit(-1);
277     }
278 
279     //std::cout << "Kernel complete..."<<cudaGetErrorString(t)<<std::endl;
280 
281     for (int i = 0; i < argSled.argc(); i++) {
282         Arg_s *arg = argSled.arg(i);
283         if (arg->variant == '&') {
284             static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice();
285 
286         }
287     }
288     status=   static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
289     if (CUDA_SUCCESS != status) {
290         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
291                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
292                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
293         exit(-1);
294     }
295 
296     for (int i = 0; i < argSled.argc(); i++) {
297         Arg_s *arg = argSled.arg(i);
298         if (arg->variant == '&') {
299             delete static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr);
300             arg->value.buffer.vendorPtr = nullptr;
301         }
302     }
303     cudaStreamDestroy(cudaStream);
304     return (long) 0;
305 }
306 
307 
308 CudaBackend::CudaProgram::CudaProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module)
309         : Backend::Program(backend, buildInfo), ptx(ptx), module(module) {
310 }
311 
312 CudaBackend::CudaProgram::~CudaProgram() = default;
313 
314 long CudaBackend::CudaProgram::getKernel(int nameLen, char *name) {
315     CUfunction function;
316     CUresult status= cuModuleGetFunction(&function, module, name);
317     if (CUDA_SUCCESS != status) {
318         std::cerr << "cuModuleGetFunction() CUDA error = " << status
319                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
320                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
321         exit(-1);
322     }
323     long kernelHandle =  reinterpret_cast<long>(new CudaKernel(this, name, function));
324     return kernelHandle;
325 }
326 
327 bool CudaBackend::CudaProgram::programOK() {
328     return true;
329 }
330 
331 CudaBackend::CudaBackend(int mode)
332         : Backend(mode), device(),context()  {
333   //  std::cout << "CudaBackend constructor " << ((cudaConfig == nullptr) ? "cudaConfig== null" : "got cudaConfig")
334     //          << std::endl;
335     int deviceCount = 0;
336     CUresult err = cuInit(0);
337     if (err == CUDA_SUCCESS) {
338         cuDeviceGetCount(&deviceCount);
339         std::cout << "CudaBackend device count" << std::endl;
340         cuDeviceGet(&device, 0);
341         std::cout << "CudaBackend device ok" << std::endl;
342         cuCtxCreate(&context, 0, device);
343         std::cout << "CudaBackend context created ok" << std::endl;
344     } else {
345         std::cout << "CudaBackend failed, we seem to have the runtime library but no device, no context, nada "
346                   << std::endl;
347         exit(1);
348     }
349 }
350 
351 //CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) {
352 //
353 //}
354 
355 CudaBackend::~CudaBackend() {
356     std::cout << "freeing context" << std::endl;
357     CUresult status = cuCtxDestroy(context);
358     if (CUDA_SUCCESS != status) {
359         std::cerr << "cuCtxDestroy(() CUDA error = " << status
360                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
361                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
362         exit(-1);
363     }
364 }
365 
366 int CudaBackend::getMaxComputeUnits() {
367     std::cout << "getMaxComputeUnits()" << std::endl;
368     int value = 1;
369     return value;
370 }
371 
372 void CudaBackend::info() {
373     char name[100];
374     cuDeviceGetName(name, sizeof(name), device);
375     std::cout << "> Using device 0: " << name << std::endl;
376 
377     // get compute capabilities and the devicename
378     int major = 0, minor = 0;
379     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
380     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
381     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
382 
383     int warpSize;
384     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
385     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
386 
387     int threadsPerBlock;
388     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
389     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
390 
391     int cores;
392     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
393     std::cout << "> GPU Cores " << cores << std::endl;
394 
395     size_t totalGlobalMem;
396     cuDeviceTotalMem(&totalGlobalMem, device);
397     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
398     std::cout << "  64-bit Memory Address:           " <<
399               ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
400 
401 }
402 
403 long CudaBackend::compileProgram(int len, char *source) {
404     Ptx *ptx = Ptx::nvcc(source, len);
405     CUmodule module;
406     std::cout << "inside compileProgram" << std::endl;
407     std::cout << "cuda " << source << std::endl;
408     if (ptx->text != nullptr) {
409         std::cout << "ptx " << ptx->text << std::endl;
410 
411         // in this branch we use compilation with parameters
412         const unsigned int jitNumOptions = 2;
413         auto jitOptions = new CUjit_option[jitNumOptions];
414         void **jitOptVals = new void *[jitNumOptions];
415 
416         // set up size of compilation log buffer
417         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
418         int jitLogBufferSize = 8192;
419         jitOptVals[0] = (void *) (size_t) jitLogBufferSize;
420 
421         // set up pointer to the compilation log buffer
422         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
423         char *jitLogBuffer = new char[jitLogBufferSize];
424         jitOptVals[1] = jitLogBuffer;
425         int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals);
426 
427         printf("> PTX JIT log:\n%s\n", jitLogBuffer);
428         return reinterpret_cast<long>(new CudaProgram(this, nullptr, ptx, module));
429 
430         //delete ptx;
431     } else {
432         std::cout << "no ptx content!" << std::endl;
433         exit(1);
434     }
435 }
436 
437 long getCudaBackend(int mode) {
438     long backendHandle= reinterpret_cast<long>(new CudaBackend(mode);
439     std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
440     return backendHandle;
441 }
442 
443 
444