New hat/backends/ffi/cuda/cpp/cuda

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include <cuda_runtime_api.h>
 29 #include "cuda_backend.h"
 30 
 31 Ptx::Ptx(size_t len)
 32         : len(len), text(len > 0 ? new char[len] : nullptr) {
 33     std::cout << "in Ptx with buffer allocated "<<len << std::endl;
 34 }
 35 
 36 Ptx::~Ptx() {
 37     if (len > 0 && text != nullptr) {
 38         std::cout << "in ~Ptx with deleting allocated "<<len << std::endl;
 39         delete[] text;
 40     }
 41 }
 42 
 43 uint64_t timeSinceEpochMillisec() {
 44     using namespace std::chrono;
 45     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 46 }
 47 
 48 Ptx *Ptx::nvcc(const char *cudaSource, size_t len) {
 49     Ptx *ptx = nullptr;
 50     uint64_t time = timeSinceEpochMillisec();
 51     std::stringstream timestampPtx;
 52     timestampPtx << "./tmp" << time << ".ptx";
 53     const char *ptxPath = strdup(timestampPtx.str().c_str());
 54    // std::cout << "ptx " << ptxPath << std::endl;
 55     // we are going to fork exec nvcc
 56     int pid;
 57     if ((pid = fork()) == 0) {
 58         std::ofstream cuda;
 59         std::stringstream timestampCuda;
 60         timestampCuda << "./tmp" << time << ".cu";
 61         const char *cudaPath = strdup(timestampCuda.str().c_str());
 62         std::cout << "cuda " << cudaPath << std::endl;
 63         cuda.open(cudaPath, std::ofstream::trunc);
 64         cuda.write(cudaSource, len);
 65         cuda.close();
 66         const char *path = "/usr/bin/nvcc";
 67         //const char *path = "/usr/local/cuda-12.2/bin/nvcc";
 68         const char *argv[]{"nvcc", "-ptx", cudaPath, "-o", ptxPath, nullptr};
 69         // we can't free cudaPath or ptxpath in child because we need them in exec, no prob through
 70         // because we get a new proc so they are released to os
 71         execvp(path, (char *const *) argv);
 72 
 73     } else if (pid < 0) {
 74         // fork failed.
 75         std::cerr << "fork of nvcc failed" << std::endl;
 76         std::exit(1);
 77     } else {
 78         int status;
 79      //   std::cerr << "fork suceeded waiting for child" << std::endl;
 80         pid_t result = wait(&status);
 81         std::cerr << "child finished" << std::endl;
 82         std::ifstream ptxStream;
 83         ptxStream.open(ptxPath);
 84       //  if (ptxStream.is_open()) {
 85             ptxStream.seekg(0, std::ios::end);
 86             size_t ptxLen = ptxStream.tellg();
 87             ptxStream.close();
 88             ptxStream.open(ptxPath);
 89             free((void *) ptxPath);
 90             ptxPath = nullptr;
 91             if (ptxLen > 0) {
 92                 std::cerr << "ptx len "<< ptxLen << std::endl;
 93                 ptx = new Ptx(ptxLen + 1);
 94                 std::cerr << "about to read  "<< ptx->len << std::endl;
 95                 ptxStream.read(ptx->text, ptx->len);
 96                 ptxStream.close();
 97                 std::cerr << "about to read  "<< ptx->len << std::endl;
 98                 ptx->text[ptx->len - 1] = '\0';
 99                 std::cerr << "read text "<< ptx->text << std::endl;
100 
101             } else {
102                 std::cerr << "no ptx! ptxLen == 0?";
103                 exit(1);
104             }
105       //  }else{
106         //    std::cerr << "no ptx!";
107        //     exit(1);
108       //  }
109     }
110     std::cout << "returning PTX" << std::endl;
111     return ptx;
112 }
113 
114 /*
115 //http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp
116  */
117 CudaBackend::CudaProgram::CudaKernel::CudaBuffer::CudaBuffer(Backend::Program::Kernel *kernel, Arg_s *arg)
118         : Buffer(kernel, arg), devicePtr() {
119     /*
120      *   (void *) arg->value.buffer.memorySegment,
121      *   (size_t) arg->value.buffer.sizeInBytes);
122      */
123   //  std::cout << "cuMemAlloc()" << std::endl;
124     CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes);
125     if (CUDA_SUCCESS != status) {
126         std::cerr << "cuMemFree() CUDA error = " << status
127                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
128                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
129         exit(-1);
130     }
131   //  std::cout << "devptr " << std::hex<<  (long)devicePtr <<std::dec <<std::endl;
132     arg->value.buffer.vendorPtr = static_cast<void *>(this);
133 }
134 
135 CudaBackend::CudaProgram::CudaKernel::CudaBuffer::~CudaBuffer() {
136 
137  //   std::cout << "cuMemFree()"
138   //          << "devptr " << std::hex<<  (long)devicePtr <<std::dec
139    //         << std::endl;
140     CUresult  status = cuMemFree(devicePtr);
141     if (CUDA_SUCCESS != status) {
142         std::cerr << "cuMemFree() CUDA error = " << status
143                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
144                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
145         exit(-1);
146     }
147     arg->value.buffer.vendorPtr = nullptr;
148 }
149 
150 void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyToDevice() {
151     auto cudaKernel = dynamic_cast<CudaKernel*>(kernel);
152  //   std::cout << "copyToDevice() 0x"   << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
153  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
154  //             << std::endl;
155     char *ptr = (char*)arg->value.buffer.memorySegment;
156 
157     unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
158     unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
159 
160     if (ifacefacade1 != 0x1face00000facadeL && ifacefacade1 != ifacefacade2) {
161         std::cerr<<"End of buf marker before HtoD"<< std::hex << ifacefacade1 << ifacefacade2<< " buffer corrupt !" <<std::endl
162                 <<" " << __FILE__ << " line " << __LINE__ << std::endl;
163         exit(-1);
164     }
165 
166 
167     CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment, arg->value.buffer.sizeInBytes,cudaKernel->cudaStream);
168     if (CUDA_SUCCESS != status) {
169         std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status
170                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
171                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
172         exit(-1);
173     }
174     status = static_cast<CUresult >(cudaStreamSynchronize(cudaKernel->cudaStream));
175     if (CUDA_SUCCESS != status) {
176         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
177                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
178                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
179         exit(-1);
180     }
181 }
182 
183 void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyFromDevice() {
184     auto cudaKernel = dynamic_cast<CudaKernel*>(kernel);
185  //   std::cout << "copyFromDevice() 0x" << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
186  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
187   //            << std::endl;
188     char *ptr = (char*)arg->value.buffer.memorySegment;
189 
190     unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
191     unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
192 
193     if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
194         std::cerr<<"end of buf marker before  DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl
195                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
196         exit(-1);
197     }
198     CUresult status =cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr, arg->value.buffer.sizeInBytes,cudaKernel->cudaStream);
199     if (CUDA_SUCCESS != status) {
200         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
201                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
202                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
203         exit(-1);
204     }
205     cudaError_t t1 = cudaStreamSynchronize(cudaKernel->cudaStream);
206     if (static_cast<cudaError_t>(CUDA_SUCCESS) != t1) {
207         std::cerr << "CUDA error = " << t1
208                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(t1))
209                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
210         exit(-1);
211     }
212     ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
213     ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
214 
215     if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
216         std::cerr<<"end of buf marker after  DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl
217                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
218         exit(-1);
219     }
220 }
221 
222 CudaBackend::CudaProgram::CudaKernel::CudaKernel(Backend::Program *program,char * name, CUfunction function)
223         : Backend::Program::Kernel(program, name), function(function),cudaStream() {
224 }
225 
226 CudaBackend::CudaProgram::CudaKernel::~CudaKernel() = default;
227 
228 long CudaBackend::CudaProgram::CudaKernel::ndrange(void *argArray) {
229   //  std::cout << "ndrange(" << range << ") " << name << std::endl;
230 
231     cudaStreamCreate(&cudaStream);
232     ArgSled argSled(static_cast<ArgArray_s *>(argArray));
233  //   Schema::dumpSled(std::cout, argArray);
234     void *argslist[argSled.argc()];
235     NDRange *ndrange = nullptr;
236 #ifdef VERBOSE
237     std::cerr << "there are " << argSled.argc() << "args " << std::endl;
238 #endif
239     for (int i = 0; i < argSled.argc(); i++) {
240         Arg_s *arg = argSled.arg(i);
241         switch (arg->variant) {
242             case '&': {
243                 if (arg->idx == 0){
244                     ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
245                 }
246                 auto cudaBuffer = new CudaBuffer(this, arg);
247                 cudaBuffer->copyToDevice();
248                 argslist[arg->idx] = static_cast<void *>(&cudaBuffer->devicePtr);
249                 break;
250             }
251             case 'I':
252             case 'F':
253             case 'J':
254             case 'D':
255             case 'C':
256             case 'S': {
257                 argslist[arg->idx] = static_cast<void *>(&arg->value);
258                 break;
259             }
260             default: {
261                 std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
262                 break;
263             }
264         }
265     }
266 
267     int range = ndrange->maxX;
268     int rangediv1024 = range / 1024;
269     int rangemod1024 = range % 1024;
270     if (rangemod1024 > 0) {
271         rangediv1024++;
272     }
273    // std::cout << "Running the kernel..." << std::endl;
274   //  std::cout << "   Requested range   = " << range << std::endl;
275   //  std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
276    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
277     auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
278     if (CUDA_SUCCESS != status) {
279         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
280                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
281                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
282         exit(-1);
283     }
284 
285     status= cuLaunchKernel(function,
286                                    rangediv1024, 1, 1,
287                                    1024, 1, 1,
288                                    0, cudaStream,
289                     argslist, 0);
290     if (CUDA_SUCCESS != status) {
291         std::cerr << "cuLaunchKernel() CUDA error = " << status
292                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
293                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
294         exit(-1);
295     }
296     status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
297     if (CUDA_SUCCESS != status) {
298         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
299                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
300                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
301         exit(-1);
302     }
303 
304     //std::cout << "Kernel complete..."<<cudaGetErrorString(t)<<std::endl;
305 
306     for (int i = 0; i < argSled.argc(); i++) {
307         Arg_s *arg = argSled.arg(i);
308         if (arg->variant == '&') {
309             static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice();
310 
311         }
312     }
313     status=   static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
314     if (CUDA_SUCCESS != status) {
315         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
316                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
317                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
318         exit(-1);
319     }
320 
321     for (int i = 0; i < argSled.argc(); i++) {
322         Arg_s *arg = argSled.arg(i);
323         if (arg->variant == '&') {
324             delete static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr);
325             arg->value.buffer.vendorPtr = nullptr;
326         }
327     }
328     cudaStreamDestroy(cudaStream);
329     return (long) 0;
330 }
331 
332 
333 CudaBackend::CudaProgram::CudaProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module)
334         : Backend::Program(backend, buildInfo), ptx(ptx), module(module) {
335 }
336 
337 CudaBackend::CudaProgram::~CudaProgram() = default;
338 
339 long CudaBackend::CudaProgram::getKernel(int nameLen, char *name) {
340     CUfunction function;
341     CUresult status= cuModuleGetFunction(&function, module, name);
342     if (CUDA_SUCCESS != status) {
343         std::cerr << "cuModuleGetFunction() CUDA error = " << status
344                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
345                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
346         exit(-1);
347     }
348     long kernelHandle =  reinterpret_cast<long>(new CudaKernel(this, name, function));
349     return kernelHandle;
350 }
351 
352 bool CudaBackend::CudaProgram::programOK() {
353     return true;
354 }
355 
356 CudaBackend::CudaBackend(CudaBackend::CudaConfig *cudaConfig, int
357 configSchemaLen, char *configSchema)
358         : Backend((Backend::Config*) cudaConfig, configSchemaLen, configSchema), device(),context()  {
359   //  std::cout << "CudaBackend constructor " << ((cudaConfig == nullptr) ? "cudaConfig== null" : "got cudaConfig")
360     //          << std::endl;
361     int deviceCount = 0;
362     CUresult err = cuInit(0);
363     if (err == CUDA_SUCCESS) {
364         cuDeviceGetCount(&deviceCount);
365         std::cout << "CudaBackend device count" << std::endl;
366         cuDeviceGet(&device, 0);
367         std::cout << "CudaBackend device ok" << std::endl;
368         cuCtxCreate(&context, 0, device);
369         std::cout << "CudaBackend context created ok" << std::endl;
370     } else {
371         std::cout << "CudaBackend failed, we seem to have the runtime library but no device, no context, nada "
372                   << std::endl;
373         exit(1);
374     }
375 }
376 
377 CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) {
378 
379 }
380 
381 CudaBackend::~CudaBackend() {
382     std::cout << "freeing context" << std::endl;
383     CUresult status = cuCtxDestroy(context);
384     if (CUDA_SUCCESS != status) {
385         std::cerr << "cuCtxDestroy(() CUDA error = " << status
386                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
387                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
388         exit(-1);
389     }
390 }
391 
392 int CudaBackend::getMaxComputeUnits() {
393     std::cout << "getMaxComputeUnits()" << std::endl;
394     int value = 1;
395     return value;
396 }
397 
398 void CudaBackend::info() {
399     char name[100];
400     cuDeviceGetName(name, sizeof(name), device);
401     std::cout << "> Using device 0: " << name << std::endl;
402 
403     // get compute capabilities and the devicename
404     int major = 0, minor = 0;
405     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
406     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
407     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
408 
409     int warpSize;
410     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
411     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
412 
413     int threadsPerBlock;
414     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
415     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
416 
417     int cores;
418     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
419     std::cout << "> GPU Cores " << cores << std::endl;
420 
421     size_t totalGlobalMem;
422     cuDeviceTotalMem(&totalGlobalMem, device);
423     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
424     std::cout << "  64-bit Memory Address:           " <<
425               ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
426 
427 }
428 
429 long CudaBackend::compileProgram(int len, char *source) {
430     Ptx *ptx = Ptx::nvcc(source, len);
431     CUmodule module;
432     std::cout << "inside compileProgram" << std::endl;
433     std::cout << "cuda " << source << std::endl;
434     if (ptx->text != nullptr) {
435         std::cout << "ptx " << ptx->text << std::endl;
436 
437         // in this branch we use compilation with parameters
438         const unsigned int jitNumOptions = 2;
439         auto jitOptions = new CUjit_option[jitNumOptions];
440         void **jitOptVals = new void *[jitNumOptions];
441 
442         // set up size of compilation log buffer
443         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
444         int jitLogBufferSize = 8192;
445         jitOptVals[0] = (void *) (size_t) jitLogBufferSize;
446 
447         // set up pointer to the compilation log buffer
448         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
449         char *jitLogBuffer = new char[jitLogBufferSize];
450         jitOptVals[1] = jitLogBuffer;
451         int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals);
452 
453         printf("> PTX JIT log:\n%s\n", jitLogBuffer);
454         return reinterpret_cast<long>(new CudaProgram(this, nullptr, ptx, module));
455 
456         //delete ptx;
457     } else {
458         std::cout << "no ptx content!" << std::endl;
459         exit(1);
460     }
461 }
462 
463 long getBackend(void *config, int configSchemaLen, char *configSchema) {
464     long backendHandle= reinterpret_cast<long>(
465             new CudaBackend(static_cast<CudaBackend::CudaConfig *>(config), configSchemaLen,
466                             configSchema));
467     std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
468     return backendHandle;
469 }
470 
471 
472