New hat/backends/ptx/cpp/ptx

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include <cuda_runtime_api.h>
 29 #include "ptx_backend.h"
 30 
 31 Ptx::Ptx(size_t len)
 32         : len(len), text(len > 0 ? new char[len] : nullptr) {
 33     std::cout << "in Ptx with buffer allocated "<<len << std::endl;
 34 }
 35 
 36 Ptx::~Ptx() {
 37     if (len > 0 && text != nullptr) {
 38         std::cout << "in ~Ptx with deleting allocated "<<len << std::endl;
 39         delete[] text;
 40     }
 41 }
 42 
 43 uint64_t timeSinceEpochMillisec() {
 44     using namespace std::chrono;
 45     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 46 }
 47 
 48 Ptx *Ptx::nvcc(const char *ptxSource, size_t len) {
 49     Ptx * ptx = new Ptx(len);
 50     strcpy(ptx->text,ptxSource);
 51     return ptx;
 52 
 53 //     Ptx *ptx = nullptr;
 54 //     uint64_t time = timeSinceEpochMillisec();
 55 //     std::stringstream timestampPtx;
 56 //     timestampPtx << "./tmp" << time << ".ptx";
 57 //     const char *ptxPath = strdup(timestampPtx.str().c_str());
 58 //    // std::cout << "ptx " << ptxPath << std::endl;
 59 //     // we are going to fork exec nvcc
 60 //     int pid;
 61 //     if ((pid = fork()) == 0) {
 62 //         std::ofstream ptx;
 63 //         std::stringstream timestampPtx;
 64 //         timestampPtx << "./tmp" << time << ".cu";
 65 //         const char *ptxPath = strdup(timestampPtx.str().c_str());
 66 //         std::cout << "ptx " << ptxPath << std::endl;
 67 //         ptx.open(ptxPath, std::ofstream::trunc);
 68 //         ptx.write(ptxSource, len);
 69 //         ptx.close();
 70 //         const char *path = "/usr/bin/nvcc";
 71 //         //const char *path = "/usr/local/cuda-12.2/bin/nvcc";
 72 //         const char *argv[]{"nvcc", "-ptx", ptxPath, "-o", ptxPath, nullptr};
 73 //         // we can't free ptxPath or ptxpath in child because we need them in exec, no prob through
 74 //         // because we get a new proc so they are released to os
 75 //         execvp(path, (char *const *) argv);
 76 
 77 //     } else if (pid < 0) {
 78 //         // fork failed.
 79 //         std::cerr << "fork of nvcc failed" << std::endl;
 80 //         std::exit(1);
 81 //     } else {
 82 //         int status;
 83 //      //   std::cerr << "fork suceeded waiting for child" << std::endl;
 84 //         pid_t result = wait(&status);
 85 //         std::cerr << "child finished" << std::endl;
 86 //         std::ifstream ptxStream;
 87 //         ptxStream.open(ptxPath);
 88 //       //  if (ptxStream.is_open()) {
 89 //             ptxStream.seekg(0, std::ios::end);
 90 //             size_t ptxLen = ptxStream.tellg();
 91 //             ptxStream.close();
 92 //             ptxStream.open(ptxPath);
 93 //             free((void *) ptxPath);
 94 //             ptxPath = nullptr;
 95 //             if (ptxLen > 0) {
 96 //                 std::cerr << "ptx len "<< ptxLen << std::endl;
 97 //                 ptx = new Ptx(ptxLen + 1);
 98 //                 std::cerr << "about to read  "<< ptx->len << std::endl;
 99 //                 ptxStream.read(ptx->text, ptx->len);
100 //                 ptxStream.close();
101 //                 std::cerr << "about to read  "<< ptx->len << std::endl;
102 //                 ptx->text[ptx->len - 1] = '\0';
103 //                 std::cerr << "read text "<< ptx->text << std::endl;
104 
105 //             } else {
106 //                 std::cerr << "no ptx! ptxLen == 0?";
107 //                 exit(1);
108 //             }
109 //       //  }else{
110 //         //    std::cerr << "no ptx!";
111 //        //     exit(1);
112 //       //  }
113 //     }
114 //     std::cout << "returning PTX" << std::endl;
115 //     return ptx;
116 }
117 
118 /*
119 //http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp
120  */
121 PtxBackend::PtxProgram::PtxKernel::PtxBuffer::PtxBuffer(Backend::Program::Kernel *kernel, Arg_s *arg)
122         : Buffer(kernel, arg), devicePtr() {
123     /*
124      *   (void *) arg->value.buffer.memorySegment,
125      *   (size_t) arg->value.buffer.sizeInBytes);
126      */
127   //  std::cout << "cuMemAlloc()" << std::endl;
128     CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes);
129     if (CUDA_SUCCESS != status) {
130         std::cerr << "cuMemFree() CUDA error = " << status
131                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
132                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
133         exit(-1);
134     }
135   //  std::cout << "devptr " << std::hex<<  (long)devicePtr <<std::dec <<std::endl;
136     arg->value.buffer.vendorPtr = static_cast<void *>(this);
137 }
138 
139 PtxBackend::PtxProgram::PtxKernel::PtxBuffer::~PtxBuffer() {
140 
141  //   std::cout << "cuMemFree()"
142   //          << "devptr " << std::hex<<  (long)devicePtr <<std::dec
143    //         << std::endl;
144     CUresult  status = cuMemFree(devicePtr);
145     if (CUDA_SUCCESS != status) {
146         std::cerr << "cuMemFree() CUDA error = " << status
147                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
148                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
149         exit(-1);
150     }
151     arg->value.buffer.vendorPtr = nullptr;
152 }
153 
154 void PtxBackend::PtxProgram::PtxKernel::PtxBuffer::copyToDevice() {
155     auto ptxKernel = dynamic_cast<PtxKernel*>(kernel);
156  //   std::cout << "copyToDevice() 0x"   << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
157  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
158  //             << std::endl;
159     char *ptr = (char*)arg->value.buffer.memorySegment;
160 
161     unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
162     unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
163 
164     if (ifacefacade1 != 0x1face00000facadeL && ifacefacade1 != ifacefacade2) {
165         std::cerr<<"End of buf marker before HtoD"<< std::hex << ifacefacade1 << ifacefacade2<< " buffer corrupt !" <<std::endl
166                 <<" " << __FILE__ << " line " << __LINE__ << std::endl;
167         exit(-1);
168     }
169 
170 
171     CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment, arg->value.buffer.sizeInBytes,ptxKernel->cudaStream);
172     if (CUDA_SUCCESS != status) {
173         std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status
174                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
175                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
176         exit(-1);
177     }
178     status = static_cast<CUresult >(cudaStreamSynchronize(ptxKernel->cudaStream));
179     if (CUDA_SUCCESS != status) {
180         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
181                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
182                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
183         exit(-1);
184     }
185 }
186 
187 void PtxBackend::PtxProgram::PtxKernel::PtxBuffer::copyFromDevice() {
188     auto ptxKernel = dynamic_cast<PtxKernel*>(kernel);
189  //   std::cout << "copyFromDevice() 0x" << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " "
190  //             << "devptr " << std::hex<<  (long)devicePtr <<std::dec
191   //            << std::endl;
192     char *ptr = (char*)arg->value.buffer.memorySegment;
193 
194     unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
195     unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
196 
197     if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
198         std::cerr<<"end of buf marker before  DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl
199                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
200         exit(-1);
201     }
202     CUresult status =cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr, arg->value.buffer.sizeInBytes,ptxKernel->cudaStream);
203     if (CUDA_SUCCESS != status) {
204         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
205                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
206                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
207         exit(-1);
208     }
209     cudaError_t t1 = cudaStreamSynchronize(ptxKernel->cudaStream);
210     if (static_cast<cudaError_t>(CUDA_SUCCESS) != t1) {
211         std::cerr << "CUDA error = " << t1
212                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(t1))
213                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
214         exit(-1);
215     }
216     ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16);
217     ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8);
218 
219     if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
220         std::cerr<<"end of buf marker after  DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl
221                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
222         exit(-1);
223     }
224 }
225 
226 PtxBackend::PtxProgram::PtxKernel::PtxKernel(Backend::Program *program,char * name, CUfunction function)
227         : Backend::Program::Kernel(program, name), function(function),cudaStream() {
228 }
229 
230 PtxBackend::PtxProgram::PtxKernel::~PtxKernel() = default;
231 
232 long PtxBackend::PtxProgram::PtxKernel::ndrange(void *argArray) {
233   //  std::cout << "ndrange(" << range << ") " << name << std::endl;
234 
235     cudaStreamCreate(&cudaStream);
236     ArgSled argSled(static_cast<ArgArray_s *>(argArray));
237  //   Schema::dumpSled(std::cout, argArray);
238     void *argslist[argSled.argc()];
239     NDRange *ndrange = nullptr;
240 #ifdef VERBOSE
241     std::cerr << "there are " << argSled.argc() << "args " << std::endl;
242 #endif
243     for (int i = 0; i < argSled.argc(); i++) {
244         Arg_s *arg = argSled.arg(i);
245         switch (arg->variant) {
246             case '&': {
247                 if (arg->idx == 0){
248                     ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
249                 }
250                 auto ptxBuffer = new PtxBuffer(this, arg);
251                 ptxBuffer->copyToDevice();
252                 argslist[arg->idx] = static_cast<void *>(&ptxBuffer->devicePtr);
253                 break;
254             }
255             case 'I':
256             case 'F':
257             case 'J':
258             case 'D':
259             case 'C':
260             case 'S': {
261                 argslist[arg->idx] = static_cast<void *>(&arg->value);
262                 break;
263             }
264             default: {
265                 std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
266                 break;
267             }
268         }
269     }
270 
271     int range = ndrange->maxX;
272     int rangediv1024 = range / 1024;
273     int rangemod1024 = range % 1024;
274     if (rangemod1024 > 0) {
275         rangediv1024++;
276     }
277    // std::cout << "Running the kernel..." << std::endl;
278   //  std::cout << "   Requested range   = " << range << std::endl;
279   //  std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
280    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
281     auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
282     if (CUDA_SUCCESS != status) {
283         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
284                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
285                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
286         exit(-1);
287     }
288 
289     status= cuLaunchKernel(function,
290                                    rangediv1024, 1, 1,
291                                    1024, 1, 1,
292                                    0, cudaStream,
293                     argslist, 0);
294     if (CUDA_SUCCESS != status) {
295         std::cerr << "cuLaunchKernel() CUDA error = " << status
296                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
297                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
298         exit(-1);
299     }
300     status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
301     if (CUDA_SUCCESS != status) {
302         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
303                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
304                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
305         exit(-1);
306     }
307 
308     //std::cout << "Kernel complete..."<<cudaGetErrorString(t)<<std::endl;
309 
310     for (int i = 0; i < argSled.argc(); i++) {
311         Arg_s *arg = argSled.arg(i);
312         if (arg->variant == '&') {
313             static_cast<PtxBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice();
314 
315         }
316     }
317     status=   static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
318     if (CUDA_SUCCESS != status) {
319         std::cerr << "cudaStreamSynchronize() CUDA error = " << status
320                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
321                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
322         exit(-1);
323     }
324 
325     for (int i = 0; i < argSled.argc(); i++) {
326         Arg_s *arg = argSled.arg(i);
327         if (arg->variant == '&') {
328             delete static_cast<PtxBuffer *>(arg->value.buffer.vendorPtr);
329             arg->value.buffer.vendorPtr = nullptr;
330         }
331     }
332     cudaStreamDestroy(cudaStream);
333     return (long) 0;
334 }
335 
336 
337 PtxBackend::PtxProgram::PtxProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module)
338         : Backend::Program(backend, buildInfo), ptx(ptx), module(module) {
339 }
340 
341 PtxBackend::PtxProgram::~PtxProgram() = default;
342 
343 long PtxBackend::PtxProgram::getKernel(int nameLen, char *name) {
344     CUfunction function;
345     CUresult status= cuModuleGetFunction(&function, module, name);
346     if (CUDA_SUCCESS != status) {
347         std::cerr << "cuModuleGetFunction() CUDA error = " << status
348                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
349                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
350         exit(-1);
351     }
352     long kernelHandle =  reinterpret_cast<long>(new PtxKernel(this, name, function));
353     return kernelHandle;
354 }
355 
356 bool PtxBackend::PtxProgram::programOK() {
357     return true;
358 }
359 
360 PtxBackend::PtxBackend(PtxBackend::PtxConfig *ptxConfig, int
361 configSchemaLen, char *configSchema)
362         : Backend((Backend::Config*) ptxConfig, configSchemaLen, configSchema), device(),context()  {
363   //  std::cout << "PtxBackend constructor " << ((ptxConfig == nullptr) ? "ptxConfig== null" : "got ptxConfig")
364     //          << std::endl;
365     int deviceCount = 0;
366     CUresult err = cuInit(0);
367     if (err == CUDA_SUCCESS) {
368         cuDeviceGetCount(&deviceCount);
369         std::cout << "PtxBackend device count" << std::endl;
370         cuDeviceGet(&device, 0);
371         std::cout << "PtxBackend device ok" << std::endl;
372         cuCtxCreate(&context, 0, device);
373         std::cout << "PtxBackend context created ok" << std::endl;
374     } else {
375         std::cout << "PtxBackend failed, we seem to have the runtime library but no device, no context, nada "
376                   << std::endl;
377         exit(1);
378     }
379 }
380 
381 PtxBackend::PtxBackend() : PtxBackend(nullptr, 0, nullptr) {
382 
383 }
384 
385 PtxBackend::~PtxBackend() {
386     std::cout << "freeing context" << std::endl;
387     CUresult status = cuCtxDestroy(context);
388     if (CUDA_SUCCESS != status) {
389         std::cerr << "cuCtxDestroy(() CUDA error = " << status
390                   <<" " << cudaGetErrorString(static_cast<cudaError_t>(status))
391                   <<" " << __FILE__ << " line " << __LINE__ << std::endl;
392         exit(-1);
393     }
394 }
395 
396 int PtxBackend::getMaxComputeUnits() {
397     std::cout << "getMaxComputeUnits()" << std::endl;
398     int value = 1;
399     return value;
400 }
401 
402 void PtxBackend::info() {
403     char name[100];
404     cuDeviceGetName(name, sizeof(name), device);
405     std::cout << "> Using device 0: " << name << std::endl;
406 
407     // get compute capabilities and the devicename
408     int major = 0, minor = 0;
409     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
410     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
411     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
412 
413     int warpSize;
414     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
415     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
416 
417     int threadsPerBlock;
418     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
419     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
420 
421     int cores;
422     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
423     std::cout << "> GPU Cores " << cores << std::endl;
424 
425     size_t totalGlobalMem;
426     cuDeviceTotalMem(&totalGlobalMem, device);
427     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
428     std::cout << "  64-bit Memory Address:           " <<
429               ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
430 
431 }
432 
433 long PtxBackend::compileProgram(int len, char *source) {
434     Ptx *ptx = Ptx::nvcc(source, len);
435     CUmodule module;
436     std::cout << "inside compileProgram" << std::endl;
437     std::cout << "ptx " << source << std::endl;
438     if (ptx->text != nullptr) {
439         std::cout << "ptx " << ptx->text << std::endl;
440 
441         // in this branch we use compilation with parameters
442         const unsigned int jitNumOptions = 2;
443         auto jitOptions = new CUjit_option[jitNumOptions];
444         void **jitOptVals = new void *[jitNumOptions];
445 
446         // set up size of compilation log buffer
447         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
448         int jitLogBufferSize = 8192;
449         jitOptVals[0] = (void *) (size_t) jitLogBufferSize;
450 
451         // set up pointer to the compilation log buffer
452         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
453         char *jitLogBuffer = new char[jitLogBufferSize];
454         jitOptVals[1] = jitLogBuffer;
455         int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals);
456 
457         printf("> PTX JIT log:\n%s\n", jitLogBuffer);
458         printf("status: %d\n", status);
459         return reinterpret_cast<long>(new PtxProgram(this, nullptr, ptx, module));
460 
461         //delete ptx;
462     } else {
463         std::cout << "no ptx content!" << std::endl;
464         exit(1);
465     }
466 }
467 
468 long getBackend(void *config, int configSchemaLen, char *configSchema) {
469     long backendHandle= reinterpret_cast<long>(
470             new PtxBackend(static_cast<PtxBackend::PtxConfig *>(config), configSchemaLen,
471                             configSchema));
472     std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
473     return backendHandle;
474 }
475 
476 
477