/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <sys/wait.h>
#include <unistd.h>    // fork/execvp
#include <cstring>     // strdup
#include <chrono>
#include <fstream>
#include <iostream>
#include <sstream>
#include <cuda_runtime_api.h>
#include "cuda_backend.h"

Ptx::Ptx(size_t len)
        : len(len), text(len > 0 ? new char[len] : nullptr) {
    std::cout << "in Ptx with buffer allocated " << len << std::endl;
}

Ptx::~Ptx() {
    if (len > 0 && text != nullptr) {
        std::cout << "in ~Ptx deleting allocated " << len << std::endl;
        delete[] text;
    }
}

uint64_t timeSinceEpochMillisec() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

// Compile CUDA source to PTX by forking and exec'ing nvcc against temp files.
Ptx *Ptx::nvcc(const char *cudaSource, size_t len) {
    Ptx *ptx = nullptr;
    uint64_t time = timeSinceEpochMillisec();
    std::stringstream timestampPtx;
    timestampPtx << "./tmp" << time << ".ptx";
    const char *ptxPath = strdup(timestampPtx.str().c_str());
    // std::cout << "ptx " << ptxPath << std::endl;
    // we are going to fork/exec nvcc
    int pid;
    if ((pid = fork()) == 0) {
        std::ofstream cuda;
        std::stringstream timestampCuda;
        timestampCuda << "./tmp" << time << ".cu";
        const char *cudaPath = strdup(timestampCuda.str().c_str());
        std::cout << "cuda " << cudaPath << std::endl;
        cuda.open(cudaPath, std::ofstream::trunc);
        cuda.write(cudaSource, len);
        cuda.close();
        const char *path = "/usr/bin/nvcc";
        //const char *path = "/usr/local/cuda-12.2/bin/nvcc";
        const char *argv[]{"nvcc", "-ptx", cudaPath, "-o", ptxPath, nullptr};
        // We can't free cudaPath or ptxPath in the child because exec needs them;
        // no problem though, because we get a new process image so they are released to the OS.
        execvp(path, (char *const *) argv);
        // execvp only returns on failure
        std::cerr << "execvp of nvcc failed" << std::endl;
        std::exit(1);
    } else if (pid < 0) {
        // fork failed
        std::cerr << "fork of nvcc failed" << std::endl;
        std::exit(1);
    } else {
        int status;
        // std::cerr << "fork succeeded, waiting for child" << std::endl;
        pid_t result = wait(&status);
        std::cerr << "child finished" << std::endl;
        std::ifstream ptxStream;
        ptxStream.open(ptxPath);
        // if (ptxStream.is_open()) {
        ptxStream.seekg(0, std::ios::end);
        size_t ptxLen = ptxStream.tellg();
        ptxStream.close();
        ptxStream.open(ptxPath);
        free((void *) ptxPath);
        ptxPath = nullptr;
        if (ptxLen > 0) {
            std::cerr << "ptx len " << ptxLen << std::endl;
            ptx = new Ptx(ptxLen + 1);
            std::cerr << "about to read " << ptx->len << std::endl;
            ptxStream.read(ptx->text, ptx->len);
            ptxStream.close();
            std::cerr << "finished reading " << ptx->len << std::endl;
            ptx->text[ptx->len - 1] = '\0';
            std::cerr << "read text " << ptx->text << std::endl;
        } else {
            std::cerr << "no ptx! ptxLen == 0?" << std::endl;
            std::exit(1);
        }
        // } else {
        //     std::cerr << "no ptx!";
        //     exit(1);
        // }
    }
    std::cout << "returning PTX" << std::endl;
    return ptx;
}
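/*
 * A minimal sketch of how Ptx::nvcc is expected to be driven (the kernel
 * source string below is hypothetical, not taken from this file): the source
 * is written to a temp .cu file, nvcc is fork/exec'ed to produce a temp .ptx
 * file, and the PTX text is read back into a heap-allocated Ptx.
 *
 *     const char *src =
 *         "extern \"C\" __global__ void inc(int *a){ a[threadIdx.x]++; }";
 *     Ptx *ptx = Ptx::nvcc(src, strlen(src));
 *     if (ptx != nullptr && ptx->text != nullptr) {
 *         std::cout << ptx->text << std::endl;   // dump the generated PTX
 *     }
 *     delete ptx;
 */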
/*
 * http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp
 */
CudaBackend::CudaProgram::CudaKernel::CudaBuffer::CudaBuffer(Backend::Program::Kernel *kernel, Arg_s *arg)
        : Buffer(kernel, arg), devicePtr() {
    /*
     *  (void *) arg->value.buffer.memorySegment,
     *  (size_t) arg->value.buffer.sizeInBytes);
     */
    // std::cout << "cuMemAlloc()" << std::endl;
    CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemAlloc() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    // std::cout << "devptr " << std::hex << (long) devicePtr << std::dec << std::endl;
    arg->value.buffer.vendorPtr = static_cast<void *>(this);
}

CudaBackend::CudaProgram::CudaKernel::CudaBuffer::~CudaBuffer() {
    // std::cout << "cuMemFree()"
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    CUresult status = cuMemFree(devicePtr);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemFree() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    arg->value.buffer.vendorPtr = nullptr;
}

void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyToDevice() {
    auto cudaKernel = dynamic_cast<CudaKernel *>(kernel);
    // std::cout << "copyToDevice() 0x" << std::hex << arg->value.buffer.sizeInBytes << std::dec
    //           << " " << arg->value.buffer.sizeInBytes << " "
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment,
                                        arg->value.buffer.sizeInBytes, cudaKernel->cudaStream);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaKernel->cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
}

void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyFromDevice() {
    auto cudaKernel = dynamic_cast<CudaKernel *>(kernel);
    // std::cout << "copyFromDevice() 0x" << std::hex << arg->value.buffer.sizeInBytes << std::dec
    //           << " " << arg->value.buffer.sizeInBytes << " "
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    CUresult status = cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr,
                                        arg->value.buffer.sizeInBytes, cudaKernel->cudaStream);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemcpyDtoHAsync() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    cudaError_t t1 = cudaStreamSynchronize(cudaKernel->cudaStream);
    if (cudaSuccess != t1) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << t1
                  << " " << cudaGetErrorString(t1)
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
}
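/*
 * Note that the buffer copy paths above mix the CUDA driver API (cuMemAlloc,
 * cuMemcpyHtoDAsync, cuMemcpyDtoHAsync, returning CUresult) with the runtime
 * API (cudaStreamSynchronize, returning cudaError_t). The casts between the
 * two error types rely on CUDA_SUCCESS and cudaSuccess both being 0; for
 * non-zero codes the message from cudaGetErrorString may not describe the
 * driver error exactly (cuGetErrorString is the driver-side equivalent).
 */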
CudaBackend::CudaProgram::CudaKernel::CudaKernel(Backend::Program *program, char *name, CUfunction function)
        : Backend::Program::Kernel(program, name), function(function), cudaStream() {
}

CudaBackend::CudaProgram::CudaKernel::~CudaKernel() = default;

long CudaBackend::CudaProgram::CudaKernel::ndrange(void *argArray) {
    // std::cout << "ndrange(" << range << ") " << name << std::endl;

    cudaStreamCreate(&cudaStream);
    ArgSled argSled(static_cast<ArgArray_s *>(argArray));
    // Schema::dumpSled(std::cout, argArray);
    void *argslist[argSled.argc()];
    NDRange *ndrange = nullptr;
#ifdef VERBOSE
    std::cerr << "there are " << argSled.argc() << " args" << std::endl;
#endif
    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        switch (arg->variant) {
            case '&': {
                if (arg->idx == 0) {
                    ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
                }
                auto cudaBuffer = new CudaBuffer(this, arg);
                cudaBuffer->copyToDevice();
                argslist[arg->idx] = static_cast<void *>(&cudaBuffer->devicePtr);
                break;
            }
            case 'I':
            case 'F':
            case 'J':
            case 'D':
            case 'C':
            case 'S': {
                argslist[arg->idx] = static_cast<void *>(&arg->value);
                break;
            }
            default: {
                std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
                break;
            }
        }
    }

    if (ndrange == nullptr) {
        // arg 0 is expected to carry the NDRange buffer; without it we cannot size the grid
        std::cerr << "ndrange() did not receive an NDRange buffer at idx 0" << std::endl;
        exit(-1);
    }
    int range = ndrange->maxX;
    int rangediv1024 = range / 1024;
    int rangemod1024 = range % 1024;
    if (rangemod1024 > 0) {
        rangediv1024++;
    }
    // std::cout << "Running the kernel..." << std::endl;
    // std::cout << "   Requested range   = " << range << std::endl;
    // std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
    auto status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    status = cuLaunchKernel(function,
                            rangediv1024, 1, 1,
                            1024, 1, 1,
                            0, cudaStream,
                            argslist, nullptr);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuLaunchKernel() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    // std::cout << "Kernel complete..." << cudaGetErrorString(t) << std::endl;

    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        if (arg->variant == '&') {
            static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice();
        }
    }
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        if (arg->variant == '&') {
            delete static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr);
            arg->value.buffer.vendorPtr = nullptr;
        }
    }
    cudaStreamDestroy(cudaStream);
    return (long) 0;
}
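/*
 * Worked example of the grid sizing in ndrange() above: for range = 1500,
 * rangediv1024 = 1 and rangemod1024 = 476, so the block count is bumped to 2
 * and cuLaunchKernel() runs 2 blocks of 1024 threads = 2048 threads. The 548
 * surplus threads are presumably expected to bounds-check against the NDRange
 * inside the generated kernel (this file does not show the kernel side).
 */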
CudaBackend::CudaProgram::CudaProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module)
        : Backend::Program(backend, buildInfo), ptx(ptx), module(module) {
}

CudaBackend::CudaProgram::~CudaProgram() = default;

long CudaBackend::CudaProgram::getKernel(int nameLen, char *name) {
    CUfunction function;
    CUresult status = cuModuleGetFunction(&function, module, name);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuModuleGetFunction() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    long kernelHandle = reinterpret_cast<long>(new CudaKernel(this, name, function));
    return kernelHandle;
}

bool CudaBackend::CudaProgram::programOK() {
    return true;
}

CudaBackend::CudaBackend(int mode)
        : Backend(mode), device(), context() {
    // std::cout << "CudaBackend constructor " << ((cudaConfig == nullptr) ? "cudaConfig == null" : "got cudaConfig")
    //           << std::endl;
    int deviceCount = 0;
    CUresult err = cuInit(0);
    if (err == CUDA_SUCCESS) {
        cuDeviceGetCount(&deviceCount);
        std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        cuDeviceGet(&device, 0);
        std::cout << "CudaBackend device ok" << std::endl;
        cuCtxCreate(&context, 0, device);
        std::cout << "CudaBackend context created ok" << std::endl;
    } else {
        std::cout << "CudaBackend failed, we seem to have the runtime library but no device, no context, nada "
                  << std::endl;
        exit(1);
    }
}
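/*
 * Everything handed across the native boundary in this file (the backend from
 * getCudaBackend(), programs from compileProgram(), kernels from getKernel())
 * travels as a raw pointer reinterpret_cast to long. The assumption, based on
 * this file alone, is that the caller passes each handle back unchanged and
 * is responsible for triggering deletion of the object behind it.
 */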
"cudaConfig== null" : "got cudaConfig") 334 // << std::endl; 335 int deviceCount = 0; 336 CUresult err = cuInit(0); 337 if (err == CUDA_SUCCESS) { 338 cuDeviceGetCount(&deviceCount); 339 std::cout << "CudaBackend device count" << std::endl; 340 cuDeviceGet(&device, 0); 341 std::cout << "CudaBackend device ok" << std::endl; 342 cuCtxCreate(&context, 0, device); 343 std::cout << "CudaBackend context created ok" << std::endl; 344 } else { 345 std::cout << "CudaBackend failed, we seem to have the runtime library but no device, no context, nada " 346 << std::endl; 347 exit(1); 348 } 349 } 350 351 //CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) { 352 // 353 //} 354 355 CudaBackend::~CudaBackend() { 356 std::cout << "freeing context" << std::endl; 357 CUresult status = cuCtxDestroy(context); 358 if (CUDA_SUCCESS != status) { 359 std::cerr << "cuCtxDestroy(() CUDA error = " << status 360 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 361 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 362 exit(-1); 363 } 364 } 365 366 int CudaBackend::getMaxComputeUnits() { 367 std::cout << "getMaxComputeUnits()" << std::endl; 368 int value = 1; 369 return value; 370 } 371 372 void CudaBackend::info() { 373 char name[100]; 374 cuDeviceGetName(name, sizeof(name), device); 375 std::cout << "> Using device 0: " << name << std::endl; 376 377 // get compute capabilities and the devicename 378 int major = 0, minor = 0; 379 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 380 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 381 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 382 383 int warpSize; 384 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); 385 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 386 387 int threadsPerBlock; 388 cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); 389 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 390 391 int cores; 392 cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); 393 std::cout << "> GPU Cores " << cores << std::endl; 394 395 size_t totalGlobalMem; 396 cuDeviceTotalMem(&totalGlobalMem, device); 397 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 398 std::cout << " 64-bit Memory Address: " << 399 ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? 
"YES" : "NO") << std::endl; 400 401 } 402 403 long CudaBackend::compileProgram(int len, char *source) { 404 Ptx *ptx = Ptx::nvcc(source, len); 405 CUmodule module; 406 std::cout << "inside compileProgram" << std::endl; 407 std::cout << "cuda " << source << std::endl; 408 if (ptx->text != nullptr) { 409 std::cout << "ptx " << ptx->text << std::endl; 410 411 // in this branch we use compilation with parameters 412 const unsigned int jitNumOptions = 2; 413 auto jitOptions = new CUjit_option[jitNumOptions]; 414 void **jitOptVals = new void *[jitNumOptions]; 415 416 // set up size of compilation log buffer 417 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 418 int jitLogBufferSize = 8192; 419 jitOptVals[0] = (void *) (size_t) jitLogBufferSize; 420 421 // set up pointer to the compilation log buffer 422 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 423 char *jitLogBuffer = new char[jitLogBufferSize]; 424 jitOptVals[1] = jitLogBuffer; 425 int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals); 426 427 printf("> PTX JIT log:\n%s\n", jitLogBuffer); 428 return reinterpret_cast<long>(new CudaProgram(this, nullptr, ptx, module)); 429 430 //delete ptx; 431 } else { 432 std::cout << "no ptx content!" << std::endl; 433 exit(1); 434 } 435 } 436 437 long getCudaBackend(int mode) { 438 long backendHandle= reinterpret_cast<long>(new CudaBackend(mode); 439 std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 440 return backendHandle; 441 } 442 443 444