/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <sys/wait.h>
#include <unistd.h>

#include <chrono>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>

#include <cuda_runtime_api.h>
#include "cuda_backend.h"

Ptx::Ptx(size_t len)
        : len(len), text(len > 0 ? new char[len] : nullptr) {
    std::cout << "in Ptx with buffer allocated " << len << std::endl;
}

Ptx::~Ptx() {
    if (len > 0 && text != nullptr) {
        std::cout << "in ~Ptx deleting allocated " << len << std::endl;
        delete[] text;
    }
}

uint64_t timeSinceEpochMillisec() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

Ptx *Ptx::nvcc(const char *cudaSource, size_t len) {
    Ptx *ptx = nullptr;
    uint64_t time = timeSinceEpochMillisec();
    std::stringstream timestampPtx;
    timestampPtx << "./tmp" << time << ".ptx";
    const char *ptxPath = strdup(timestampPtx.str().c_str());
    // std::cout << "ptx " << ptxPath << std::endl;
    // we are going to fork/exec nvcc
    int pid;
    if ((pid = fork()) == 0) {
        std::ofstream cuda;
        std::stringstream timestampCuda;
        timestampCuda << "./tmp" << time << ".cu";
        const char *cudaPath = strdup(timestampCuda.str().c_str());
        std::cout << "cuda " << cudaPath << std::endl;
        cuda.open(cudaPath, std::ofstream::trunc);
        cuda.write(cudaSource, len);
        cuda.close();
        const char *path = "/usr/bin/nvcc";
        //const char *path = "/usr/local/cuda-12.2/bin/nvcc";
        const char *argv[]{"nvcc", "-ptx", cudaPath, "-o", ptxPath, nullptr};
        // we can't free cudaPath or ptxPath in the child because exec still needs them;
        // no problem though, exec replaces this process image so they are released to the OS
        execvp(path, (char *const *) argv);
        // execvp only returns on failure
        std::cerr << "execvp of nvcc failed" << std::endl;
        _exit(1);
    } else if (pid < 0) {
        // fork failed
        std::cerr << "fork of nvcc failed" << std::endl;
        std::exit(1);
    } else {
        int status;
        // std::cerr << "fork succeeded, waiting for child" << std::endl;
        wait(&status);
        std::cerr << "child finished" << std::endl;
        std::ifstream ptxStream;
        ptxStream.open(ptxPath);
        if (ptxStream.is_open()) {
            ptxStream.seekg(0, std::ios::end);
            size_t ptxLen = ptxStream.tellg();
            ptxStream.close();
            ptxStream.open(ptxPath);
            free((void *) ptxPath);
            ptxPath = nullptr;
            if (ptxLen > 0) {
                std::cerr << "ptx len " << ptxLen << std::endl;
                ptx = new Ptx(ptxLen + 1);
                std::cerr << "about to read " << ptx->len << std::endl;
                ptxStream.read(ptx->text, ptxLen);
                ptxStream.close();
                std::cerr << "done reading " << ptx->len << std::endl;
                ptx->text[ptx->len - 1] = '\0';
                std::cerr << "read text " << ptx->text << std::endl;
            } else {
                std::cerr << "no ptx! ptxLen == 0?" << std::endl;
                exit(1);
            }
        } else {
            std::cerr << "no ptx!" << std::endl;
            exit(1);
        }
    }
    std::cout << "returning PTX" << std::endl;
    return ptx;
}
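
/*
 * A possible alternative to the fork/exec of nvcc above is NVRTC, which compiles
 * CUDA C++ to PTX in-process with no temp files. This is only a sketch, not part of
 * this backend; it assumes <nvrtc.h> is available, the build links -lnvrtc, and the
 * source string is NUL-terminated. The helper name nvrtcToPtx is hypothetical.
 *
 * static Ptx *nvrtcToPtx(const char *cudaSource) {
 *     nvrtcProgram prog;
 *     if (nvrtcCreateProgram(&prog, cudaSource, "tmp.cu", 0, nullptr, nullptr) != NVRTC_SUCCESS) {
 *         return nullptr;
 *     }
 *     nvrtcCompileProgram(prog, 0, nullptr); // options could pin an arch, e.g. "--gpu-architecture=compute_70"
 *     size_t ptxLen = 0;
 *     nvrtcGetPTXSize(prog, &ptxLen);        // size includes the trailing '\0'
 *     Ptx *ptx = new Ptx(ptxLen);
 *     nvrtcGetPTX(prog, ptx->text);
 *     nvrtcDestroyProgram(&prog);
 *     return ptx;
 * }
 */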

/*
//http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp
*/
CudaBackend::CudaProgram::CudaKernel::CudaBuffer::CudaBuffer(Backend::Program::Kernel *kernel, Arg_s *arg)
        : Buffer(kernel, arg), devicePtr() {
    /*
     *   (void *) arg->value.buffer.memorySegment,
     *   (size_t) arg->value.buffer.sizeInBytes);
     */
    // std::cout << "cuMemAlloc()" << std::endl;
    CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemAlloc() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    // std::cout << "devptr " << std::hex << (long) devicePtr << std::dec << std::endl;
    arg->value.buffer.vendorPtr = static_cast<void *>(this);
}

CudaBackend::CudaProgram::CudaKernel::CudaBuffer::~CudaBuffer() {
    // std::cout << "cuMemFree()"
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    CUresult status = cuMemFree(devicePtr);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemFree() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    arg->value.buffer.vendorPtr = nullptr;
}

void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyToDevice() {
    auto cudaKernel = dynamic_cast<CudaKernel *>(kernel);
    // std::cout << "copyToDevice() 0x" << std::hex << arg->value.buffer.sizeInBytes << std::dec << " " << arg->value.buffer.sizeInBytes << " "
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    char *ptr = (char *) arg->value.buffer.memorySegment;

    // the last 16 bytes of the segment hold two end-of-buffer marker words
    unsigned long ifacefacade1 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 16);
    unsigned long ifacefacade2 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 8);

    if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
        std::cerr << "end of buf marker before HtoD " << std::hex << ifacefacade1 << " " << ifacefacade2 << std::dec << " buffer corrupt!" << std::endl
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment, arg->value.buffer.sizeInBytes, cudaKernel->cudaStream);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    // cudaError_t and CUresult both use 0 for success, so the cast only serves the check below
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaKernel->cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
}

void CudaBackend::CudaProgram::CudaKernel::CudaBuffer::copyFromDevice() {
    auto cudaKernel = dynamic_cast<CudaKernel *>(kernel);
    // std::cout << "copyFromDevice() 0x" << std::hex << arg->value.buffer.sizeInBytes << std::dec << " " << arg->value.buffer.sizeInBytes << " "
    //           << "devptr " << std::hex << (long) devicePtr << std::dec
    //           << std::endl;
    char *ptr = (char *) arg->value.buffer.memorySegment;

    unsigned long ifacefacade1 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 16);
    unsigned long ifacefacade2 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 8);

    if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
        std::cerr << "end of buf marker before DtoH " << std::hex << ifacefacade1 << " " << ifacefacade2 << std::dec << " buffer corrupt!" << std::endl
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    CUresult status = cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr, arg->value.buffer.sizeInBytes, cudaKernel->cudaStream);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuMemcpyDtoHAsync() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    cudaError_t t1 = cudaStreamSynchronize(cudaKernel->cudaStream);
    if (cudaSuccess != t1) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << t1
                  << " " << cudaGetErrorString(t1)
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    ifacefacade1 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 16);
    ifacefacade2 = *reinterpret_cast<unsigned long *>(ptr + arg->value.buffer.sizeInBytes - 8);

    if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) {
        std::cerr << "end of buf marker after DtoH " << std::hex << ifacefacade1 << " " << ifacefacade2 << std::dec << " buffer corrupt!" << std::endl
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
}
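
/*
 * The three marker checks above are identical; they could be folded into one helper
 * along these lines. This is only a sketch, not part of the backend; the name
 * checkEndOfBufferMarkers is hypothetical.
 *
 * static void checkEndOfBufferMarkers(const char *where, char *ptr, size_t sizeInBytes) {
 *     unsigned long m1 = *reinterpret_cast<unsigned long *>(ptr + sizeInBytes - 16);
 *     unsigned long m2 = *reinterpret_cast<unsigned long *>(ptr + sizeInBytes - 8);
 *     if (m1 != 0x1face00000facadeL || m1 != m2) {
 *         std::cerr << "end of buf marker " << where << " " << std::hex << m1 << " " << m2
 *                   << std::dec << " buffer corrupt!" << std::endl;
 *         exit(-1);
 *     }
 * }
 *
 * // usage: checkEndOfBufferMarkers("before HtoD", ptr, arg->value.buffer.sizeInBytes);
 */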

CudaBackend::CudaProgram::CudaKernel::CudaKernel(Backend::Program *program, char *name, CUfunction function)
        : Backend::Program::Kernel(program, name), function(function), cudaStream() {
}

CudaBackend::CudaProgram::CudaKernel::~CudaKernel() = default;

long CudaBackend::CudaProgram::CudaKernel::ndrange(void *argArray) {
    // std::cout << "ndrange(" << range << ") " << name << std::endl;

    cudaStreamCreate(&cudaStream);
    ArgSled argSled(static_cast<ArgArray_s *>(argArray));
    // Schema::dumpSled(std::cout, argArray);
    void *argslist[argSled.argc()];
    NDRange *ndrange = nullptr;
#ifdef VERBOSE
    std::cerr << "there are " << argSled.argc() << " args" << std::endl;
#endif
    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        switch (arg->variant) {
            case '&': {
                if (arg->idx == 0) {
                    // by convention arg 0 carries the NDRange buffer
                    ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
                }
                auto cudaBuffer = new CudaBuffer(this, arg);
                cudaBuffer->copyToDevice();
                argslist[arg->idx] = static_cast<void *>(&cudaBuffer->devicePtr);
                break;
            }
            case 'I':
            case 'F':
            case 'J':
            case 'D':
            case 'C':
            case 'S': {
                argslist[arg->idx] = static_cast<void *>(&arg->value);
                break;
            }
            default: {
                std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
                break;
            }
        }
    }

    if (ndrange == nullptr) {
        std::cerr << "no NDRange argument (arg 0) passed to " << name << std::endl;
        exit(-1);
    }
    int range = ndrange->maxX;
    // round the grid up to a whole number of 1024-thread blocks
    int rangediv1024 = range / 1024;
    int rangemod1024 = range % 1024;
    if (rangemod1024 > 0) {
        rangediv1024++;
    }
    // std::cout << "Running the kernel..." << std::endl;
    // std::cout << "   Requested range   = " << range << std::endl;
    // std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
    auto status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    status = cuLaunchKernel(function,
                            rangediv1024, 1, 1,
                            1024, 1, 1,
                            0, cudaStream,
                            argslist, nullptr);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuLaunchKernel() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    // std::cout << "Kernel complete..." << cudaGetErrorString(t) << std::endl;

    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        if (arg->variant == '&') {
            static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice();
        }
    }
    status = static_cast<CUresult>(cudaStreamSynchronize(cudaStream));
    if (CUDA_SUCCESS != status) {
        std::cerr << "cudaStreamSynchronize() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }

    for (int i = 0; i < argSled.argc(); i++) {
        Arg_s *arg = argSled.arg(i);
        if (arg->variant == '&') {
            delete static_cast<CudaBuffer *>(arg->value.buffer.vendorPtr);
            arg->value.buffer.vendorPtr = nullptr;
        }
    }
    cudaStreamDestroy(cudaStream);
    return 0L;
}


CudaBackend::CudaProgram::CudaProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module)
        : Backend::Program(backend, buildInfo), ptx(ptx), module(module) {
}

CudaBackend::CudaProgram::~CudaProgram() = default;

long CudaBackend::CudaProgram::getKernel(int nameLen, char *name) {
    CUfunction function;
    CUresult status = cuModuleGetFunction(&function, module, name);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuModuleGetFunction() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
    long kernelHandle = reinterpret_cast<long>(new CudaKernel(this, name, function));
    return kernelHandle;
}

bool CudaBackend::CudaProgram::programOK() {
    return true;
}

CudaBackend::CudaBackend(CudaBackend::CudaConfig *cudaConfig, int configSchemaLen, char *configSchema)
        : Backend((Backend::Config *) cudaConfig, configSchemaLen, configSchema), device(), context() {
    // std::cout << "CudaBackend constructor " << ((cudaConfig == nullptr) ? "cudaConfig == null" : "got cudaConfig")
    //           << std::endl;
    int deviceCount = 0;
    CUresult err = cuInit(0);
    if (err == CUDA_SUCCESS) {
        cuDeviceGetCount(&deviceCount);
        std::cout << "CudaBackend device count " << deviceCount << std::endl;
        cuDeviceGet(&device, 0);
        std::cout << "CudaBackend device ok" << std::endl;
        cuCtxCreate(&context, 0, device);
        std::cout << "CudaBackend context created ok" << std::endl;
    } else {
        std::cout << "CudaBackend failed: we seem to have the runtime library but no device, no context, nada"
                  << std::endl;
        exit(1);
    }
}

CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) {
}

CudaBackend::~CudaBackend() {
    std::cout << "freeing context" << std::endl;
    CUresult status = cuCtxDestroy(context);
    if (CUDA_SUCCESS != status) {
        std::cerr << "cuCtxDestroy() CUDA error = " << status
                  << " " << cudaGetErrorString(static_cast<cudaError_t>(status))
                  << " " << __FILE__ << " line " << __LINE__ << std::endl;
        exit(-1);
    }
}

int CudaBackend::getMaxComputeUnits() {
    std::cout << "getMaxComputeUnits()" << std::endl;
    // placeholder; the real value could come from CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT (see info())
    int value = 1;
    return value;
}

void CudaBackend::info() {
    char name[100];
    cuDeviceGetName(name, sizeof(name), device);
    std::cout << "> Using device 0: " << name << std::endl;

    // get compute capabilities and the device name
    int major = 0, minor = 0;
    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
    std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;

    int warpSize;
    cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
    std::cout << "> GPU Device has warpSize " << warpSize << std::endl;

    int threadsPerBlock;
    cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
    std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;

    int cores;
    cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
    std::cout << "> GPU Cores " << cores << std::endl;

    size_t totalGlobalMem;
    cuDeviceTotalMem(&totalGlobalMem, device);
    std::cout << "  Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl;
    std::cout << "  64-bit Memory Address:         " <<
              ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
}
"YES" : "NO") << std::endl; 426 427 } 428 429 long CudaBackend::compileProgram(int len, char *source) { 430 Ptx *ptx = Ptx::nvcc(source, len); 431 CUmodule module; 432 std::cout << "inside compileProgram" << std::endl; 433 std::cout << "cuda " << source << std::endl; 434 if (ptx->text != nullptr) { 435 std::cout << "ptx " << ptx->text << std::endl; 436 437 // in this branch we use compilation with parameters 438 const unsigned int jitNumOptions = 2; 439 auto jitOptions = new CUjit_option[jitNumOptions]; 440 void **jitOptVals = new void *[jitNumOptions]; 441 442 // set up size of compilation log buffer 443 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 444 int jitLogBufferSize = 8192; 445 jitOptVals[0] = (void *) (size_t) jitLogBufferSize; 446 447 // set up pointer to the compilation log buffer 448 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 449 char *jitLogBuffer = new char[jitLogBufferSize]; 450 jitOptVals[1] = jitLogBuffer; 451 int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals); 452 453 printf("> PTX JIT log:\n%s\n", jitLogBuffer); 454 return reinterpret_cast<long>(new CudaProgram(this, nullptr, ptx, module)); 455 456 //delete ptx; 457 } else { 458 std::cout << "no ptx content!" << std::endl; 459 exit(1); 460 } 461 } 462 463 long getBackend(void *config, int configSchemaLen, char *configSchema) { 464 long backendHandle= reinterpret_cast<long>( 465 new CudaBackend(static_cast<CudaBackend::CudaConfig *>(config), configSchemaLen, 466 configSchema)); 467 std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 468 return backendHandle; 469 } 470 471 472