1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include <cuda_runtime_api.h> 29 #include "ptx_backend.h" 30 31 Ptx::Ptx(size_t len) 32 : len(len), text(len > 0 ? new char[len] : nullptr) { 33 std::cout << "in Ptx with buffer allocated "<<len << std::endl; 34 } 35 36 Ptx::~Ptx() { 37 if (len > 0 && text != nullptr) { 38 std::cout << "in ~Ptx with deleting allocated "<<len << std::endl; 39 delete[] text; 40 } 41 } 42 43 uint64_t timeSinceEpochMillisec() { 44 using namespace std::chrono; 45 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 46 } 47 48 Ptx *Ptx::nvcc(const char *ptxSource, size_t len) { 49 Ptx * ptx = new Ptx(len); 50 strcpy(ptx->text,ptxSource); 51 return ptx; 52 53 // Ptx *ptx = nullptr; 54 // uint64_t time = timeSinceEpochMillisec(); 55 // std::stringstream timestampPtx; 56 // timestampPtx << "./tmp" << time << ".ptx"; 57 // const char *ptxPath = strdup(timestampPtx.str().c_str()); 58 // // std::cout << "ptx " << ptxPath << std::endl; 59 // // we are going to fork exec nvcc 60 // int pid; 61 // if ((pid = fork()) == 0) { 62 // std::ofstream ptx; 63 // std::stringstream timestampPtx; 64 // timestampPtx << "./tmp" << time << ".cu"; 65 // const char *ptxPath = strdup(timestampPtx.str().c_str()); 66 // std::cout << "ptx " << ptxPath << std::endl; 67 // ptx.open(ptxPath, std::ofstream::trunc); 68 // ptx.write(ptxSource, len); 69 // ptx.close(); 70 // const char *path = "/usr/bin/nvcc"; 71 // //const char *path = "/usr/local/cuda-12.2/bin/nvcc"; 72 // const char *argv[]{"nvcc", "-ptx", ptxPath, "-o", ptxPath, nullptr}; 73 // // we can't free ptxPath or ptxpath in child because we need them in exec, no prob through 74 // // because we get a new proc so they are released to os 75 // execvp(path, (char *const *) argv); 76 77 // } else if (pid < 0) { 78 // // fork failed. 79 // std::cerr << "fork of nvcc failed" << std::endl; 80 // std::exit(1); 81 // } else { 82 // int status; 83 // // std::cerr << "fork suceeded waiting for child" << std::endl; 84 // pid_t result = wait(&status); 85 // std::cerr << "child finished" << std::endl; 86 // std::ifstream ptxStream; 87 // ptxStream.open(ptxPath); 88 // // if (ptxStream.is_open()) { 89 // ptxStream.seekg(0, std::ios::end); 90 // size_t ptxLen = ptxStream.tellg(); 91 // ptxStream.close(); 92 // ptxStream.open(ptxPath); 93 // free((void *) ptxPath); 94 // ptxPath = nullptr; 95 // if (ptxLen > 0) { 96 // std::cerr << "ptx len "<< ptxLen << std::endl; 97 // ptx = new Ptx(ptxLen + 1); 98 // std::cerr << "about to read "<< ptx->len << std::endl; 99 // ptxStream.read(ptx->text, ptx->len); 100 // ptxStream.close(); 101 // std::cerr << "about to read "<< ptx->len << std::endl; 102 // ptx->text[ptx->len - 1] = '\0'; 103 // std::cerr << "read text "<< ptx->text << std::endl; 104 105 // } else { 106 // std::cerr << "no ptx! ptxLen == 0?"; 107 // exit(1); 108 // } 109 // // }else{ 110 // // std::cerr << "no ptx!"; 111 // // exit(1); 112 // // } 113 // } 114 // std::cout << "returning PTX" << std::endl; 115 // return ptx; 116 } 117 118 /* 119 //http://mercury.pr.erau.edu/~siewerts/extra/code/digital-media/CUDA/cuda_work/samples/0_Simple/matrixMulDrv/matrixMulDrv.cpp 120 */ 121 PtxBackend::PtxProgram::PtxKernel::PtxBuffer::PtxBuffer(Backend::Program::Kernel *kernel, Arg_s *arg) 122 : Buffer(kernel, arg), devicePtr() { 123 /* 124 * (void *) arg->value.buffer.memorySegment, 125 * (size_t) arg->value.buffer.sizeInBytes); 126 */ 127 // std::cout << "cuMemAlloc()" << std::endl; 128 CUresult status = cuMemAlloc(&devicePtr, (size_t) arg->value.buffer.sizeInBytes); 129 if (CUDA_SUCCESS != status) { 130 std::cerr << "cuMemFree() CUDA error = " << status 131 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 132 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 133 exit(-1); 134 } 135 // std::cout << "devptr " << std::hex<< (long)devicePtr <<std::dec <<std::endl; 136 arg->value.buffer.vendorPtr = static_cast<void *>(this); 137 } 138 139 PtxBackend::PtxProgram::PtxKernel::PtxBuffer::~PtxBuffer() { 140 141 // std::cout << "cuMemFree()" 142 // << "devptr " << std::hex<< (long)devicePtr <<std::dec 143 // << std::endl; 144 CUresult status = cuMemFree(devicePtr); 145 if (CUDA_SUCCESS != status) { 146 std::cerr << "cuMemFree() CUDA error = " << status 147 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 148 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 149 exit(-1); 150 } 151 arg->value.buffer.vendorPtr = nullptr; 152 } 153 154 void PtxBackend::PtxProgram::PtxKernel::PtxBuffer::copyToDevice() { 155 auto ptxKernel = dynamic_cast<PtxKernel*>(kernel); 156 // std::cout << "copyToDevice() 0x" << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " " 157 // << "devptr " << std::hex<< (long)devicePtr <<std::dec 158 // << std::endl; 159 char *ptr = (char*)arg->value.buffer.memorySegment; 160 161 unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16); 162 unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8); 163 164 if (ifacefacade1 != 0x1face00000facadeL && ifacefacade1 != ifacefacade2) { 165 std::cerr<<"End of buf marker before HtoD"<< std::hex << ifacefacade1 << ifacefacade2<< " buffer corrupt !" <<std::endl 166 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 167 exit(-1); 168 } 169 170 171 CUresult status = cuMemcpyHtoDAsync(devicePtr, arg->value.buffer.memorySegment, arg->value.buffer.sizeInBytes,ptxKernel->cudaStream); 172 if (CUDA_SUCCESS != status) { 173 std::cerr << "cuMemcpyHtoDAsync() CUDA error = " << status 174 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 175 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 176 exit(-1); 177 } 178 status = static_cast<CUresult >(cudaStreamSynchronize(ptxKernel->cudaStream)); 179 if (CUDA_SUCCESS != status) { 180 std::cerr << "cudaStreamSynchronize() CUDA error = " << status 181 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 182 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 183 exit(-1); 184 } 185 } 186 187 void PtxBackend::PtxProgram::PtxKernel::PtxBuffer::copyFromDevice() { 188 auto ptxKernel = dynamic_cast<PtxKernel*>(kernel); 189 // std::cout << "copyFromDevice() 0x" << std::hex<<arg->value.buffer.sizeInBytes<<std::dec << " "<< arg->value.buffer.sizeInBytes << " " 190 // << "devptr " << std::hex<< (long)devicePtr <<std::dec 191 // << std::endl; 192 char *ptr = (char*)arg->value.buffer.memorySegment; 193 194 unsigned long ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16); 195 unsigned long ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8); 196 197 if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) { 198 std::cerr<<"end of buf marker before DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl 199 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 200 exit(-1); 201 } 202 CUresult status =cuMemcpyDtoHAsync(arg->value.buffer.memorySegment, devicePtr, arg->value.buffer.sizeInBytes,ptxKernel->cudaStream); 203 if (CUDA_SUCCESS != status) { 204 std::cerr << "cudaStreamSynchronize() CUDA error = " << status 205 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 206 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 207 exit(-1); 208 } 209 cudaError_t t1 = cudaStreamSynchronize(ptxKernel->cudaStream); 210 if (static_cast<cudaError_t>(CUDA_SUCCESS) != t1) { 211 std::cerr << "CUDA error = " << t1 212 <<" " << cudaGetErrorString(static_cast<cudaError_t>(t1)) 213 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 214 exit(-1); 215 } 216 ifacefacade1 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-16); 217 ifacefacade2 = *reinterpret_cast<unsigned long*>(ptr+arg->value.buffer.sizeInBytes-8); 218 219 if (ifacefacade1 != 0x1face00000facadeL || ifacefacade1 != ifacefacade2) { 220 std::cerr<<"end of buf marker after DtoH"<< std::hex << ifacefacade1 << ifacefacade2<< std::dec<< " buffer corrupt !"<<std::endl 221 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 222 exit(-1); 223 } 224 } 225 226 PtxBackend::PtxProgram::PtxKernel::PtxKernel(Backend::Program *program,char * name, CUfunction function) 227 : Backend::Program::Kernel(program, name), function(function),cudaStream() { 228 } 229 230 PtxBackend::PtxProgram::PtxKernel::~PtxKernel() = default; 231 232 long PtxBackend::PtxProgram::PtxKernel::ndrange(void *argArray) { 233 // std::cout << "ndrange(" << range << ") " << name << std::endl; 234 235 cudaStreamCreate(&cudaStream); 236 ArgSled argSled(static_cast<ArgArray_s *>(argArray)); 237 // Schema::dumpSled(std::cout, argArray); 238 void *argslist[argSled.argc()]; 239 NDRange *ndrange = nullptr; 240 #ifdef VERBOSE 241 std::cerr << "there are " << argSled.argc() << "args " << std::endl; 242 #endif 243 for (int i = 0; i < argSled.argc(); i++) { 244 Arg_s *arg = argSled.arg(i); 245 switch (arg->variant) { 246 case '&': { 247 if (arg->idx == 0){ 248 ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment); 249 } 250 auto ptxBuffer = new PtxBuffer(this, arg); 251 ptxBuffer->copyToDevice(); 252 argslist[arg->idx] = static_cast<void *>(&ptxBuffer->devicePtr); 253 break; 254 } 255 case 'I': 256 case 'F': 257 case 'J': 258 case 'D': 259 case 'C': 260 case 'S': { 261 argslist[arg->idx] = static_cast<void *>(&arg->value); 262 break; 263 } 264 default: { 265 std::cerr << " unhandled variant " << (char) arg->variant << std::endl; 266 break; 267 } 268 } 269 } 270 271 int range = ndrange->maxX; 272 int rangediv1024 = range / 1024; 273 int rangemod1024 = range % 1024; 274 if (rangemod1024 > 0) { 275 rangediv1024++; 276 } 277 // std::cout << "Running the kernel..." << std::endl; 278 // std::cout << " Requested range = " << range << std::endl; 279 // std::cout << " Range mod 1024 = " << rangemod1024 << std::endl; 280 // std::cout << " Actual range 1024 = " << (rangediv1024 * 1024) << std::endl; 281 auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream)); 282 if (CUDA_SUCCESS != status) { 283 std::cerr << "cudaStreamSynchronize() CUDA error = " << status 284 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 285 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 286 exit(-1); 287 } 288 289 status= cuLaunchKernel(function, 290 rangediv1024, 1, 1, 291 1024, 1, 1, 292 0, cudaStream, 293 argslist, 0); 294 if (CUDA_SUCCESS != status) { 295 std::cerr << "cuLaunchKernel() CUDA error = " << status 296 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 297 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 298 exit(-1); 299 } 300 status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream)); 301 if (CUDA_SUCCESS != status) { 302 std::cerr << "cudaStreamSynchronize() CUDA error = " << status 303 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 304 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 305 exit(-1); 306 } 307 308 //std::cout << "Kernel complete..."<<cudaGetErrorString(t)<<std::endl; 309 310 for (int i = 0; i < argSled.argc(); i++) { 311 Arg_s *arg = argSled.arg(i); 312 if (arg->variant == '&') { 313 static_cast<PtxBuffer *>(arg->value.buffer.vendorPtr)->copyFromDevice(); 314 315 } 316 } 317 status= static_cast<CUresult>(cudaStreamSynchronize(cudaStream)); 318 if (CUDA_SUCCESS != status) { 319 std::cerr << "cudaStreamSynchronize() CUDA error = " << status 320 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 321 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 322 exit(-1); 323 } 324 325 for (int i = 0; i < argSled.argc(); i++) { 326 Arg_s *arg = argSled.arg(i); 327 if (arg->variant == '&') { 328 delete static_cast<PtxBuffer *>(arg->value.buffer.vendorPtr); 329 arg->value.buffer.vendorPtr = nullptr; 330 } 331 } 332 cudaStreamDestroy(cudaStream); 333 return (long) 0; 334 } 335 336 337 PtxBackend::PtxProgram::PtxProgram(Backend *backend, BuildInfo *buildInfo, Ptx *ptx, CUmodule module) 338 : Backend::Program(backend, buildInfo), ptx(ptx), module(module) { 339 } 340 341 PtxBackend::PtxProgram::~PtxProgram() = default; 342 343 long PtxBackend::PtxProgram::getKernel(int nameLen, char *name) { 344 CUfunction function; 345 CUresult status= cuModuleGetFunction(&function, module, name); 346 if (CUDA_SUCCESS != status) { 347 std::cerr << "cuModuleGetFunction() CUDA error = " << status 348 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 349 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 350 exit(-1); 351 } 352 long kernelHandle = reinterpret_cast<long>(new PtxKernel(this, name, function)); 353 return kernelHandle; 354 } 355 356 bool PtxBackend::PtxProgram::programOK() { 357 return true; 358 } 359 360 PtxBackend::PtxBackend(PtxBackend::PtxConfig *ptxConfig, int 361 configSchemaLen, char *configSchema) 362 : Backend((Backend::Config*) ptxConfig, configSchemaLen, configSchema), device(),context() { 363 // std::cout << "PtxBackend constructor " << ((ptxConfig == nullptr) ? "ptxConfig== null" : "got ptxConfig") 364 // << std::endl; 365 int deviceCount = 0; 366 CUresult err = cuInit(0); 367 if (err == CUDA_SUCCESS) { 368 cuDeviceGetCount(&deviceCount); 369 std::cout << "PtxBackend device count" << std::endl; 370 cuDeviceGet(&device, 0); 371 std::cout << "PtxBackend device ok" << std::endl; 372 cuCtxCreate(&context, 0, device); 373 std::cout << "PtxBackend context created ok" << std::endl; 374 } else { 375 std::cout << "PtxBackend failed, we seem to have the runtime library but no device, no context, nada " 376 << std::endl; 377 exit(1); 378 } 379 } 380 381 PtxBackend::PtxBackend() : PtxBackend(nullptr, 0, nullptr) { 382 383 } 384 385 PtxBackend::~PtxBackend() { 386 std::cout << "freeing context" << std::endl; 387 CUresult status = cuCtxDestroy(context); 388 if (CUDA_SUCCESS != status) { 389 std::cerr << "cuCtxDestroy(() CUDA error = " << status 390 <<" " << cudaGetErrorString(static_cast<cudaError_t>(status)) 391 <<" " << __FILE__ << " line " << __LINE__ << std::endl; 392 exit(-1); 393 } 394 } 395 396 int PtxBackend::getMaxComputeUnits() { 397 std::cout << "getMaxComputeUnits()" << std::endl; 398 int value = 1; 399 return value; 400 } 401 402 void PtxBackend::info() { 403 char name[100]; 404 cuDeviceGetName(name, sizeof(name), device); 405 std::cout << "> Using device 0: " << name << std::endl; 406 407 // get compute capabilities and the devicename 408 int major = 0, minor = 0; 409 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 410 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 411 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 412 413 int warpSize; 414 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); 415 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 416 417 int threadsPerBlock; 418 cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); 419 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 420 421 int cores; 422 cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); 423 std::cout << "> GPU Cores " << cores << std::endl; 424 425 size_t totalGlobalMem; 426 cuDeviceTotalMem(&totalGlobalMem, device); 427 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 428 std::cout << " 64-bit Memory Address: " << 429 ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 430 431 } 432 433 long PtxBackend::compileProgram(int len, char *source) { 434 Ptx *ptx = Ptx::nvcc(source, len); 435 CUmodule module; 436 std::cout << "inside compileProgram" << std::endl; 437 std::cout << "ptx " << source << std::endl; 438 if (ptx->text != nullptr) { 439 std::cout << "ptx " << ptx->text << std::endl; 440 441 // in this branch we use compilation with parameters 442 const unsigned int jitNumOptions = 2; 443 auto jitOptions = new CUjit_option[jitNumOptions]; 444 void **jitOptVals = new void *[jitNumOptions]; 445 446 // set up size of compilation log buffer 447 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 448 int jitLogBufferSize = 8192; 449 jitOptVals[0] = (void *) (size_t) jitLogBufferSize; 450 451 // set up pointer to the compilation log buffer 452 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 453 char *jitLogBuffer = new char[jitLogBufferSize]; 454 jitOptVals[1] = jitLogBuffer; 455 int status = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals); 456 457 printf("> PTX JIT log:\n%s\n", jitLogBuffer); 458 printf("status: %d\n", status); 459 return reinterpret_cast<long>(new PtxProgram(this, nullptr, ptx, module)); 460 461 //delete ptx; 462 } else { 463 std::cout << "no ptx content!" << std::endl; 464 exit(1); 465 } 466 } 467 468 long getBackend(void *config, int configSchemaLen, char *configSchema) { 469 long backendHandle= reinterpret_cast<long>( 470 new PtxBackend(static_cast<PtxBackend::PtxConfig *>(config), configSchemaLen, 471 configSchema)); 472 std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 473 return backendHandle; 474 } 475 476 477