1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include "cuda_backend.h" 29 #include <iostream> 30 31 PtxSource::PtxSource() 32 : Text(0L) { 33 } 34 35 PtxSource::PtxSource(size_t len) 36 : Text(len) { 37 } 38 39 PtxSource::PtxSource(char *text) 40 : Text(text, false) { 41 } 42 43 PtxSource::PtxSource(size_t len, char *text) 44 : Text(len, text, true) { 45 } 46 PtxSource::PtxSource(size_t len, char *text, bool isCopy) 47 : Text(len, text, isCopy) { 48 } 49 50 CudaSource::CudaSource(size_t len) 51 : Text(len) { 52 } 53 54 CudaSource::CudaSource(char *text) 55 : Text(text, false) { 56 } 57 58 CudaSource::CudaSource(size_t len, char *text, bool isCopy) 59 : Text(len, text, isCopy) { 60 } 61 62 CudaSource::CudaSource() 63 : Text(0) { 64 } 65 66 uint64_t timeSinceEpochMillisec() { 67 using namespace std::chrono; 68 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 69 } 70 71 std::string tmpFileName(uint64_t time, const std::string &suffix) { 72 std::stringstream timestamp; 73 timestamp << "./tmp" << time << suffix; 74 return timestamp.str(); 75 } 76 77 CudaBackend::CudaBackend(int configBits) 78 : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() { 79 int deviceCount = 0; 80 81 if (initStatus == CUDA_SUCCESS) { 82 CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount"); 83 std::cout << "CudaBackend device count = " << deviceCount << std::endl; 84 CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet"); 85 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12080 86 CUctxCreateParams ctxCreateParams = {}; 87 CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate"); 88 #else 89 // Invoke previous implementation with 3 parameters 90 CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate"); 91 #endif 92 std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl; 93 dynamic_cast<CudaQueue *>(queue)->init(); 94 } else { 95 CUDA_CHECK(initStatus, "cuInit() failed we seem to have the runtime library but no device"); 96 } 97 } 98 99 CudaBackend::~CudaBackend() { 100 std::cout << "freeing context" << std::endl; 101 CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy"); 102 } 103 104 void CudaBackend::info() { 105 char name[100]; 106 CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName"); 107 108 std::cout << "> Using device 0: " << name << std::endl; 109 110 // get compute capabilities and the device name 111 int major = 0, minor = 0; 112 CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute"); 113 CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute"); 114 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 115 116 int warpSize; 117 CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute"); 118 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 119 120 int threadsPerBlock; 121 CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute"); 122 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 123 124 int cores; 125 CUDA_CHECK(cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute"); 126 std::cout << "> GPU Cores " << cores << std::endl; 127 128 size_t totalGlobalMem; 129 CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem"); 130 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 131 std::cout << " 64-bit Memory Address: " << 132 ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 133 } 134 135 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) { 136 const uint64_t time = timeSinceEpochMillisec(); 137 const std::string ptxPath = tmpFileName(time, ".ptx"); 138 const std::string cudaPath = tmpFileName(time, ".cu"); 139 int pid; 140 cudaSource->write(cudaPath); 141 if ((pid = fork()) == 0) { //child 142 const auto path = "/usr/local/cuda/bin/nvcc"; 143 const char *argv[] { 144 "/usr/local/cuda/bin/nvcc", 145 "-ptx", 146 "-Wno-deprecated-gpu-targets", 147 cudaPath.c_str(), 148 "-o", 149 ptxPath.c_str(), 150 nullptr 151 }; 152 const int stat = execvp(path, (char *const *) argv); 153 std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl; 154 std::exit(errno); 155 } else if (pid < 0) {// fork failed. 156 std::cerr << "fork of nvcc failed" << std::endl; 157 std::exit(1); 158 } else { //parent 159 int status; 160 pid_t result = wait(&status); 161 auto *ptx = new PtxSource(); 162 ptx->read(ptxPath); 163 return ptx; 164 } 165 } 166 167 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) { 168 return compile(&cudaSource); 169 } 170 171 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) { 172 const PtxSource *ptxSource = nvcc(cudaSource); 173 return compile(ptxSource); 174 } 175 176 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) { 177 return compile(&ptxSource); 178 } 179 180 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) { 181 CUmodule module; 182 if (ptx->text != nullptr) { 183 const Log *infLog = new Log(8192); 184 const Log *errLog = new Log(8192); 185 constexpr unsigned int optc = 5; 186 const auto jitOptions = new CUjit_option[optc]; 187 auto jitOptVals = new void *[optc]; 188 189 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 190 jitOptVals[0] = reinterpret_cast<void *>(infLog->len); 191 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 192 jitOptVals[1] = infLog->text; 193 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 194 jitOptVals[2] = reinterpret_cast<void *>(errLog->len); 195 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; 196 jitOptVals[3] = errLog->text; 197 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO; 198 jitOptVals[4] = reinterpret_cast<void *>(1); 199 200 CUDA_CHECK(cuCtxSetCurrent(context), "cuCtxSetCurrent"); 201 CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx"); 202 203 if (*infLog->text!='\0'){ 204 std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl; 205 } 206 if (*errLog->text!='\0'){ 207 std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl; 208 } 209 return new CudaModule(this, ptx->text, infLog->text, true, module); 210 211 //delete ptx; 212 } else { 213 std::cout << "no ptx content!" << std::endl; 214 exit(1); 215 } 216 } 217 218 //Entry point from HAT. We use the config PTX bit to determine which Source type 219 220 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) { 221 if (config->traceCalls) { 222 std::cout << "inside compileProgram" << std::endl; 223 } 224 225 if (config->ptx){ 226 if (config->trace) { 227 std::cout << "compiling from provided ptx " << std::endl; 228 } 229 PtxSource ptxSource(len, source, false); 230 return compile(ptxSource); 231 }else{ 232 if (config->trace) { 233 std::cout << "compiling from provided cuda " << std::endl; 234 } 235 CudaSource cudaSource(len , source, false); 236 return compile(cudaSource); 237 } 238 } 239 240 /* 241 242 if (config->ptx) { 243 244 } else { 245 if (config->trace) { 246 std::cout << "compiling from cuda c99 " << std::endl; 247 } 248 if (config->showCode) { 249 std::cout << "cuda " << source << std::endl; 250 } 251 auto* cuda = new CudaSource(len, source, false); 252 ptx = nvcc(cuda); 253 } 254 if (config->showCode) { 255 std::cout << "ptx " << ptx->text << std::endl; 256 } 257 CUmodule module; 258 259 260 if (ptx->text != nullptr) { 261 constexpr unsigned int jitNumOptions = 2; 262 const auto jitOptions = new CUjit_option[jitNumOptions]; 263 const auto jitOptVals = new void *[jitNumOptions]; 264 265 // set up size of compilation log buffer 266 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 267 constexpr int jitLogBufferSize = 8192; 268 jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize); 269 270 // set up pointer to the compilation log buffer 271 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 272 auto jitLogBuffer = new char[jitLogBufferSize]; 273 jitOptVals[1] = jitLogBuffer; 274 cuCtxSetCurrent(context); 275 276 WHERE{ 277 .f = __FILE__, .l = __LINE__, 278 .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals), 279 .t = "cuModuleLoadDataEx" 280 }.report(); 281 if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){ 282 std::cout << "PTX log:" << jitLogBuffer << std::endl; 283 } 284 return new CudaModule(this, ptx->text, jitLogBuffer, true, module); 285 } else { 286 std::cout << "no ptx content!" << std::endl; 287 exit(1); 288 } 289 } */ 290 291 extern "C" long getBackend(int mode) { 292 long backendHandle = reinterpret_cast<long>(new CudaBackend(mode)); 293 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 294 return backendHandle; 295 } 296 297 void clCallback(void *) { 298 std::cerr << "start of compute" << std::endl; 299 } 300 301 void CudaBackend::computeEnd() { 302 queue->computeEnd(); 303 } 304 305 void CudaBackend::computeStart() { 306 queue->computeStart(); 307 } 308 309 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) { 310 if (config->traceCalls) { 311 std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," << 312 std::dec << memorySegmentLength << "){" << std::endl; 313 } 314 if (config->minimizeCopies) { 315 const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength); 316 if (bufferState->state == BufferState::DEVICE_OWNED) { 317 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr)); 318 if (config->traceEnqueues | config->traceCopies) { 319 std::cout << "copying buffer from device (from java access) " << std::endl; 320 } 321 queue->wait(); 322 queue->release(); 323 } else { 324 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty" << std::endl; 325 std::exit(1); 326 } 327 } else { 328 std::cerr << 329 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!" 330 << std::endl; 331 std::exit(1); 332 } 333 if (config->traceCalls) { 334 std::cout << "}getBufferFromDeviceIfDirty()" << std::endl; 335 } 336 return true; 337 } 338 339 CudaBackend *CudaBackend::of(const long backendHandle) { 340 return reinterpret_cast<CudaBackend *>(backendHandle); 341 } 342 343 CudaBackend *CudaBackend::of(Backend *backend) { 344 return dynamic_cast<CudaBackend *>(backend); 345 } 346 347 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) { 348 CudaBuffer *cudaBuffer = nullptr; 349 if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) { 350 cudaBuffer = new CudaBuffer(this, bufferState); 351 if (config->trace) { 352 std::cout << "We allocated arg buffer " << std::endl; 353 } 354 } else { 355 if (config->trace) { 356 std::cout << "Were reusing buffer buffer " << std::endl; 357 } 358 cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr); 359 } 360 return cudaBuffer; 361 }