1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include "cuda_backend.h" 29 30 31 PtxSource::PtxSource() 32 : Text(0L) { 33 } 34 35 PtxSource::PtxSource(size_t len) 36 : Text(len) { 37 } 38 39 PtxSource::PtxSource(char *text) 40 : Text(text, false) { 41 } 42 43 PtxSource::PtxSource(size_t len, char *text) 44 : Text(len, text, true) { 45 } 46 PtxSource::PtxSource(size_t len, char *text, bool isCopy) 47 : Text(len, text, isCopy) { 48 } 49 50 CudaSource::CudaSource(size_t len) 51 : Text(len) { 52 } 53 54 CudaSource::CudaSource(char *text) 55 : Text(text, false) { 56 } 57 58 CudaSource::CudaSource(size_t len, char *text, bool isCopy) 59 : Text(len, text, isCopy) { 60 } 61 62 CudaSource::CudaSource() 63 : Text(0) { 64 } 65 66 uint64_t timeSinceEpochMillisec() { 67 using namespace std::chrono; 68 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 69 } 70 71 std::string tmpFileName(uint64_t time, const std::string &suffix) { 72 std::stringstream timestamp; 73 timestamp << "./tmp" << time << suffix; 74 return timestamp.str(); 75 } 76 77 78 79 CudaBackend::CudaBackend(int configBits) 80 : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() { 81 int deviceCount = 0; 82 83 if (initStatus == CUDA_SUCCESS) { 84 WHERE{ 85 .f = __FILE__, .l = __LINE__, 86 .e = cuDeviceGetCount(&deviceCount), 87 .t = "cuDeviceGetCount" 88 }.report(); 89 std::cout << "CudaBackend device count = " << deviceCount << std::endl; 90 WHERE{ 91 .f = __FILE__, .l = __LINE__, 92 .e = cuDeviceGet(&device, 0), 93 .t = "cuDeviceGet" 94 }.report(); 95 WHERE{ 96 .f = __FILE__, .l = __LINE__, 97 .e = cuCtxCreate(&context, 0, device), 98 .t = "cuCtxCreate" 99 }.report(); 100 std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl; 101 dynamic_cast<CudaQueue *>(queue)->init(); 102 } else { 103 WHERE{ 104 .f = __FILE__, .l = __LINE__, 105 .e = initStatus, 106 "cuInit() failed we seem to have the runtime library but no device" 107 }.report(); 108 } 109 } 110 111 112 CudaBackend::~CudaBackend() { 113 std::cout << "freeing context" << std::endl; 114 WHERE{ 115 .f = __FILE__, .l = __LINE__, 116 .e = cuCtxDestroy(context), 117 .t = "cuCtxDestroy" 118 }.report(); 119 } 120 121 void CudaBackend::info() { 122 char name[100]; 123 cuDeviceGetName(name, sizeof(name), device); 124 std::cout << "> Using device 0: " << name << std::endl; 125 126 // get compute capabilities and the devicename 127 int major = 0, minor = 0; 128 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 129 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 130 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 131 132 int warpSize; 133 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); 134 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 135 136 int threadsPerBlock; 137 cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); 138 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 139 140 int cores; 141 cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); 142 std::cout << "> GPU Cores " << cores << std::endl; 143 144 size_t totalGlobalMem; 145 cuDeviceTotalMem(&totalGlobalMem, device); 146 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 147 std::cout << " 64-bit Memory Address: " << 148 ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 149 } 150 151 152 153 154 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) { 155 //std::cout << "inside nvcc" << std::endl; 156 const uint64_t time = timeSinceEpochMillisec(); 157 const std::string ptxPath = tmpFileName(time, ".ptx"); 158 const std::string cudaPath = tmpFileName(time, ".cu"); 159 int pid; 160 cudaSource->write(cudaPath); 161 if ((pid = fork()) == 0) { //child 162 const auto path = "/usr/local/cuda/bin/nvcc"; 163 const char *argv[]{ "/usr/local/cuda/bin/nvcc", "-ptx", "-Wno-deprecated-gpu-targets", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr}; 164 const int stat = execvp(path, (char *const *) argv); 165 std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl; 166 std::exit(errno); 167 } else if (pid < 0) {// fork failed. 168 std::cerr << "fork of nvcc failed" << std::endl; 169 std::exit(1); 170 } else { //parent 171 int status; 172 pid_t result = wait(&status); 173 auto *ptx = new PtxSource(); 174 ptx->read(ptxPath); 175 return ptx; 176 } 177 } 178 179 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) { 180 return compile(&cudaSource); 181 } 182 183 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) { 184 const PtxSource *ptxSource = nvcc(cudaSource); 185 return compile(ptxSource); 186 } 187 188 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) { 189 return compile(&ptxSource); 190 } 191 192 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) { 193 194 CUmodule module; 195 // std::cout << "inside compile" << std::endl; 196 // std::cout << "cuda " << cudaSource->text << std::endl; 197 if (ptx->text != nullptr) { 198 // std::cout << "ptx " << ptx->text << std::endl; 199 const Log *infLog = new Log(8192); 200 const Log *errLog = new Log(8192); 201 constexpr unsigned int optc = 5; 202 const auto jitOptions = new CUjit_option[optc]; 203 auto jitOptVals = new void *[optc]; 204 205 206 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 207 jitOptVals[0] = reinterpret_cast<void *>(infLog->len); 208 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 209 jitOptVals[1] = infLog->text; 210 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 211 jitOptVals[2] = reinterpret_cast<void *>(errLog->len); 212 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; 213 jitOptVals[3] = errLog->text; 214 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO; 215 jitOptVals[4] = reinterpret_cast<void *>(1); 216 217 WHERE{ 218 .f = __FILE__, .l = __LINE__, 219 .e = cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), 220 .t = "cuModuleLoadDataEx" 221 }.report(); 222 if (*infLog->text!='\0'){ 223 std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl; 224 } 225 if (*errLog->text!='\0'){ 226 std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl; 227 } 228 return new CudaModule(this, ptx->text, infLog->text, true, module); 229 230 //delete ptx; 231 } else { 232 std::cout << "no ptx content!" << std::endl; 233 exit(1); 234 } 235 } 236 237 //Entry point from HAT. We use the config PTX bit to determine which Source type 238 239 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) { 240 if (config->traceCalls) { 241 std::cout << "inside compileProgram" << std::endl; 242 } 243 244 if (config->ptx){ 245 if (config->trace) { 246 std::cout << "compiling from provided ptx " << std::endl; 247 } 248 PtxSource ptxSource(len, source, false); 249 return compile(ptxSource); 250 }else{ 251 if (config->trace) { 252 std::cout << "compiling from provided cuda " << std::endl; 253 } 254 CudaSource cudaSource(len , source, false); 255 return compile(cudaSource); 256 } 257 } 258 259 /* 260 261 if (config->ptx) { 262 263 } else { 264 if (config->trace) { 265 std::cout << "compiling from cuda c99 " << std::endl; 266 } 267 if (config->showCode) { 268 std::cout << "cuda " << source << std::endl; 269 } 270 auto* cuda = new CudaSource(len, source, false); 271 ptx = nvcc(cuda); 272 } 273 if (config->showCode) { 274 std::cout << "ptx " << ptx->text << std::endl; 275 } 276 CUmodule module; 277 278 279 if (ptx->text != nullptr) { 280 constexpr unsigned int jitNumOptions = 2; 281 const auto jitOptions = new CUjit_option[jitNumOptions]; 282 const auto jitOptVals = new void *[jitNumOptions]; 283 284 // set up size of compilation log buffer 285 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 286 constexpr int jitLogBufferSize = 8192; 287 jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize); 288 289 // set up pointer to the compilation log buffer 290 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 291 auto jitLogBuffer = new char[jitLogBufferSize]; 292 jitOptVals[1] = jitLogBuffer; 293 cuCtxSetCurrent(context); 294 295 WHERE{ 296 .f = __FILE__, .l = __LINE__, 297 .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals), 298 .t = "cuModuleLoadDataEx" 299 }.report(); 300 if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){ 301 std::cout << "PTX log:" << jitLogBuffer << std::endl; 302 } 303 return new CudaModule(this, ptx->text, jitLogBuffer, true, module); 304 } else { 305 std::cout << "no ptx content!" << std::endl; 306 exit(1); 307 } 308 } */ 309 310 extern "C" long getBackend(int mode) { 311 long backendHandle = reinterpret_cast<long>(new CudaBackend(mode)); 312 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 313 return backendHandle; 314 } 315 316 void clCallback(void *) { 317 std::cerr << "start of compute" << std::endl; 318 } 319 320 321 void CudaBackend::computeEnd() { 322 queue->computeEnd(); 323 } 324 325 void CudaBackend::computeStart() { 326 queue->computeStart(); 327 } 328 329 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) { 330 if (config->traceCalls) { 331 std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," << 332 std::dec << memorySegmentLength << "){" << std::endl; 333 } 334 if (config->minimizeCopies) { 335 const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength); 336 if (bufferState->state == BufferState::DEVICE_OWNED) { 337 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr)); 338 if (config->traceEnqueues | config->traceCopies) { 339 std::cout << "copying buffer from device (from java access) " << std::endl; 340 } 341 queue->wait(); 342 queue->release(); 343 } else { 344 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty" << std::endl; 345 std::exit(1); 346 } 347 } else { 348 std::cerr << 349 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!" 350 << std::endl; 351 std::exit(1); 352 } 353 if (config->traceCalls) { 354 std::cout << "}getBufferFromDeviceIfDirty()" << std::endl; 355 } 356 return true; 357 } 358 359 CudaBackend *CudaBackend::of(const long backendHandle) { 360 return reinterpret_cast<CudaBackend *>(backendHandle); 361 } 362 363 CudaBackend *CudaBackend::of(Backend *backend) { 364 return dynamic_cast<CudaBackend *>(backend); 365 } 366 367 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) { 368 CudaBuffer *cudaBuffer = nullptr; 369 if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) { 370 cudaBuffer = new CudaBuffer(this, bufferState); 371 if (config->trace) { 372 std::cout << "We allocated arg buffer " << std::endl; 373 } 374 } else { 375 if (config->trace) { 376 std::cout << "Were reusing buffer buffer " << std::endl; 377 } 378 cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr); 379 } 380 return cudaBuffer; 381 }