1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include "cuda_backend.h" 29 30 #include <iostream> 31 32 33 PtxSource::PtxSource() 34 : Text(0L) { 35 } 36 37 PtxSource::PtxSource(size_t len) 38 : Text(len) { 39 } 40 41 PtxSource::PtxSource(char *text) 42 : Text(text, false) { 43 } 44 45 PtxSource::PtxSource(size_t len, char *text) 46 : Text(len, text, true) { 47 } 48 PtxSource::PtxSource(size_t len, char *text, bool isCopy) 49 : Text(len, text, isCopy) { 50 } 51 52 CudaSource::CudaSource(size_t len) 53 : Text(len) { 54 } 55 56 CudaSource::CudaSource(char *text) 57 : Text(text, false) { 58 } 59 60 CudaSource::CudaSource(size_t len, char *text, bool isCopy) 61 : Text(len, text, isCopy) { 62 } 63 64 CudaSource::CudaSource() 65 : Text(0) { 66 } 67 68 uint64_t timeSinceEpochMillisec() { 69 using namespace std::chrono; 70 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 71 } 72 73 std::string tmpFileName(uint64_t time, const std::string &suffix) { 74 std::stringstream timestamp; 75 timestamp << "./tmp" << time << suffix; 76 return timestamp.str(); 77 } 78 79 80 81 CudaBackend::CudaBackend(int configBits) 82 : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() { 83 int deviceCount = 0; 84 85 if (initStatus == CUDA_SUCCESS) { 86 WHERE{ 87 .f = __FILE__, .l = __LINE__, 88 .e = cuDeviceGetCount(&deviceCount), 89 .t = "cuDeviceGetCount" 90 }.report(); 91 std::cout << "CudaBackend device count = " << deviceCount << std::endl; 92 WHERE{ 93 .f = __FILE__, .l = __LINE__, 94 .e = cuDeviceGet(&device, 0), 95 .t = "cuDeviceGet" 96 }.report(); 97 WHERE{ 98 .f = __FILE__, .l = __LINE__, 99 .e = cuCtxCreate(&context, 0, device), 100 .t = "cuCtxCreate" 101 }.report(); 102 std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl; 103 dynamic_cast<CudaQueue *>(queue)->init(); 104 } else { 105 WHERE{ 106 .f = __FILE__, .l = __LINE__, 107 .e = initStatus, 108 "cuInit() failed we seem to have the runtime library but no device" 109 }.report(); 110 } 111 } 112 113 114 CudaBackend::~CudaBackend() { 115 std::cout << "freeing context" << std::endl; 116 WHERE{ 117 .f = __FILE__, .l = __LINE__, 118 .e = cuCtxDestroy(context), 119 .t = "cuCtxDestroy" 120 }.report(); 121 } 122 123 void CudaBackend::info() { 124 char name[100]; 125 cuDeviceGetName(name, sizeof(name), device); 126 std::cout << "> Using device 0: " << name << std::endl; 127 128 // get compute capabilities and the devicename 129 int major = 0, minor = 0; 130 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 131 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 132 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 133 134 int warpSize; 135 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); 136 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 137 138 int threadsPerBlock; 139 cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); 140 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 141 142 int cores; 143 cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); 144 std::cout << "> GPU Cores " << cores << std::endl; 145 146 size_t totalGlobalMem; 147 cuDeviceTotalMem(&totalGlobalMem, device); 148 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 149 std::cout << " 64-bit Memory Address: " << 150 ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 151 } 152 153 154 155 156 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) { 157 //std::cout << "inside nvcc" << std::endl; 158 const uint64_t time = timeSinceEpochMillisec(); 159 const std::string ptxPath = tmpFileName(time, ".ptx"); 160 const std::string cudaPath = tmpFileName(time, ".cu"); 161 int pid; 162 cudaSource->write(cudaPath); 163 if ((pid = fork()) == 0) { //child 164 const auto path = "/usr/local/cuda/bin/nvcc"; 165 const char *argv[]{ "/usr/local/cuda/bin/nvcc", "-ptx", "-Wno-deprecated-gpu-targets", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr}; 166 const int stat = execvp(path, (char *const *) argv); 167 std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl; 168 std::exit(errno); 169 } else if (pid < 0) {// fork failed. 170 std::cerr << "fork of nvcc failed" << std::endl; 171 std::exit(1); 172 } else { //parent 173 int status; 174 pid_t result = wait(&status); 175 auto *ptx = new PtxSource(); 176 ptx->read(ptxPath); 177 return ptx; 178 } 179 } 180 181 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) { 182 return compile(&cudaSource); 183 } 184 185 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) { 186 const PtxSource *ptxSource = nvcc(cudaSource); 187 return compile(ptxSource); 188 } 189 190 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) { 191 return compile(&ptxSource); 192 } 193 194 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) { 195 196 CUmodule module; 197 // std::cout << "inside compile" << std::endl; 198 // std::cout << "cuda " << cudaSource->text << std::endl; 199 if (ptx->text != nullptr) { 200 // std::cout << "ptx " << ptx->text << std::endl; 201 const Log *infLog = new Log(8192); 202 const Log *errLog = new Log(8192); 203 constexpr unsigned int optc = 5; 204 const auto jitOptions = new CUjit_option[optc]; 205 auto jitOptVals = new void *[optc]; 206 207 208 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 209 jitOptVals[0] = reinterpret_cast<void *>(infLog->len); 210 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 211 jitOptVals[1] = infLog->text; 212 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 213 jitOptVals[2] = reinterpret_cast<void *>(errLog->len); 214 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; 215 jitOptVals[3] = errLog->text; 216 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO; 217 jitOptVals[4] = reinterpret_cast<void *>(1); 218 219 WHERE{ 220 .f = __FILE__, .l = __LINE__, 221 .e = cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), 222 .t = "cuModuleLoadDataEx" 223 }.report(); 224 if (*infLog->text!='\0'){ 225 std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl; 226 } 227 if (*errLog->text!='\0'){ 228 std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl; 229 } 230 return new CudaModule(this, ptx->text, infLog->text, true, module); 231 232 //delete ptx; 233 } else { 234 std::cout << "no ptx content!" << std::endl; 235 exit(1); 236 } 237 } 238 239 //Entry point from HAT. We use the config PTX bit to determine which Source type 240 241 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) { 242 if (config->traceCalls) { 243 std::cout << "inside compileProgram" << std::endl; 244 } 245 246 if (config->ptx){ 247 if (config->trace) { 248 std::cout << "compiling from provided ptx " << std::endl; 249 } 250 PtxSource ptxSource(len, source, false); 251 return compile(ptxSource); 252 }else{ 253 if (config->trace) { 254 std::cout << "compiling from provided cuda " << std::endl; 255 } 256 CudaSource cudaSource(len , source, false); 257 return compile(cudaSource); 258 } 259 } 260 261 /* 262 263 if (config->ptx) { 264 265 } else { 266 if (config->trace) { 267 std::cout << "compiling from cuda c99 " << std::endl; 268 } 269 if (config->showCode) { 270 std::cout << "cuda " << source << std::endl; 271 } 272 auto* cuda = new CudaSource(len, source, false); 273 ptx = nvcc(cuda); 274 } 275 if (config->showCode) { 276 std::cout << "ptx " << ptx->text << std::endl; 277 } 278 CUmodule module; 279 280 281 if (ptx->text != nullptr) { 282 constexpr unsigned int jitNumOptions = 2; 283 const auto jitOptions = new CUjit_option[jitNumOptions]; 284 const auto jitOptVals = new void *[jitNumOptions]; 285 286 // set up size of compilation log buffer 287 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 288 constexpr int jitLogBufferSize = 8192; 289 jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize); 290 291 // set up pointer to the compilation log buffer 292 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 293 auto jitLogBuffer = new char[jitLogBufferSize]; 294 jitOptVals[1] = jitLogBuffer; 295 cuCtxSetCurrent(context); 296 297 WHERE{ 298 .f = __FILE__, .l = __LINE__, 299 .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals), 300 .t = "cuModuleLoadDataEx" 301 }.report(); 302 if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){ 303 std::cout << "PTX log:" << jitLogBuffer << std::endl; 304 } 305 return new CudaModule(this, ptx->text, jitLogBuffer, true, module); 306 } else { 307 std::cout << "no ptx content!" << std::endl; 308 exit(1); 309 } 310 } */ 311 312 extern "C" long getBackend(int mode) { 313 long backendHandle = reinterpret_cast<long>(new CudaBackend(mode)); 314 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 315 return backendHandle; 316 } 317 318 void clCallback(void *) { 319 std::cerr << "start of compute" << std::endl; 320 } 321 322 323 void CudaBackend::computeEnd() { 324 queue->computeEnd(); 325 } 326 327 void CudaBackend::computeStart() { 328 queue->computeStart(); 329 } 330 331 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) { 332 if (config->traceCalls) { 333 std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," << 334 std::dec << memorySegmentLength << "){" << std::endl; 335 } 336 if (config->minimizeCopies) { 337 const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength); 338 if (bufferState->state == BufferState::DEVICE_OWNED) { 339 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr)); 340 if (config->traceEnqueues | config->traceCopies) { 341 std::cout << "copying buffer from device (from java access) " << std::endl; 342 } 343 queue->wait(); 344 queue->release(); 345 } else { 346 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty" << std::endl; 347 std::exit(1); 348 } 349 } else { 350 std::cerr << 351 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!" 352 << std::endl; 353 std::exit(1); 354 } 355 if (config->traceCalls) { 356 std::cout << "}getBufferFromDeviceIfDirty()" << std::endl; 357 } 358 return true; 359 } 360 361 CudaBackend *CudaBackend::of(const long backendHandle) { 362 return reinterpret_cast<CudaBackend *>(backendHandle); 363 } 364 365 CudaBackend *CudaBackend::of(Backend *backend) { 366 return dynamic_cast<CudaBackend *>(backend); 367 } 368 369 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) { 370 CudaBuffer *cudaBuffer = nullptr; 371 if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) { 372 cudaBuffer = new CudaBuffer(this, bufferState); 373 if (config->trace) { 374 std::cout << "We allocated arg buffer " << std::endl; 375 } 376 } else { 377 if (config->trace) { 378 std::cout << "Were reusing buffer buffer " << std::endl; 379 } 380 cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr); 381 } 382 return cudaBuffer; 383 }