1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include "cuda_backend.h" 29 #include <iostream> 30 31 PtxSource::PtxSource() 32 : Text(0L) { 33 } 34 35 PtxSource::PtxSource(size_t len) 36 : Text(len) { 37 } 38 39 PtxSource::PtxSource(char *text) 40 : Text(text, false) { 41 } 42 43 PtxSource::PtxSource(size_t len, char *text) 44 : Text(len, text, true) { 45 } 46 PtxSource::PtxSource(size_t len, char *text, bool isCopy) 47 : Text(len, text, isCopy) { 48 } 49 50 CudaSource::CudaSource(size_t len) 51 : Text(len) { 52 } 53 54 CudaSource::CudaSource(char *text) 55 : Text(text, false) { 56 } 57 58 CudaSource::CudaSource(size_t len, char *text, bool isCopy) 59 : Text(len, text, isCopy) { 60 } 61 62 CudaSource::CudaSource() 63 : Text(0) { 64 } 65 66 uint64_t timeSinceEpochMillisec() { 67 using namespace std::chrono; 68 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 69 } 70 71 std::string tmpFileName(uint64_t time, const std::string &suffix) { 72 std::stringstream timestamp; 73 timestamp << "./tmp" << time << suffix; 74 return timestamp.str(); 75 } 76 77 CudaBackend::CudaBackend(int configBits) 78 : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() { 79 int deviceCount = 0; 80 81 if (initStatus == CUDA_SUCCESS) { 82 CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount"); 83 std::cout << "CudaBackend device count = " << deviceCount << std::endl; 84 CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet"); 85 CUctxCreateParams ctxCreateParams = {}; 86 CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate"); 87 std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl; 88 dynamic_cast<CudaQueue *>(queue)->init(); 89 } else { 90 CUDA_CHECK(initStatus, "cuInit() failed we seem to have the runtime library but no device"); 91 } 92 } 93 94 CudaBackend::~CudaBackend() { 95 std::cout << "freeing context" << std::endl; 96 CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy"); 97 } 98 99 void CudaBackend::info() { 100 char name[100]; 101 CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName"); 102 103 std::cout << "> Using device 0: " << name << std::endl; 104 105 // get compute capabilities and the device name 106 int major = 0, minor = 0; 107 CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute"); 108 CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute"); 109 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 110 111 int warpSize; 112 CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute"); 113 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 114 115 int threadsPerBlock; 116 CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute"); 117 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 118 119 int cores; 120 CUDA_CHECK(cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute"); 121 std::cout << "> GPU Cores " << cores << std::endl; 122 123 size_t totalGlobalMem; 124 CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem"); 125 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 126 std::cout << " 64-bit Memory Address: " << 127 ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 128 } 129 130 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) { 131 const uint64_t time = timeSinceEpochMillisec(); 132 const std::string ptxPath = tmpFileName(time, ".ptx"); 133 const std::string cudaPath = tmpFileName(time, ".cu"); 134 int pid; 135 cudaSource->write(cudaPath); 136 if ((pid = fork()) == 0) { //child 137 const auto path = "/usr/local/cuda/bin/nvcc"; 138 const char *argv[] { 139 "/usr/local/cuda/bin/nvcc", 140 "-ptx", 141 "-Wno-deprecated-gpu-targets", 142 cudaPath.c_str(), 143 "-o", 144 ptxPath.c_str(), 145 nullptr 146 }; 147 const int stat = execvp(path, (char *const *) argv); 148 std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl; 149 std::exit(errno); 150 } else if (pid < 0) {// fork failed. 151 std::cerr << "fork of nvcc failed" << std::endl; 152 std::exit(1); 153 } else { //parent 154 int status; 155 pid_t result = wait(&status); 156 auto *ptx = new PtxSource(); 157 ptx->read(ptxPath); 158 return ptx; 159 } 160 } 161 162 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) { 163 return compile(&cudaSource); 164 } 165 166 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) { 167 const PtxSource *ptxSource = nvcc(cudaSource); 168 return compile(ptxSource); 169 } 170 171 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) { 172 return compile(&ptxSource); 173 } 174 175 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) { 176 CUmodule module; 177 if (ptx->text != nullptr) { 178 const Log *infLog = new Log(8192); 179 const Log *errLog = new Log(8192); 180 constexpr unsigned int optc = 5; 181 const auto jitOptions = new CUjit_option[optc]; 182 auto jitOptVals = new void *[optc]; 183 184 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 185 jitOptVals[0] = reinterpret_cast<void *>(infLog->len); 186 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 187 jitOptVals[1] = infLog->text; 188 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; 189 jitOptVals[2] = reinterpret_cast<void *>(errLog->len); 190 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; 191 jitOptVals[3] = errLog->text; 192 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO; 193 jitOptVals[4] = reinterpret_cast<void *>(1); 194 195 CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx"); 196 197 if (*infLog->text!='\0'){ 198 std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl; 199 } 200 if (*errLog->text!='\0'){ 201 std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl; 202 } 203 return new CudaModule(this, ptx->text, infLog->text, true, module); 204 205 //delete ptx; 206 } else { 207 std::cout << "no ptx content!" << std::endl; 208 exit(1); 209 } 210 } 211 212 //Entry point from HAT. We use the config PTX bit to determine which Source type 213 214 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) { 215 if (config->traceCalls) { 216 std::cout << "inside compileProgram" << std::endl; 217 } 218 219 if (config->ptx){ 220 if (config->trace) { 221 std::cout << "compiling from provided ptx " << std::endl; 222 } 223 PtxSource ptxSource(len, source, false); 224 return compile(ptxSource); 225 }else{ 226 if (config->trace) { 227 std::cout << "compiling from provided cuda " << std::endl; 228 } 229 CudaSource cudaSource(len , source, false); 230 return compile(cudaSource); 231 } 232 } 233 234 /* 235 236 if (config->ptx) { 237 238 } else { 239 if (config->trace) { 240 std::cout << "compiling from cuda c99 " << std::endl; 241 } 242 if (config->showCode) { 243 std::cout << "cuda " << source << std::endl; 244 } 245 auto* cuda = new CudaSource(len, source, false); 246 ptx = nvcc(cuda); 247 } 248 if (config->showCode) { 249 std::cout << "ptx " << ptx->text << std::endl; 250 } 251 CUmodule module; 252 253 254 if (ptx->text != nullptr) { 255 constexpr unsigned int jitNumOptions = 2; 256 const auto jitOptions = new CUjit_option[jitNumOptions]; 257 const auto jitOptVals = new void *[jitNumOptions]; 258 259 // set up size of compilation log buffer 260 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 261 constexpr int jitLogBufferSize = 8192; 262 jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize); 263 264 // set up pointer to the compilation log buffer 265 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 266 auto jitLogBuffer = new char[jitLogBufferSize]; 267 jitOptVals[1] = jitLogBuffer; 268 cuCtxSetCurrent(context); 269 270 WHERE{ 271 .f = __FILE__, .l = __LINE__, 272 .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals), 273 .t = "cuModuleLoadDataEx" 274 }.report(); 275 if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){ 276 std::cout << "PTX log:" << jitLogBuffer << std::endl; 277 } 278 return new CudaModule(this, ptx->text, jitLogBuffer, true, module); 279 } else { 280 std::cout << "no ptx content!" << std::endl; 281 exit(1); 282 } 283 } */ 284 285 extern "C" long getBackend(int mode) { 286 long backendHandle = reinterpret_cast<long>(new CudaBackend(mode)); 287 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 288 return backendHandle; 289 } 290 291 void clCallback(void *) { 292 std::cerr << "start of compute" << std::endl; 293 } 294 295 void CudaBackend::computeEnd() { 296 queue->computeEnd(); 297 } 298 299 void CudaBackend::computeStart() { 300 queue->computeStart(); 301 } 302 303 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) { 304 if (config->traceCalls) { 305 std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," << 306 std::dec << memorySegmentLength << "){" << std::endl; 307 } 308 if (config->minimizeCopies) { 309 const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength); 310 if (bufferState->state == BufferState::DEVICE_OWNED) { 311 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr)); 312 if (config->traceEnqueues | config->traceCopies) { 313 std::cout << "copying buffer from device (from java access) " << std::endl; 314 } 315 queue->wait(); 316 queue->release(); 317 } else { 318 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty" << std::endl; 319 std::exit(1); 320 } 321 } else { 322 std::cerr << 323 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!" 324 << std::endl; 325 std::exit(1); 326 } 327 if (config->traceCalls) { 328 std::cout << "}getBufferFromDeviceIfDirty()" << std::endl; 329 } 330 return true; 331 } 332 333 CudaBackend *CudaBackend::of(const long backendHandle) { 334 return reinterpret_cast<CudaBackend *>(backendHandle); 335 } 336 337 CudaBackend *CudaBackend::of(Backend *backend) { 338 return dynamic_cast<CudaBackend *>(backend); 339 } 340 341 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) { 342 CudaBuffer *cudaBuffer = nullptr; 343 if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) { 344 cudaBuffer = new CudaBuffer(this, bufferState); 345 if (config->trace) { 346 std::cout << "We allocated arg buffer " << std::endl; 347 } 348 } else { 349 if (config->trace) { 350 std::cout << "Were reusing buffer buffer " << std::endl; 351 } 352 cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr); 353 } 354 return cudaBuffer; 355 }