/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <sys/wait.h>
#include <chrono>
#include "cuda_backend.h"
#include <iostream>
#include <sstream>   // std::stringstream (tmpFileName)
#include <cstring>   // std::strerror
#include <cerrno>    // errno
#include <cstdlib>   // std::exit
#include <unistd.h>  // fork, execvp

PtxSource::PtxSource()
    : Text(0L) {
}

PtxSource::PtxSource(size_t len)
    : Text(len) {
}

PtxSource::PtxSource(char *text)
    : Text(text, false) {
}

PtxSource::PtxSource(size_t len, char *text)
    : Text(len, text, true) {
}

PtxSource::PtxSource(size_t len, char *text, bool isCopy)
    : Text(len, text, isCopy) {
}

CudaSource::CudaSource(size_t len)
    : Text(len) {
}

CudaSource::CudaSource(char *text)
    : Text(text, false) {
}

CudaSource::CudaSource(size_t len, char *text, bool isCopy)
    : Text(len, text, isCopy) {
}

CudaSource::CudaSource()
    : Text(0) {
}

uint64_t timeSinceEpochMillisec() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

// Builds a unique temporary file name (./tmp<millis><suffix>) for the nvcc round trip.
std::string tmpFileName(uint64_t time, const std::string &suffix) {
    std::stringstream timestamp;
    timestamp << "./tmp" << time << suffix;
    return timestamp.str();
}

CudaBackend::CudaBackend(int configBits)
    : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
    int deviceCount = 0;

    if (initStatus == CUDA_SUCCESS) {
        CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
        std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet");
        CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate");
        std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
        dynamic_cast<CudaQueue *>(queue)->init();
    } else {
        CUDA_CHECK(initStatus, "cuInit() failed; we seem to have the runtime library but no device");
    }
}

CudaBackend::~CudaBackend() {
    std::cout << "freeing context" << std::endl;
    CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy");
}

void CudaBackend::info() {
    char name[100];
    CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");

    std::cout << "> Using device 0: " << name << std::endl;

    // get the compute capability of the device
    int major = 0, minor = 0;
    CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute");
    CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;

    int warpSize;
    CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has warpSize " << warpSize << std::endl;

    int threadsPerBlock;
    CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;

    int multiProcessorCount;
    CUDA_CHECK(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has multiProcessorCount " << multiProcessorCount << std::endl;

    size_t totalGlobalMem;
    CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem");
    std::cout << "  Total amount of global memory: " << (unsigned long long) totalGlobalMem << " bytes" << std::endl;
    std::cout << "  64-bit Memory Address: "
            << ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
}
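
// Compiles the given CUDA C source to PTX by writing it to a temporary .cu file and
// forking an nvcc child process; the parent waits for nvcc and then reads the generated
// .ptx file back into a PtxSource.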
PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
    const uint64_t time = timeSinceEpochMillisec();
    const std::string ptxPath = tmpFileName(time, ".ptx");
    const std::string cudaPath = tmpFileName(time, ".cu");
    pid_t pid;
    cudaSource->write(cudaPath);
    if ((pid = fork()) == 0) { // child: exec nvcc to compile the .cu file to .ptx
        const auto path = "/usr/local/cuda/bin/nvcc";
        const char *argv[]{
            "/usr/local/cuda/bin/nvcc",
            "-ptx",
            "-Wno-deprecated-gpu-targets",
            cudaPath.c_str(),
            "-o",
            ptxPath.c_str(),
            nullptr
        };
        // execvp only returns on failure
        const int stat = execvp(path, (char *const *) argv);
        std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
        std::exit(errno);
    } else if (pid < 0) { // fork failed
        std::cerr << "fork of nvcc failed" << std::endl;
        std::exit(1);
    } else { // parent: wait for the nvcc child, then read the generated PTX
        int status = 0;
        wait(&status);
        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
            std::cerr << "nvcc did not exit cleanly (status=" << status << ")" << std::endl;
            std::exit(1);
        }
        auto *ptx = new PtxSource();
        ptx->read(ptxPath);
        return ptx;
    }
}
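
// The compile() overloads below form a chain: CUDA C source is first turned into PTX via
// nvcc(), and the PTX is then JIT-compiled into a CUmodule with cuModuleLoadDataEx and
// wrapped in a CudaModule.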
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
    return compile(&cudaSource);
}

CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
    const PtxSource *ptxSource = nvcc(cudaSource);
    return compile(ptxSource);
}

CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
    return compile(&ptxSource);
}

CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) {
    CUmodule module;
    if (ptx->text != nullptr) {
        const Log *infLog = new Log(8192);
        const Log *errLog = new Log(8192);
        constexpr unsigned int optc = 5;
        const auto jitOptions = new CUjit_option[optc];
        auto jitOptVals = new void *[optc];

        // JIT options: capture the info and error logs and generate line info
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        jitOptVals[1] = infLog->text;
        jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
        jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
        jitOptVals[3] = errLog->text;
        jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
        jitOptVals[4] = reinterpret_cast<void *>(1);

        CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx");

        if (*infLog->text != '\0') {
            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
        }
        if (*errLog->text != '\0') {
            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
        }
        return new CudaModule(this, ptx->text, infLog->text, true, module);

        //delete ptx;
    } else {
        std::cerr << "no ptx content!" << std::endl;
        std::exit(1);
    }
}

// Entry point from HAT. We use the config PTX bit to determine which Source type we were given.
Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
    if (config->traceCalls) {
        std::cout << "inside compileProgram" << std::endl;
    }

    if (config->ptx) {
        if (config->trace) {
            std::cout << "compiling from provided ptx" << std::endl;
        }
        PtxSource ptxSource(len, source, false);
        return compile(ptxSource);
    } else {
        if (config->trace) {
            std::cout << "compiling from provided cuda" << std::endl;
        }
        CudaSource cudaSource(len, source, false);
        return compile(cudaSource);
    }
}

/*
    if (config->ptx) {

    } else {
        if (config->trace) {
            std::cout << "compiling from cuda c99 " << std::endl;
        }
        if (config->showCode) {
            std::cout << "cuda " << source << std::endl;
        }
        auto *cuda = new CudaSource(len, source, false);
        ptx = nvcc(cuda);
    }
    if (config->showCode) {
        std::cout << "ptx " << ptx->text << std::endl;
    }
    CUmodule module;

    if (ptx->text != nullptr) {
        constexpr unsigned int jitNumOptions = 2;
        const auto jitOptions = new CUjit_option[jitNumOptions];
        const auto jitOptVals = new void *[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        constexpr int jitLogBufferSize = 8192;
        jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        auto jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;
        cuCtxSetCurrent(context);

        WHERE{
            .f = __FILE__, .l = __LINE__,
            .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
            .t = "cuModuleLoadDataEx"
        }.report();
        if (jitLogBuffer != nullptr && *jitLogBuffer != '\0') {
            std::cout << "PTX log:" << jitLogBuffer << std::endl;
        }
        return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
    } else {
        std::cout << "no ptx content!" << std::endl;
        exit(1);
    }
} */
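
// Entry point called across the FFI boundary (presumably from the HAT Java side):
// allocates a CudaBackend and returns it as an opaque handle; CudaBackend::of(long)
// converts the handle back into a pointer.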
extern "C" long getBackend(int mode) {
    long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
    // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
    return backendHandle;
}

void clCallback(void *) {
    std::cerr << "start of compute" << std::endl;
}

void CudaBackend::computeEnd() {
    queue->computeEnd();
}

void CudaBackend::computeStart() {
    queue->computeStart();
}

bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
    if (config->traceCalls) {
        std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << ","
                << std::dec << memorySegmentLength << "){" << std::endl;
    }
    if (config->minimizeCopies) {
        const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
        if (bufferState->state == BufferState::DEVICE_OWNED) {
            queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
            if (config->traceEnqueues || config->traceCopies) {
                std::cout << "copying buffer from device (from java access)" << std::endl;
            }
            queue->wait();
            queue->release();
        } else {
            std::cerr << "HOW DID WE GET HERE? Attempting to get a buffer that is not device dirty" << std::endl;
            std::exit(1);
        }
    } else {
        std::cerr << "HOW DID WE GET HERE? The java side should avoid calling getBufferFromDeviceIfDirty when we are not minimizing copies!"
                << std::endl;
        std::exit(1);
    }
    if (config->traceCalls) {
        std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
    }
    return true;
}

CudaBackend *CudaBackend::of(const long backendHandle) {
    return reinterpret_cast<CudaBackend *>(backendHandle);
}

CudaBackend *CudaBackend::of(Backend *backend) {
    return dynamic_cast<CudaBackend *>(backend);
}

CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
    CudaBuffer *cudaBuffer = nullptr;
    if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
        cudaBuffer = new CudaBuffer(this, bufferState);
        if (config->trace) {
            std::cout << "We allocated arg buffer" << std::endl;
        }
    } else {
        if (config->trace) {
            std::cout << "We're reusing an existing arg buffer" << std::endl;
        }
        cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
    }
    return cudaBuffer;
}