1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include "cuda_backend.h" 29 30 31 PtxSource::PtxSource() 32 : Text(0L) { 33 } 34 PtxSource::PtxSource(size_t len) 35 : Text(len) { 36 } 37 PtxSource::PtxSource(char *text) 38 : Text(text, false) { 39 } 40 PtxSource::PtxSource(size_t len, char *text) 41 : Text(len, text , true) { 42 } 43 CudaSource::CudaSource(size_t len) 44 : Text(len) { 45 } 46 CudaSource::CudaSource(char *text) 47 : Text(text, false) { 48 } 49 CudaSource::CudaSource(size_t len, char *text, bool isCopy) 50 :Text(len, text, isCopy){ 51 52 } 53 CudaSource::CudaSource() 54 : Text(0) { 55 } 56 uint64_t timeSinceEpochMillisec() { 57 using namespace std::chrono; 58 return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(); 59 } 60 61 std::string tmpFileName(uint64_t time, const std::string& suffix){ 62 std::stringstream timestamp; 63 timestamp << "./tmp" << time << suffix; 64 return timestamp.str(); 65 } 66 67 PtxSource *PtxSource::nvcc(const char *cudaSource, size_t len) { 68 CudaSource cSource(len,(char*)cudaSource,false); 69 70 uint64_t time = timeSinceEpochMillisec(); 71 std::string ptxPath = tmpFileName(time, ".ptx"); 72 std::string cudaPath = tmpFileName(time, ".cu"); 73 // we are going to fork exec nvcc 74 int pid; 75 cSource.write(cudaPath); 76 if ((pid = fork()) == 0) { 77 const char *path = "/usr/local/cuda-12.2/bin/nvcc"; 78 const char *argv[]{"nvcc", "-ptx", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr}; 79 // std::cerr << "child about to exec nvcc" << std::endl; 80 // std::cerr << "path " << path<< " " << argv[1]<< " " << argv[2]<< " " << argv[3]<< " " << argv[4]<< std::endl; 81 int stat = execvp(path, (char *const *) argv); 82 std::cerr << " nvcc stat = "<<stat << " errno="<< errno<< " '"<< std::strerror(errno)<< "'"<<std::endl; 83 std::exit(errno); 84 } else if (pid < 0) { 85 // fork failed. 86 std::cerr << "fork of nvcc failed" << std::endl; 87 std::exit(1); 88 } else { 89 int status; 90 // std::cerr << "parent waiting for child nvcc exec" << std::endl; 91 pid_t result = wait(&status); 92 //std::cerr << "child finished should be safe to read "<< ptxPath << std::endl; 93 PtxSource *ptx= new PtxSource(); 94 ptx->read(ptxPath); 95 return ptx; 96 } 97 std::cerr << "we should never get here !"; 98 exit(1); 99 return nullptr; 100 } 101 102 103 CudaBackend::CudaBackend(int configBits) 104 : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(),context() { 105 int deviceCount = 0; 106 107 if (initStatus == CUDA_SUCCESS) { 108 WHERE{.f=__FILE__, .l=__LINE__, 109 .e=cuDeviceGetCount(&deviceCount), 110 .t="cuDeviceGetCount" 111 }.report(); 112 std::cout << "CudaBackend device count = "<< deviceCount << std::endl; 113 WHERE{.f=__FILE__, .l=__LINE__, 114 .e=cuDeviceGet(&device, 0), 115 .t="cuDeviceGet" 116 }.report(); 117 WHERE{.f=__FILE__, .l=__LINE__, 118 .e=cuCtxCreate(&context, 0, device), 119 .t="cuCtxCreate" 120 }.report(); 121 std::cout << "CudaBackend context created ok (id="<<context<<")" << std::endl; 122 dynamic_cast<CudaQueue *>(queue)->init(); 123 } else { 124 WHERE{.f=__FILE__, .l=__LINE__, 125 .e=initStatus, 126 "cuInit() failed we seem to have the runtime library but no device" 127 }.report(); 128 } 129 } 130 131 //CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) { 132 // 133 //} 134 135 CudaBackend::~CudaBackend() { 136 std::cout << "freeing context" << std::endl; 137 WHERE{.f=__FILE__, .l=__LINE__, 138 .e=cuCtxDestroy(context), 139 .t="cuCtxDestroy" 140 }.report(); 141 } 142 143 void CudaBackend::info() { 144 char name[100]; 145 cuDeviceGetName(name, sizeof(name), device); 146 std::cout << "> Using device 0: " << name << std::endl; 147 148 // get compute capabilities and the devicename 149 int major = 0, minor = 0; 150 cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); 151 cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device); 152 std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl; 153 154 int warpSize; 155 cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); 156 std::cout << "> GPU Device has warpSize " << warpSize << std::endl; 157 158 int threadsPerBlock; 159 cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); 160 std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl; 161 162 int cores; 163 cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); 164 std::cout << "> GPU Cores " << cores << std::endl; 165 166 size_t totalGlobalMem; 167 cuDeviceTotalMem(&totalGlobalMem, device); 168 std::cout << " Total amount of global memory: " << (unsigned long long) totalGlobalMem << std::endl; 169 std::cout << " 64-bit Memory Address: " << 170 ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl; 171 172 } 173 174 PtxSource *CudaBackend::nvcc(CudaSource *cudaSource){ 175 uint64_t time = timeSinceEpochMillisec(); 176 std::string ptxPath = tmpFileName(time, ".ptx"); 177 std::string cudaPath = tmpFileName(time, ".cu"); 178 // we are going to fork exec nvcc so we need to write the cuda source to disk 179 int pid; 180 cudaSource->write(cudaPath); 181 if ((pid = fork()) == 0) { 182 const char *path = "/usr/local/cuda-12.2/bin/nvcc"; 183 const char *argv[]{"nvcc", "-ptx", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr}; 184 // std::cerr << "child about to exec nvcc" << std::endl; 185 // std::cerr << "path " << path<< " " << argv[1]<< " " << argv[2]<< " " << argv[3]<< " " << argv[4]<< std::endl; 186 int stat = execvp(path, (char *const *) argv); 187 std::cerr << " nvcc stat = "<<stat << " errno="<< errno<< " '"<< std::strerror(errno)<< "'"<<std::endl; 188 std::exit(errno); 189 } else if (pid < 0) { 190 // fork failed. 191 std::cerr << "fork of nvcc failed" << std::endl; 192 std::exit(1); 193 } else { 194 int status; 195 // std::cerr << "parent waiting for child nvcc exec" << std::endl; 196 pid_t result = wait(&status); 197 //std::cerr << "child finished should be safe to read "<< ptxPath << std::endl; 198 PtxSource *ptx= new PtxSource(); 199 ptx->read(ptxPath); 200 return ptx; 201 } 202 std::cerr << "we should never get here !"; 203 exit(1); 204 return nullptr; 205 206 } 207 CudaBackend::CudaModule * CudaBackend::compile(CudaSource &cudaSource) { 208 return compile(&cudaSource); 209 } 210 CudaBackend::CudaModule * CudaBackend::compile(CudaSource *cudaSource) { 211 PtxSource *ptx = nvcc(cudaSource); 212 CUmodule module; 213 // std::cout << "inside compile" << std::endl; 214 // std::cout << "cuda " << cudaSource->text << std::endl; 215 if (ptx->text != nullptr) { 216 std::cout << "ptx " << ptx->text << std::endl; 217 Log *infLog = new Log(8192); 218 Log *errLog = new Log(8192); 219 const unsigned int optc = 5; 220 auto jitOptions = new CUjit_option[optc]; 221 void **jitOptVals = new void *[optc]; 222 223 224 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;jitOptVals[0] = (void *) (size_t) infLog->len; 225 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; jitOptVals[1] = infLog->text; 226 jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;jitOptVals[2] = (void *) (size_t) errLog->len; 227 jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; jitOptVals[3] = errLog->text; 228 jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;jitOptVals[4] = (void *)1; 229 230 WHERE{.f=__FILE__, .l=__LINE__, 231 .e=cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), 232 .t="cuModuleLoadDataEx" 233 }.report(); 234 std::cout <<"> PTX JIT inflog:"<<std::endl << infLog->text << std::endl; 235 std::cout <<"> PTX JIT errlog:"<<std::endl << errLog->text << std::endl; 236 return new CudaModule(this, ptx->text,infLog->text,true, module); 237 238 //delete ptx; 239 } else { 240 std::cout << "no ptx content!" << std::endl; 241 exit(1); 242 } 243 } 244 245 Backend::CompilationUnit * CudaBackend::compile(int len, char *source) { 246 if (config->traceCalls) { 247 std::cout << "inside compileProgram" << std::endl; 248 } 249 PtxSource *ptx = nullptr; 250 if (config->ptx){ 251 if (config->trace) { 252 std::cout << "compiling from ptx " << std::endl; 253 } 254 ptx = new PtxSource(len,source); 255 }else { 256 ptx = PtxSource::nvcc(source, len); 257 if (config->traceCalls) { 258 std::cout << "compiling from cuda c99 "<<std::endl; 259 } 260 if (config->showCode){ 261 std::cout << "cuda " << source << std::endl; 262 } 263 264 } 265 if (config->showCode){ 266 std::cout << "ptx " << ptx->text << std::endl; 267 } 268 CUmodule module; 269 270 271 if (ptx->text != nullptr) { 272 273 // in this branch we use compilation with parameters 274 const unsigned int jitNumOptions = 2; 275 auto jitOptions = new CUjit_option[jitNumOptions]; 276 void **jitOptVals = new void *[jitNumOptions]; 277 278 // set up size of compilation log buffer 279 jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; 280 int jitLogBufferSize = 8192; 281 jitOptVals[0] = (void *) (size_t) jitLogBufferSize; 282 283 // set up pointer to the compilation log buffer 284 jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; 285 char *jitLogBuffer = new char[jitLogBufferSize]; 286 jitOptVals[1] = jitLogBuffer; 287 cuCtxSetCurrent(context); 288 289 WHERE{.f=__FILE__, .l=__LINE__, 290 .e=cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals), 291 .t="cuModuleLoadDataEx" 292 }.report(); 293 std::cout <<"PTX log:"<< jitLogBuffer << std::endl; 294 return dynamic_cast<Backend::CompilationUnit *>(new CudaModule(this, ptx->text, jitLogBuffer, true, 295 module)); 296 // return reinterpret_cast<long>(new CudaModule(this, ptx->text,jitLogBuffer,true, module)); 297 // 298 //delete 299 } else { 300 std::cout << "no ptx content!" << std::endl; 301 exit(1); 302 } 303 304 } 305 extern "C" long getBackend(int mode) { 306 long backendHandle= reinterpret_cast<long>(new CudaBackend(mode)); 307 // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl; 308 return backendHandle; 309 } 310 311 void clCallback(void *){ 312 std::cerr<<"start of compute"<<std::endl; 313 } 314 315 316 317 void CudaBackend::computeEnd(){ 318 queue->computeEnd(); 319 320 } 321 void CudaBackend::computeStart(){ 322 queue->computeStart(); 323 } 324 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength){ 325 if (config->traceCalls){ 326 std::cout << "getBufferFromDeviceIfDirty(" <<std::hex << (long)memorySegment << "," << std::dec<< memorySegmentLength <<"){"<<std::endl; 327 } 328 if (config->minimizeCopies){ 329 BufferState * bufferState = BufferState::of(memorySegment,memorySegmentLength); 330 if (bufferState->state == BufferState::DEVICE_OWNED){ 331 queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr)); 332 if (config->traceEnqueues | config->traceCopies){ 333 std::cout << "copying buffer from device (from java access) "<< std::endl; 334 } 335 queue->wait(); 336 queue->release(); 337 }else{ 338 std::cout << "HOW DID WE GET HERE 1 attempting to get buffer but buffer is not device dirty"<<std::endl; 339 std::exit(1); 340 } 341 }else{ 342 std::cerr << "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"<<std::endl; 343 std::exit(1); 344 } 345 if (config->traceCalls){ 346 std::cout << "}getBufferFromDeviceIfDirty()"<<std::endl; 347 } 348 return true; 349 350 } 351 352 CudaBackend * CudaBackend::of(long backendHandle){ 353 return reinterpret_cast<CudaBackend *>(backendHandle); 354 } 355 CudaBackend * CudaBackend::of(Backend *backend){ 356 return dynamic_cast<CudaBackend *>(backend); 357 } 358 359 CudaBackend::CudaBuffer * CudaBackend::getOrCreateBuffer(BufferState *bufferState) { 360 CudaBuffer *cudaBuffer = nullptr; 361 if (bufferState->vendorPtr == 0L || bufferState->state == BufferState::NEW_STATE){ 362 cudaBuffer = new CudaBuffer(this, bufferState); 363 if (config->trace){ 364 std::cout << "We allocated arg buffer "<<std::endl; 365 } 366 }else{ 367 if (config->trace){ 368 std::cout << "Were reusing buffer buffer "<<std::endl; 369 } 370 cudaBuffer= static_cast<CudaBuffer*>(bufferState->vendorPtr); 371 } 372 return cudaBuffer; 373 }