1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
#include "cuda_backend.h"

#include <sys/wait.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <filesystem>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
 32 
// Default-construct an empty PTX holder (zero-length buffer); nvcc() uses
// this before read() fills it from the compiled .ptx file.
PtxSource::PtxSource()
    : Text(0L) {
}
 36 
// Construct a PTX holder with a pre-sized (len bytes) buffer.
PtxSource::PtxSource(size_t len)
    : Text(len) {
}
 40 
// Wrap existing PTX text. The `false` flag presumably means "do not copy"
// (borrow the caller's buffer) — TODO confirm against Text's constructor.
PtxSource::PtxSource(char *text)
    : Text(text, false) {
}
 44 
// Wrap len bytes of PTX text; passes `true`, presumably taking a private
// copy of the buffer (unlike the overloads that pass false) — verify in Text.
PtxSource::PtxSource(size_t len, char *text)
    : Text(len, text, true) {
}
// Wrap len bytes of PTX text; the caller chooses copy vs. borrow semantics
// via isCopy, forwarded straight to Text.
PtxSource::PtxSource(size_t len, char *text, bool isCopy)
    : Text(len, text, isCopy) {
}
 51 
// Construct a CUDA-source holder with a pre-sized buffer.
// NOTE(review): _lineInfo is not set here — relies on an in-class default
// initializer in the header; confirm one exists, otherwise lineInfo() reads
// an indeterminate value.
CudaSource::CudaSource(size_t len)
    : Text(len) {
}
 55 
// Wrap existing CUDA C source text without copying (flag `false`).
// NOTE(review): _lineInfo is not set here — see the size_t overload.
CudaSource::CudaSource(char *text)
    : Text(text, false) {
}
 59 
 60 CudaSource::CudaSource(size_t len, char *text, bool isCopy, bool lineinfo)
 61     : Text(len, text, isCopy) {
 62     _lineInfo = lineinfo;
 63 }
 64 
// Default-construct an empty CUDA-source holder.
// NOTE(review): _lineInfo is not set here either — confirm the header gives
// it a default initializer.
CudaSource::CudaSource()
    : Text(0) {
}
 68 
// Whether nvcc should be asked to emit -lineinfo for this source.
bool CudaSource::lineInfo() const {
    return _lineInfo;
}
 72 
 73 uint64_t timeSinceEpochMillisec() {
 74     using namespace std::chrono;
 75     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 76 }
 77 
 78 std::string tmpFileName(uint64_t time, const std::string directoryName, const std::string &suffix) {
 79     std::stringstream timestamp;
 80     timestamp << directoryName << "/tmp_" << time << suffix;
 81     return timestamp.str();
 82 }
 83 
// Initialize the CUDA driver, pick device 0, and create a context for it.
// Note the ordering: cuInit(0) runs in the member-init list (initStatus), so
// by the time the body executes the driver is either up or initStatus holds
// the failure code.
CudaBackend::CudaBackend(int configBits)
    : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
    int deviceCount = 0;

    if (initStatus == CUDA_SUCCESS) {
        CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
        if (config->info) {
            std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        }
        // Always binds to device 0; multi-GPU selection is not supported here.
        CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet");
        #if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
            // CUDA 12.8+ deprecates the 3-arg cuCtxCreate in favor of the
            // params-struct variant; default params preserve old behavior.
            CUctxCreateParams ctxCreateParams = {};
            CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate");
        #else
            // Invoke previous implementation with 3 parameters
            CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate");
        #endif
        if (config->info) {
            std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
        }
        // Queue needs the live context, so it is initialized last.
        dynamic_cast<CudaQueue *>(queue)->init();
    } else {
        CUDA_CHECK(initStatus, "cuInit() failed we seem to have the runtime library but no device");
    }
}
109 
// Tear down the driver context created in the constructor.
// NOTE(review): prints unconditionally rather than under config->info like
// the other trace output — possibly leftover debugging.
CudaBackend::~CudaBackend() {
    std::cout << "freeing context" << std::endl;
    CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy");
}
114 
115 void CudaBackend::shortDeviceInfo() {
116     char name[100];
117     CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");
118     std::cout << "[INFO] Using NVIDIA GPU: " << name << std::endl;
119 }
120 
121 void CudaBackend::showDeviceInfo() {
122     char name[100];
123     CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");
124 
125     std::cout << "> Using device 0: " << name << std::endl;
126 
127     // get compute capabilities and the device name
128     int major = 0, minor = 0;
129     CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute");
130     CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute");
131     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
132 
133     int warpSize;
134     CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute");
135     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
136 
137     int threadsPerBlock;
138     CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute");
139     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
140 
141     int cores;
142     CUDA_CHECK(cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute");
143     std::cout << "> GPU Cores " << cores << std::endl;
144 
145     size_t totalGlobalMem;
146     CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem");
147     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
148     std::cout << "  64-bit Memory Address:           " <<
149             ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
150 }
151 
152 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
153 
154     // create var/cuda directory
155     std::string localDirectory = "./var/cuda";
156     std::filesystem::create_directories(localDirectory);
157     // create temp file for cuda generarated code
158     const uint64_t time = timeSinceEpochMillisec();
159     const std::string ptxPath = tmpFileName(time, localDirectory, ".ptx");
160     const std::string cudaPath = tmpFileName(time, localDirectory, ".cu");
161 
162     // compile the generated code
163     int pid;
164     cudaSource->write(cudaPath);
165     if ((pid = fork()) == 0) { //child
166         const auto path = "nvcc";
167         std::vector<std::string> command;
168         command.push_back(path);
169         command.push_back("-ptx");
170         command.push_back("-Wno-deprecated-gpu-targets");
171         command.push_back(cudaPath);
172         if (cudaSource->lineInfo()) {
173             command.push_back("-lineinfo");
174         }
175         command.push_back("-o");
176         command.push_back(ptxPath);
177 
178         // conver to char*[]
179         const char* args[command.size() + 1];
180         for (int i = 0; i < command.size(); i++) {
181             args[i] = command[i].c_str();
182         }
183         args[command.size()] = nullptr;
184         const int stat = execvp(path, (char *const *) args);
185         std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
186         std::exit(errno);
187     } else if (pid < 0) {// fork failed.
188         std::cerr << "fork of nvcc failed" << std::endl;
189         std::exit(1);
190     } else { //parent
191         int status;
192         pid_t result = wait(&status);
193         auto *ptx = new PtxSource();
194         ptx->read(ptxPath);
195         return ptx;
196     }
197 }
198 
// Convenience overload: forward a reference to the pointer-based compiler.
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
    return compile(&cudaSource);
}
202 
// Compile CUDA C: run it through nvcc to get PTX, then JIT the PTX.
// NOTE(review): the heap-allocated PtxSource from nvcc() is never freed.
// It cannot simply be deleted here, because the CudaModule built downstream
// appears to retain ptx->text — confirm Text/CudaModule ownership before
// plugging this leak (see the commented-out `delete ptx` in compile(PtxSource*)).
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
    const PtxSource *ptxSource = nvcc(cudaSource);
    return compile(ptxSource);
}
207 
// Convenience overload: forward a reference to the pointer-based compiler.
CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
    return compile(&ptxSource);
}
211 
212 CudaBackend::CudaModule *CudaBackend::compile(const  PtxSource *ptx) {
213     CUmodule module;
214     if (ptx->text != nullptr) {
215         const Log *infLog = new Log(8192);
216         const Log *errLog = new Log(8192);
217         constexpr unsigned int optc = 5;
218         const auto jitOptions = new CUjit_option[optc];
219         auto jitOptVals = new void *[optc];
220 
221         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
222         jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
223         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
224         jitOptVals[1] = infLog->text;
225         jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
226         jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
227         jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
228         jitOptVals[3] = errLog->text;
229         jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
230         jitOptVals[4] = reinterpret_cast<void *>(1);
231 
232         CUDA_CHECK(cuCtxSetCurrent(context), "cuCtxSetCurrent");
233         CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx");
234 
235         if (*infLog->text!='\0'){
236            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
237         }
238         if (*errLog->text!='\0'){
239            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
240         }
241         return new CudaModule(this, ptx->text, infLog->text, true, module);
242 
243         //delete ptx;
244     } else {
245         std::cout << "no ptx content!" << std::endl;
246         exit(1);
247     }
248 }
249 
250 //Entry point from HAT.  We use the config PTX bit to determine which Source type
251 
252 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
253     if (config->traceCalls) {
254         std::cout << "inside compileProgram" << std::endl;
255     }
256 
257     if (config->ptx){
258         if (config->trace) {
259             std::cout << "compiling from provided  ptx " << std::endl;
260         }
261         PtxSource ptxSource(len, source, false);
262         return compile(ptxSource);
263     }else{
264         if (config->trace) {
265             std::cout << "compiling from provided  cuda " << std::endl;
266         }
267         CudaSource cudaSource(len , source, false, config->profileCudaKernel);
268         return compile(cudaSource);
269     }
270 }
271 
272 /*
273 
274     if (config->ptx) {
275 
276     } else {
277         if (config->trace) {
278             std::cout << "compiling from cuda c99 " << std::endl;
279         }
280         if (config->showCode) {
281             std::cout << "cuda " << source << std::endl;
282         }
283         auto* cuda = new CudaSource(len, source, false);
284         ptx = nvcc(cuda);
285     }
286     if (config->showCode) {
287         std::cout << "ptx " << ptx->text << std::endl;
288     }
289     CUmodule module;
290 
291 
292     if (ptx->text != nullptr) {
293         constexpr unsigned int jitNumOptions = 2;
294         const auto jitOptions = new CUjit_option[jitNumOptions];
295         const auto jitOptVals = new void *[jitNumOptions];
296 
297         // set up size of compilation log buffer
298         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
299         constexpr int jitLogBufferSize = 8192;
300         jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);
301 
302         // set up pointer to the compilation log buffer
303         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
304         auto jitLogBuffer = new char[jitLogBufferSize];
305         jitOptVals[1] = jitLogBuffer;
306         cuCtxSetCurrent(context);
307 
308         WHERE{
309             .f = __FILE__, .l = __LINE__,
310             .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
311             .t = "cuModuleLoadDataEx"
312         }.report();
313         if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){
314              std::cout << "PTX log:" << jitLogBuffer << std::endl;
315         }
316         return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
317     } else {
318         std::cout << "no ptx content!" << std::endl;
319         exit(1);
320     }
321 } */
322 
// C-linkage factory called from the Java side (via FFI): creates a backend
// and returns its address as an opaque handle.
// NOTE(review): `long` is 32-bit on LLP64 platforms (Windows) — a pointer
// would be truncated there; presumably only LP64 targets are supported.
extern "C" long getBackend(int mode) {
    long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
    //  std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
    return backendHandle;
}
328 
// Callback stub that just announces compute start.
// NOTE(review): the "cl" prefix suggests this was carried over from an
// OpenCL backend; nothing in this file registers it — verify it is still used.
void clCallback(void *) {
    std::cerr << "start of compute" << std::endl;
}
332 
// Delegate end-of-compute bookkeeping to the queue.
void CudaBackend::computeEnd() {
    queue->computeEnd();
}
336 
// Delegate start-of-compute bookkeeping to the queue.
void CudaBackend::computeStart() {
    queue->computeStart();
}
340 
341 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
342     if (config->traceCalls) {
343         std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
344                 std::dec << memorySegmentLength << "){" << std::endl;
345     }
346     if (config->minimizeCopies) {
347         const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
348         if (bufferState->state == BufferState::DEVICE_OWNED) {
349             queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
350             if (config->traceEnqueues | config->traceCopies) {
351                 std::cout << "copying buffer from device (from java access) " << std::endl;
352             }
353             queue->wait();
354             queue->release();
355         } else {
356             std::cout << "HOW DID WE GET HERE 1 attempting  to get buffer but buffer is not device dirty" << std::endl;
357             std::exit(1);
358         }
359     } else {
360         std::cerr <<
361                 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"
362                 << std::endl;
363         std::exit(1);
364     }
365     if (config->traceCalls) {
366         std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
367     }
368     return true;
369 }
370 
// Recover the backend from the opaque handle produced by getBackend().
CudaBackend *CudaBackend::of(const long backendHandle) {
    return reinterpret_cast<CudaBackend *>(backendHandle);
}
374 
// Checked downcast from the generic Backend; returns nullptr if `backend`
// is not actually a CudaBackend.
CudaBackend *CudaBackend::of(Backend *backend) {
    return dynamic_cast<CudaBackend *>(backend);
}
378 
// Return the device buffer backing `bufferState`, allocating a fresh one for
// new/unbound states and reusing the cached vendorPtr otherwise.
// NOTE(review): when state == NEW_STATE but vendorPtr is already non-null,
// this allocates a new CudaBuffer without releasing the old one — confirm
// whether that combination can occur (possible leak). Also assumes the
// CudaBuffer constructor stores itself into bufferState->vendorPtr; verify.
CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
    CudaBuffer *cudaBuffer = nullptr;
    if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
        cudaBuffer = new CudaBuffer(this, bufferState);
        if (config->trace) {
            std::cout << "We allocated arg buffer " << std::endl;
        }
        bufferState->state = BufferState::NEW_STATE;
    } else {
        if (config->trace) {
            std::cout << "Were reusing  buffer  buffer " << std::endl;
        }
        cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
    }
    return cudaBuffer;
}