1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include "cuda_backend.h"
 29 #include <iostream>
 30 
 31 PtxSource::PtxSource()
 32     : Text(0L) {
 33 }
 34 
 35 PtxSource::PtxSource(size_t len)
 36     : Text(len) {
 37 }
 38 
 39 PtxSource::PtxSource(char *text)
 40     : Text(text, false) {
 41 }
 42 
 43 PtxSource::PtxSource(size_t len, char *text)
 44     : Text(len, text, true) {
 45 }
 46 PtxSource::PtxSource(size_t len, char *text, bool isCopy)
 47     : Text(len, text, isCopy) {
 48 }
 49 
 50 CudaSource::CudaSource(size_t len)
 51     : Text(len) {
 52 }
 53 
 54 CudaSource::CudaSource(char *text)
 55     : Text(text, false) {
 56 }
 57 
 58 CudaSource::CudaSource(size_t len, char *text, bool isCopy, bool lineinfo)
 59     : Text(len, text, isCopy) {
 60     _lineInfo = lineinfo;
 61 }
 62 
 63 CudaSource::CudaSource()
 64     : Text(0) {
 65 }
 66 
 67 bool CudaSource::lineInfo() const {
 68     return _lineInfo;
 69 }
 70 
 71 uint64_t timeSinceEpochMillisec() {
 72     using namespace std::chrono;
 73     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 74 }
 75 
 76 std::string tmpFileName(uint64_t time, const std::string &suffix) {
 77     std::stringstream timestamp;
 78     timestamp << "./tmp" << time << suffix;
 79     return timestamp.str();
 80 }
 81 
// Constructs the CUDA backend: initialises the driver API, binds to device 0,
// creates a context on it, then initialises the queue.
// NOTE: cuInit(0) runs in the member-initialiser list, so it executes before
// the constructor body regardless of where it appears textually.
CudaBackend::CudaBackend(int configBits)
    : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
    int deviceCount = 0;

    if (initStatus == CUDA_SUCCESS) {
        CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
        if (config->info) {
            std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        }
        // Always bind to the first device, regardless of deviceCount.
        CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet");
        #if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
            // CUDA 12.8+ toolkits: use the _v4 entry point with explicit params.
            CUctxCreateParams ctxCreateParams = {};
            CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate");
        #else
            // Invoke previous implementation with 3 parameters
            CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate");
        #endif
        if (config->info) {
            std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
        }
        dynamic_cast<CudaQueue *>(queue)->init();
    } else {
        // cuInit failed: report it through the usual check/report macro.
        CUDA_CHECK(initStatus, "cuInit() failed we seem to have the runtime library but no device");
    }
}
107 
108 CudaBackend::~CudaBackend() {
109     std::cout << "freeing context" << std::endl;
110     CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy");
111 }
112 
113 void CudaBackend::info() {
114     char name[100];
115     CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");
116 
117     std::cout << "> Using device 0: " << name << std::endl;
118 
119     // get compute capabilities and the device name
120     int major = 0, minor = 0;
121     CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute");
122     CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute");
123     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
124 
125     int warpSize;
126     CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute");
127     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
128 
129     int threadsPerBlock;
130     CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute");
131     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
132 
133     int cores;
134     CUDA_CHECK(cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute");
135     std::cout << "> GPU Cores " << cores << std::endl;
136 
137     size_t totalGlobalMem;
138     CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem");
139     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
140     std::cout << "  64-bit Memory Address:           " <<
141             ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
142 }
143 
144 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
145     const uint64_t time = timeSinceEpochMillisec();
146     const std::string ptxPath = tmpFileName(time, ".ptx");
147     const std::string cudaPath = tmpFileName(time, ".cu");
148     int pid;
149     cudaSource->write(cudaPath);
150     if ((pid = fork()) == 0) { //child
151         const auto path = "/usr/local/cuda/bin/nvcc";
152         std::vector<std::string> command;
153         command.push_back(path);
154         command.push_back("-ptx");
155         command.push_back("-Wno-deprecated-gpu-targets");
156         command.push_back(cudaPath);
157         if (cudaSource->lineInfo()) {
158             command.push_back("-lineinfo");
159         }
160         command.push_back("-o");
161         command.push_back(ptxPath);
162 
163         // conver to char*[]
164         const char* args[command.size() + 1];
165         for (int i = 0; i < command.size(); i++) {
166             args[i] = command[i].c_str();
167         }
168         args[command.size()] = nullptr;
169         const int stat = execvp(path, (char *const *) args);
170         std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
171         std::exit(errno);
172     } else if (pid < 0) {// fork failed.
173         std::cerr << "fork of nvcc failed" << std::endl;
174         std::exit(1);
175     } else { //parent
176         int status;
177         pid_t result = wait(&status);
178         auto *ptx = new PtxSource();
179         ptx->read(ptxPath);
180         return ptx;
181     }
182 }
183 
184 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
185     return compile(&cudaSource);
186 }
187 
// Compiles CUDA C by first shelling out to nvcc (producing PTX) and then
// JIT-loading that PTX.
// NOTE(review): ptxSource is heap-allocated and never freed here.  The
// CudaModule returned by the PTX compile keeps a pointer into ptxSource's
// text, so freeing it here would dangle — confirm Text ownership semantics
// before adding a delete.
CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
    const PtxSource *ptxSource = nvcc(cudaSource);
    return compile(ptxSource);
}
192 
193 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
194     return compile(&ptxSource);
195 }
196 
197 CudaBackend::CudaModule *CudaBackend::compile(const  PtxSource *ptx) {
198     CUmodule module;
199     if (ptx->text != nullptr) {
200         const Log *infLog = new Log(8192);
201         const Log *errLog = new Log(8192);
202         constexpr unsigned int optc = 5;
203         const auto jitOptions = new CUjit_option[optc];
204         auto jitOptVals = new void *[optc];
205 
206         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
207         jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
208         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
209         jitOptVals[1] = infLog->text;
210         jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
211         jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
212         jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
213         jitOptVals[3] = errLog->text;
214         jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
215         jitOptVals[4] = reinterpret_cast<void *>(1);
216 
217         CUDA_CHECK(cuCtxSetCurrent(context), "cuCtxSetCurrent");
218         CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx");
219 
220         if (*infLog->text!='\0'){
221            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
222         }
223         if (*errLog->text!='\0'){
224            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
225         }
226         return new CudaModule(this, ptx->text, infLog->text, true, module);
227 
228         //delete ptx;
229     } else {
230         std::cout << "no ptx content!" << std::endl;
231         exit(1);
232     }
233 }
234 
235 //Entry point from HAT.  We use the config PTX bit to determine which Source type
236 
237 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
238     if (config->traceCalls) {
239         std::cout << "inside compileProgram" << std::endl;
240     }
241 
242     if (config->ptx){
243         if (config->trace) {
244             std::cout << "compiling from provided  ptx " << std::endl;
245         }
246         PtxSource ptxSource(len, source, false);
247         return compile(ptxSource);
248     }else{
249         if (config->trace) {
250             std::cout << "compiling from provided  cuda " << std::endl;
251         }
252         CudaSource cudaSource(len , source, false, config->profileCudaKernel);
253         return compile(cudaSource);
254     }
255 }
256 
257 /*
258 
259     if (config->ptx) {
260 
261     } else {
262         if (config->trace) {
263             std::cout << "compiling from cuda c99 " << std::endl;
264         }
265         if (config->showCode) {
266             std::cout << "cuda " << source << std::endl;
267         }
268         auto* cuda = new CudaSource(len, source, false);
269         ptx = nvcc(cuda);
270     }
271     if (config->showCode) {
272         std::cout << "ptx " << ptx->text << std::endl;
273     }
274     CUmodule module;
275 
276 
277     if (ptx->text != nullptr) {
278         constexpr unsigned int jitNumOptions = 2;
279         const auto jitOptions = new CUjit_option[jitNumOptions];
280         const auto jitOptVals = new void *[jitNumOptions];
281 
282         // set up size of compilation log buffer
283         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
284         constexpr int jitLogBufferSize = 8192;
285         jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);
286 
287         // set up pointer to the compilation log buffer
288         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
289         auto jitLogBuffer = new char[jitLogBufferSize];
290         jitOptVals[1] = jitLogBuffer;
291         cuCtxSetCurrent(context);
292 
293         WHERE{
294             .f = __FILE__, .l = __LINE__,
295             .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
296             .t = "cuModuleLoadDataEx"
297         }.report();
298         if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){
299              std::cout << "PTX log:" << jitLogBuffer << std::endl;
300         }
301         return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
302     } else {
303         std::cout << "no ptx content!" << std::endl;
304         exit(1);
305     }
306 } */
307 
308 extern "C" long getBackend(int mode) {
309     long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
310     //  std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
311     return backendHandle;
312 }
313 
// Callback hook: announces the start of a compute on stderr.
void clCallback(void *) {
    std::cerr << "start of compute" << std::endl;
}
317 
318 void CudaBackend::computeEnd() {
319     queue->computeEnd();
320 }
321 
322 void CudaBackend::computeStart() {
323     queue->computeStart();
324 }
325 
326 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
327     if (config->traceCalls) {
328         std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
329                 std::dec << memorySegmentLength << "){" << std::endl;
330     }
331     if (config->minimizeCopies) {
332         const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
333         if (bufferState->state == BufferState::DEVICE_OWNED) {
334             queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
335             if (config->traceEnqueues | config->traceCopies) {
336                 std::cout << "copying buffer from device (from java access) " << std::endl;
337             }
338             queue->wait();
339             queue->release();
340         } else {
341             std::cout << "HOW DID WE GET HERE 1 attempting  to get buffer but buffer is not device dirty" << std::endl;
342             std::exit(1);
343         }
344     } else {
345         std::cerr <<
346                 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"
347                 << std::endl;
348         std::exit(1);
349     }
350     if (config->traceCalls) {
351         std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
352     }
353     return true;
354 }
355 
356 CudaBackend *CudaBackend::of(const long backendHandle) {
357     return reinterpret_cast<CudaBackend *>(backendHandle);
358 }
359 
360 CudaBackend *CudaBackend::of(Backend *backend) {
361     return dynamic_cast<CudaBackend *>(backend);
362 }
363 
364 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
365     CudaBuffer *cudaBuffer = nullptr;
366     if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
367         cudaBuffer = new CudaBuffer(this, bufferState);
368         if (config->trace) {
369             std::cout << "We allocated arg buffer " << std::endl;
370         }
371     } else {
372         if (config->trace) {
373             std::cout << "Were reusing  buffer  buffer " << std::endl;
374         }
375         cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
376     }
377     return cudaBuffer;
378 }