1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
#include <sys/wait.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstring>
#include <iostream>

#include "cuda_backend.h"
 31 
 32 
 33 PtxSource::PtxSource()
 34     : Text(0L) {
 35 }
 36 
 37 PtxSource::PtxSource(size_t len)
 38     : Text(len) {
 39 }
 40 
 41 PtxSource::PtxSource(char *text)
 42     : Text(text, false) {
 43 }
 44 
 45 PtxSource::PtxSource(size_t len, char *text)
 46     : Text(len, text, true) {
 47 }
 48 PtxSource::PtxSource(size_t len, char *text, bool isCopy)
 49     : Text(len, text, isCopy) {
 50 }
 51 
 52 CudaSource::CudaSource(size_t len)
 53     : Text(len) {
 54 }
 55 
 56 CudaSource::CudaSource(char *text)
 57     : Text(text, false) {
 58 }
 59 
 60 CudaSource::CudaSource(size_t len, char *text, bool isCopy)
 61     : Text(len, text, isCopy) {
 62 }
 63 
 64 CudaSource::CudaSource()
 65     : Text(0) {
 66 }
 67 
 68 uint64_t timeSinceEpochMillisec() {
 69     using namespace std::chrono;
 70     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 71 }
 72 
 73 std::string tmpFileName(uint64_t time, const std::string &suffix) {
 74     std::stringstream timestamp;
 75     timestamp << "./tmp" << time << suffix;
 76     return timestamp.str();
 77 }
 78 
 79 
 80 
 81 CudaBackend::CudaBackend(int configBits)
 82     : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
 83     int deviceCount = 0;
 84 
 85     if (initStatus == CUDA_SUCCESS) {
 86         WHERE{
 87             .f = __FILE__, .l = __LINE__,
 88             .e = cuDeviceGetCount(&deviceCount),
 89             .t = "cuDeviceGetCount"
 90         }.report();
 91         std::cout << "CudaBackend device count = " << deviceCount << std::endl;
 92         WHERE{
 93             .f = __FILE__, .l = __LINE__,
 94             .e = cuDeviceGet(&device, 0),
 95             .t = "cuDeviceGet"
 96         }.report();
 97         WHERE{
 98             .f = __FILE__, .l = __LINE__,
 99             .e = cuCtxCreate(&context, 0, device),
100             .t = "cuCtxCreate"
101         }.report();
102         std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
103         dynamic_cast<CudaQueue *>(queue)->init();
104     } else {
105         WHERE{
106             .f = __FILE__, .l = __LINE__,
107             .e = initStatus,
108             "cuInit() failed we seem to have the runtime library but no device"
109         }.report();
110     }
111 }
112 
113 
114 CudaBackend::~CudaBackend() {
115     std::cout << "freeing context" << std::endl;
116     WHERE{
117         .f = __FILE__, .l = __LINE__,
118         .e = cuCtxDestroy(context),
119         .t = "cuCtxDestroy"
120     }.report();
121 }
122 
123 void CudaBackend::info() {
124     char name[100];
125     cuDeviceGetName(name, sizeof(name), device);
126     std::cout << "> Using device 0: " << name << std::endl;
127 
128     // get compute capabilities and the devicename
129     int major = 0, minor = 0;
130     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
131     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
132     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
133 
134     int warpSize;
135     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
136     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
137 
138     int threadsPerBlock;
139     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
140     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
141 
142     int cores;
143     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
144     std::cout << "> GPU Cores " << cores << std::endl;
145 
146     size_t totalGlobalMem;
147     cuDeviceTotalMem(&totalGlobalMem, device);
148     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
149     std::cout << "  64-bit Memory Address:           " <<
150             ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
151 }
152 
153 
154 
155 
156 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
157   //std::cout << "inside nvcc" << std::endl;
158     const uint64_t time = timeSinceEpochMillisec();
159     const std::string ptxPath = tmpFileName(time, ".ptx");
160     const std::string cudaPath = tmpFileName(time, ".cu");
161     int pid;
162     cudaSource->write(cudaPath);
163     if ((pid = fork()) == 0) { //child
164         const auto path = "/usr/local/cuda/bin/nvcc";
165         const char *argv[]{  "/usr/local/cuda/bin/nvcc", "-ptx", "-Wno-deprecated-gpu-targets", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr};
166         const int stat = execvp(path, (char *const *) argv);
167         std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
168         std::exit(errno);
169     } else if (pid < 0) {// fork failed.
170         std::cerr << "fork of nvcc failed" << std::endl;
171         std::exit(1);
172     } else { //parent
173         int status;
174         pid_t result = wait(&status);
175         auto *ptx = new PtxSource();
176         ptx->read(ptxPath);
177         return ptx;
178     }
179 }
180 
181 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
182     return compile(&cudaSource);
183 }
184 
185 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
186     const PtxSource *ptxSource = nvcc(cudaSource);
187     return compile(ptxSource);
188 }
189 
190 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
191     return compile(&ptxSource);
192 }
193 
194 CudaBackend::CudaModule *CudaBackend::compile(const  PtxSource *ptx) {
195 
196     CUmodule module;
197      // std::cout << "inside compile" << std::endl;
198     // std::cout << "cuda " << cudaSource->text << std::endl;
199     if (ptx->text != nullptr) {
200        // std::cout << "ptx " << ptx->text << std::endl;
201         const Log *infLog = new Log(8192);
202         const Log *errLog = new Log(8192);
203         constexpr unsigned int optc = 5;
204         const auto jitOptions = new CUjit_option[optc];
205         auto jitOptVals = new void *[optc];
206 
207 
208         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
209         jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
210         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
211         jitOptVals[1] = infLog->text;
212         jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
213         jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
214         jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
215         jitOptVals[3] = errLog->text;
216         jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
217         jitOptVals[4] = reinterpret_cast<void *>(1);
218 
219         WHERE{
220             .f = __FILE__, .l = __LINE__,
221             .e = cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals),
222             .t = "cuModuleLoadDataEx"
223         }.report();
224         if (*infLog->text!='\0'){
225            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
226         }
227         if (*errLog->text!='\0'){
228            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
229         }
230         return new CudaModule(this, ptx->text, infLog->text, true, module);
231 
232         //delete ptx;
233     } else {
234         std::cout << "no ptx content!" << std::endl;
235         exit(1);
236     }
237 }
238 
239 //Entry point from HAT.  We use the config PTX bit to determine which Source type
240 
241 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
242     if (config->traceCalls) {
243         std::cout << "inside compileProgram" << std::endl;
244     }
245 
246     if (config->ptx){
247         if (config->trace) {
248             std::cout << "compiling from provided  ptx " << std::endl;
249         }
250         PtxSource ptxSource(len, source, false);
251         return compile(ptxSource);
252     }else{
253         if (config->trace) {
254             std::cout << "compiling from provided  cuda " << std::endl;
255         }
256         CudaSource cudaSource(len , source, false);
257         return compile(cudaSource);
258     }
259 }
260 
261 /*
262 
263     if (config->ptx) {
264 
265     } else {
266         if (config->trace) {
267             std::cout << "compiling from cuda c99 " << std::endl;
268         }
269         if (config->showCode) {
270             std::cout << "cuda " << source << std::endl;
271         }
272         auto* cuda = new CudaSource(len, source, false);
273         ptx = nvcc(cuda);
274     }
275     if (config->showCode) {
276         std::cout << "ptx " << ptx->text << std::endl;
277     }
278     CUmodule module;
279 
280 
281     if (ptx->text != nullptr) {
282         constexpr unsigned int jitNumOptions = 2;
283         const auto jitOptions = new CUjit_option[jitNumOptions];
284         const auto jitOptVals = new void *[jitNumOptions];
285 
286         // set up size of compilation log buffer
287         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
288         constexpr int jitLogBufferSize = 8192;
289         jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);
290 
291         // set up pointer to the compilation log buffer
292         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
293         auto jitLogBuffer = new char[jitLogBufferSize];
294         jitOptVals[1] = jitLogBuffer;
295         cuCtxSetCurrent(context);
296 
297         WHERE{
298             .f = __FILE__, .l = __LINE__,
299             .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
300             .t = "cuModuleLoadDataEx"
301         }.report();
302         if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){
303              std::cout << "PTX log:" << jitLogBuffer << std::endl;
304         }
305         return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
306     } else {
307         std::cout << "no ptx content!" << std::endl;
308         exit(1);
309     }
310 } */
311 
312 extern "C" long getBackend(int mode) {
313     long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
314     //  std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
315     return backendHandle;
316 }
317 
// Callback that ignores its argument and writes a fixed marker line to
// std::cerr.
void clCallback(void * /*unused*/) {
    const char *const marker = "start of compute";
    std::cerr << marker << std::endl;
}
321 
322 
323 void CudaBackend::computeEnd() {
324     queue->computeEnd();
325 }
326 
327 void CudaBackend::computeStart() {
328     queue->computeStart();
329 }
330 
331 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
332     if (config->traceCalls) {
333         std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
334                 std::dec << memorySegmentLength << "){" << std::endl;
335     }
336     if (config->minimizeCopies) {
337         const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
338         if (bufferState->state == BufferState::DEVICE_OWNED) {
339             queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
340             if (config->traceEnqueues | config->traceCopies) {
341                 std::cout << "copying buffer from device (from java access) " << std::endl;
342             }
343             queue->wait();
344             queue->release();
345         } else {
346             std::cout << "HOW DID WE GET HERE 1 attempting  to get buffer but buffer is not device dirty" << std::endl;
347             std::exit(1);
348         }
349     } else {
350         std::cerr <<
351                 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"
352                 << std::endl;
353         std::exit(1);
354     }
355     if (config->traceCalls) {
356         std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
357     }
358     return true;
359 }
360 
361 CudaBackend *CudaBackend::of(const long backendHandle) {
362     return reinterpret_cast<CudaBackend *>(backendHandle);
363 }
364 
365 CudaBackend *CudaBackend::of(Backend *backend) {
366     return dynamic_cast<CudaBackend *>(backend);
367 }
368 
369 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
370     CudaBuffer *cudaBuffer = nullptr;
371     if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
372         cudaBuffer = new CudaBuffer(this, bufferState);
373         if (config->trace) {
374             std::cout << "We allocated arg buffer " << std::endl;
375         }
376     } else {
377         if (config->trace) {
378             std::cout << "Were reusing  buffer  buffer " << std::endl;
379         }
380         cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
381     }
382     return cudaBuffer;
383 }