1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include "cuda_backend.h"
 29 
 30 
 31 PtxSource::PtxSource()
 32     : Text(0L) {
 33 }
 34 
 35 PtxSource::PtxSource(size_t len)
 36     : Text(len) {
 37 }
 38 
 39 PtxSource::PtxSource(char *text)
 40     : Text(text, false) {
 41 }
 42 
 43 PtxSource::PtxSource(size_t len, char *text)
 44     : Text(len, text, true) {
 45 }
 46 PtxSource::PtxSource(size_t len, char *text, bool isCopy)
 47     : Text(len, text, isCopy) {
 48 }
 49 
 50 CudaSource::CudaSource(size_t len)
 51     : Text(len) {
 52 }
 53 
 54 CudaSource::CudaSource(char *text)
 55     : Text(text, false) {
 56 }
 57 
 58 CudaSource::CudaSource(size_t len, char *text, bool isCopy)
 59     : Text(len, text, isCopy) {
 60 }
 61 
 62 CudaSource::CudaSource()
 63     : Text(0) {
 64 }
 65 
 66 uint64_t timeSinceEpochMillisec() {
 67     using namespace std::chrono;
 68     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 69 }
 70 
 71 std::string tmpFileName(uint64_t time, const std::string &suffix) {
 72     std::stringstream timestamp;
 73     timestamp << "./tmp" << time << suffix;
 74     return timestamp.str();
 75 }
 76 
 77 
 78 
 79 CudaBackend::CudaBackend(int configBits)
 80     : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
 81     int deviceCount = 0;
 82 
 83     if (initStatus == CUDA_SUCCESS) {
 84         WHERE{
 85             .f = __FILE__, .l = __LINE__,
 86             .e = cuDeviceGetCount(&deviceCount),
 87             .t = "cuDeviceGetCount"
 88         }.report();
 89         std::cout << "CudaBackend device count = " << deviceCount << std::endl;
 90         WHERE{
 91             .f = __FILE__, .l = __LINE__,
 92             .e = cuDeviceGet(&device, 0),
 93             .t = "cuDeviceGet"
 94         }.report();
 95         WHERE{
 96             .f = __FILE__, .l = __LINE__,
 97             .e = cuCtxCreate(&context, 0, device),
 98             .t = "cuCtxCreate"
 99         }.report();
100         std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
101         dynamic_cast<CudaQueue *>(queue)->init();
102     } else {
103         WHERE{
104             .f = __FILE__, .l = __LINE__,
105             .e = initStatus,
106             "cuInit() failed we seem to have the runtime library but no device"
107         }.report();
108     }
109 }
110 
111 
112 CudaBackend::~CudaBackend() {
113     std::cout << "freeing context" << std::endl;
114     WHERE{
115         .f = __FILE__, .l = __LINE__,
116         .e = cuCtxDestroy(context),
117         .t = "cuCtxDestroy"
118     }.report();
119 }
120 
121 void CudaBackend::info() {
122     char name[100];
123     cuDeviceGetName(name, sizeof(name), device);
124     std::cout << "> Using device 0: " << name << std::endl;
125 
126     // get compute capabilities and the devicename
127     int major = 0, minor = 0;
128     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
129     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
130     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
131 
132     int warpSize;
133     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
134     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
135 
136     int threadsPerBlock;
137     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
138     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
139 
140     int cores;
141     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
142     std::cout << "> GPU Cores " << cores << std::endl;
143 
144     size_t totalGlobalMem;
145     cuDeviceTotalMem(&totalGlobalMem, device);
146     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
147     std::cout << "  64-bit Memory Address:           " <<
148             ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
149 }
150 
151 
152 
153 
154 PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
155   //std::cout << "inside nvcc" << std::endl;
156     const uint64_t time = timeSinceEpochMillisec();
157     const std::string ptxPath = tmpFileName(time, ".ptx");
158     const std::string cudaPath = tmpFileName(time, ".cu");
159     int pid;
160     cudaSource->write(cudaPath);
161     if ((pid = fork()) == 0) { //child
162         const auto path = "/usr/local/cuda/bin/nvcc";
163         const char *argv[]{  "/usr/local/cuda/bin/nvcc", "-ptx", "-Wno-deprecated-gpu-targets", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr};
164         const int stat = execvp(path, (char *const *) argv);
165         std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
166         std::exit(errno);
167     } else if (pid < 0) {// fork failed.
168         std::cerr << "fork of nvcc failed" << std::endl;
169         std::exit(1);
170     } else { //parent
171         int status;
172         pid_t result = wait(&status);
173         auto *ptx = new PtxSource();
174         ptx->read(ptxPath);
175         return ptx;
176     }
177 }
178 
179 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
180     return compile(&cudaSource);
181 }
182 
183 CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
184     const PtxSource *ptxSource = nvcc(cudaSource);
185     return compile(ptxSource);
186 }
187 
188 CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
189     return compile(&ptxSource);
190 }
191 
192 CudaBackend::CudaModule *CudaBackend::compile(const  PtxSource *ptx) {
193 
194     CUmodule module;
195      // std::cout << "inside compile" << std::endl;
196     // std::cout << "cuda " << cudaSource->text << std::endl;
197     if (ptx->text != nullptr) {
198        // std::cout << "ptx " << ptx->text << std::endl;
199         const Log *infLog = new Log(8192);
200         const Log *errLog = new Log(8192);
201         constexpr unsigned int optc = 5;
202         const auto jitOptions = new CUjit_option[optc];
203         auto jitOptVals = new void *[optc];
204 
205 
206         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
207         jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
208         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
209         jitOptVals[1] = infLog->text;
210         jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
211         jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
212         jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
213         jitOptVals[3] = errLog->text;
214         jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
215         jitOptVals[4] = reinterpret_cast<void *>(1);
216 
217         WHERE{
218             .f = __FILE__, .l = __LINE__,
219             .e = cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals),
220             .t = "cuModuleLoadDataEx"
221         }.report();
222         if (*infLog->text!='\0'){
223            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
224         }
225         if (*errLog->text!='\0'){
226            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
227         }
228         return new CudaModule(this, ptx->text, infLog->text, true, module);
229 
230         //delete ptx;
231     } else {
232         std::cout << "no ptx content!" << std::endl;
233         exit(1);
234     }
235 }
236 
//Entry point from HAT.  The config PTX bit determines whether the source is treated as PTX or as CUDA C.
238 
239 Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
240     if (config->traceCalls) {
241         std::cout << "inside compileProgram" << std::endl;
242     }
243 
244     if (config->ptx){
245         if (config->trace) {
246             std::cout << "compiling from provided  ptx " << std::endl;
247         }
248         PtxSource ptxSource(len, source, false);
249         return compile(ptxSource);
250     }else{
251         if (config->trace) {
252             std::cout << "compiling from provided  cuda " << std::endl;
253         }
254         CudaSource cudaSource(len , source, false);
255         return compile(cudaSource);
256     }
257 }
258 
259 /*
260 
261     if (config->ptx) {
262 
263     } else {
264         if (config->trace) {
265             std::cout << "compiling from cuda c99 " << std::endl;
266         }
267         if (config->showCode) {
268             std::cout << "cuda " << source << std::endl;
269         }
270         auto* cuda = new CudaSource(len, source, false);
271         ptx = nvcc(cuda);
272     }
273     if (config->showCode) {
274         std::cout << "ptx " << ptx->text << std::endl;
275     }
276     CUmodule module;
277 
278 
279     if (ptx->text != nullptr) {
280         constexpr unsigned int jitNumOptions = 2;
281         const auto jitOptions = new CUjit_option[jitNumOptions];
282         const auto jitOptVals = new void *[jitNumOptions];
283 
284         // set up size of compilation log buffer
285         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
286         constexpr int jitLogBufferSize = 8192;
287         jitOptVals[0] = reinterpret_cast<void *>(jitLogBufferSize);
288 
289         // set up pointer to the compilation log buffer
290         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
291         auto jitLogBuffer = new char[jitLogBufferSize];
292         jitOptVals[1] = jitLogBuffer;
293         cuCtxSetCurrent(context);
294 
295         WHERE{
296             .f = __FILE__, .l = __LINE__,
297             .e = cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, jitOptVals),
298             .t = "cuModuleLoadDataEx"
299         }.report();
300         if (jitLogBuffer != nullptr && *jitLogBuffer!='\0'){
301              std::cout << "PTX log:" << jitLogBuffer << std::endl;
302         }
303         return new CudaModule(this, ptx->text, jitLogBuffer, true, module);
304     } else {
305         std::cout << "no ptx content!" << std::endl;
306         exit(1);
307     }
308 } */
309 
310 extern "C" long getBackend(int mode) {
311     long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
312     //  std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
313     return backendHandle;
314 }
315 
// Callback that ignores its argument and writes "start of compute" to stderr.
void clCallback(void * /*unused*/) {
    std::cerr << "start of compute";
    std::cerr << std::endl;
}
319 
320 
321 void CudaBackend::computeEnd() {
322     queue->computeEnd();
323 }
324 
325 void CudaBackend::computeStart() {
326     queue->computeStart();
327 }
328 
329 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
330     if (config->traceCalls) {
331         std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
332                 std::dec << memorySegmentLength << "){" << std::endl;
333     }
334     if (config->minimizeCopies) {
335         const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
336         if (bufferState->state == BufferState::DEVICE_OWNED) {
337             queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
338             if (config->traceEnqueues | config->traceCopies) {
339                 std::cout << "copying buffer from device (from java access) " << std::endl;
340             }
341             queue->wait();
342             queue->release();
343         } else {
344             std::cout << "HOW DID WE GET HERE 1 attempting  to get buffer but buffer is not device dirty" << std::endl;
345             std::exit(1);
346         }
347     } else {
348         std::cerr <<
349                 "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"
350                 << std::endl;
351         std::exit(1);
352     }
353     if (config->traceCalls) {
354         std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
355     }
356     return true;
357 }
358 
359 CudaBackend *CudaBackend::of(const long backendHandle) {
360     return reinterpret_cast<CudaBackend *>(backendHandle);
361 }
362 
363 CudaBackend *CudaBackend::of(Backend *backend) {
364     return dynamic_cast<CudaBackend *>(backend);
365 }
366 
367 CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
368     CudaBuffer *cudaBuffer = nullptr;
369     if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
370         cudaBuffer = new CudaBuffer(this, bufferState);
371         if (config->trace) {
372             std::cout << "We allocated arg buffer " << std::endl;
373         }
374     } else {
375         if (config->trace) {
376             std::cout << "Were reusing  buffer  buffer " << std::endl;
377         }
378         cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
379     }
380     return cudaBuffer;
381 }