1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 #include <sys/wait.h>
 27 #include <chrono>
 28 #include "cuda_backend.h"
 29 
 30 
// Empty PTX container; text/len are populated later (e.g. via read()).
PtxSource::PtxSource()
        : Text(0L) {
}
// Pre-sized PTX buffer of `len` bytes, contents filled in later.
PtxSource::PtxSource(size_t len)
        : Text(len) {
}
// Wrap an existing NUL-terminated PTX string. The `false` flag presumably
// means "do not copy", so the caller's buffer must outlive this object —
// TODO(review): confirm against Text(char*, bool).
PtxSource::PtxSource(char *text)
        : Text(text, false) {
}
// Wrap `len` bytes of PTX. Note the `true` flag — the opposite of the
// char*-only ctor above — presumably asks Text to copy/own the buffer;
// TODO(review): confirm against Text(size_t, char*, bool).
PtxSource::PtxSource(size_t len, char *text)
        : Text(len, text , true) {
}
// Pre-sized CUDA C source buffer of `len` bytes.
CudaSource::CudaSource(size_t len)
        : Text(len) {
}
// Wrap an existing NUL-terminated CUDA C string without copying (the
// `false` flag presumably means "not a copy" — TODO(review): confirm).
CudaSource::CudaSource(char *text)
        : Text(text, false) {
}
// Wrap `len` bytes of CUDA C source; `isCopy` is forwarded to Text and
// controls whether the buffer is copied/owned.
CudaSource::CudaSource(size_t len, char *text, bool isCopy)
        :Text(len, text, isCopy){

}
// Empty CUDA source container; contents populated later.
CudaSource::CudaSource()
        : Text(0) {
}
 56 uint64_t timeSinceEpochMillisec() {
 57     using namespace std::chrono;
 58     return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
 59 }
 60 
 61 std::string tmpFileName(uint64_t time, const std::string& suffix){
 62     std::stringstream timestamp;
 63     timestamp << "./tmp" << time << suffix;
 64     return timestamp.str();
 65 }
 66 
 67 PtxSource *PtxSource::nvcc(const char *cudaSource, size_t len) {
 68     CudaSource cSource(len,(char*)cudaSource,false);
 69 
 70     uint64_t time = timeSinceEpochMillisec();
 71     std::string ptxPath = tmpFileName(time, ".ptx");
 72     std::string cudaPath = tmpFileName(time, ".cu");
 73     // we are going to fork exec nvcc
 74     int pid;
 75     cSource.write(cudaPath);
 76     if ((pid = fork()) == 0) {
 77         const char *path = "/usr/local/cuda-12.2/bin/nvcc";
 78         const char *argv[]{"nvcc", "-ptx", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr};
 79        // std::cerr << "child about to exec nvcc" << std::endl;
 80        // std::cerr << "path " << path<< " " << argv[1]<< " " << argv[2]<< " " << argv[3]<< " " << argv[4]<< std::endl;
 81         int stat = execvp(path, (char *const *) argv);
 82         std::cerr << " nvcc stat = "<<stat << " errno="<< errno<< " '"<< std::strerror(errno)<< "'"<<std::endl;
 83         std::exit(errno);
 84     } else if (pid < 0) {
 85         // fork failed.
 86         std::cerr << "fork of nvcc failed" << std::endl;
 87         std::exit(1);
 88     } else {
 89         int status;
 90        // std::cerr << "parent waiting for child nvcc exec" << std::endl;
 91         pid_t result = wait(&status);
 92         //std::cerr << "child finished should be safe to read "<< ptxPath << std::endl;
 93         PtxSource *ptx= new PtxSource();
 94         ptx->read(ptxPath);
 95         return ptx;
 96     }
 97     std::cerr << "we should never get here !";
 98     exit(1);
 99     return nullptr;
100 }
101 
102 
103 CudaBackend::CudaBackend(int configBits)
104         : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(),context()  {
105     int deviceCount = 0;
106 
107     if (initStatus == CUDA_SUCCESS) {
108         WHERE{.f=__FILE__, .l=__LINE__,
109                 .e=cuDeviceGetCount(&deviceCount),
110                 .t="cuDeviceGetCount"
111         }.report();
112         std::cout << "CudaBackend device count = "<< deviceCount << std::endl;
113         WHERE{.f=__FILE__, .l=__LINE__,
114                 .e=cuDeviceGet(&device, 0),
115                 .t="cuDeviceGet"
116         }.report();
117         WHERE{.f=__FILE__, .l=__LINE__,
118                 .e=cuCtxCreate(&context, 0, device),
119                 .t="cuCtxCreate"
120         }.report();
121         std::cout << "CudaBackend context created ok (id="<<context<<")" << std::endl;
122         dynamic_cast<CudaQueue *>(queue)->init();
123     } else {
124         WHERE{.f=__FILE__, .l=__LINE__,
125                 .e=initStatus,
126                 "cuInit() failed we seem to have the runtime library but no device"
127         }.report();
128     }
129 }
130 
131 //CudaBackend::CudaBackend() : CudaBackend(nullptr, 0, nullptr) {
132 //
133 //}
134 
// Tear down the driver context created in the constructor.
// NOTE(review): initStatus is not checked here, so cuCtxDestroy is called
// (and will report an error) even when cuInit() failed and no context was
// ever created — confirm whether that is intended.
CudaBackend::~CudaBackend() {
    std::cout << "freeing context" << std::endl;
    WHERE{.f=__FILE__, .l=__LINE__,
            .e=cuCtxDestroy(context),
            .t="cuCtxDestroy"
    }.report();
}
142 
143 void CudaBackend::info() {
144     char name[100];
145     cuDeviceGetName(name, sizeof(name), device);
146     std::cout << "> Using device 0: " << name << std::endl;
147 
148     // get compute capabilities and the devicename
149     int major = 0, minor = 0;
150     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
151     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
152     std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;
153 
154     int warpSize;
155     cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);
156     std::cout << "> GPU Device has warpSize " << warpSize << std::endl;
157 
158     int threadsPerBlock;
159     cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
160     std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;
161 
162     int cores;
163     cuDeviceGetAttribute(&cores, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);
164     std::cout << "> GPU Cores " << cores << std::endl;
165 
166     size_t totalGlobalMem;
167     cuDeviceTotalMem(&totalGlobalMem, device);
168     std::cout << "  Total amount of global memory:   " << (unsigned long long) totalGlobalMem << std::endl;
169     std::cout << "  64-bit Memory Address:           " <<
170               ((totalGlobalMem > (unsigned long long) 4 * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
171 
172 }
173 
174 PtxSource *CudaBackend::nvcc(CudaSource *cudaSource){
175     uint64_t time = timeSinceEpochMillisec();
176     std::string ptxPath = tmpFileName(time, ".ptx");
177     std::string cudaPath = tmpFileName(time, ".cu");
178     // we are going to fork exec nvcc so we need to write the cuda source to disk
179     int pid;
180     cudaSource->write(cudaPath);
181     if ((pid = fork()) == 0) {
182         const char *path = "/usr/local/cuda-12.2/bin/nvcc";
183         const char *argv[]{"nvcc", "-ptx", cudaPath.c_str(), "-o", ptxPath.c_str(), nullptr};
184         // std::cerr << "child about to exec nvcc" << std::endl;
185         // std::cerr << "path " << path<< " " << argv[1]<< " " << argv[2]<< " " << argv[3]<< " " << argv[4]<< std::endl;
186         int stat = execvp(path, (char *const *) argv);
187         std::cerr << " nvcc stat = "<<stat << " errno="<< errno<< " '"<< std::strerror(errno)<< "'"<<std::endl;
188         std::exit(errno);
189     } else if (pid < 0) {
190         // fork failed.
191         std::cerr << "fork of nvcc failed" << std::endl;
192         std::exit(1);
193     } else {
194         int status;
195         // std::cerr << "parent waiting for child nvcc exec" << std::endl;
196         pid_t result = wait(&status);
197         //std::cerr << "child finished should be safe to read "<< ptxPath << std::endl;
198         PtxSource *ptx= new PtxSource();
199         ptx->read(ptxPath);
200         return ptx;
201     }
202     std::cerr << "we should never get here !";
203     exit(1);
204     return nullptr;
205 
206 }
// Convenience overload: forwards a reference to the pointer-taking compile().
CudaBackend::CudaModule * CudaBackend::compile(CudaSource &cudaSource) {
    return compile(&cudaSource);
}
210 CudaBackend::CudaModule * CudaBackend::compile(CudaSource *cudaSource) {
211     PtxSource *ptx = nvcc(cudaSource);
212     CUmodule module;
213   //  std::cout << "inside compile" << std::endl;
214    // std::cout << "cuda " << cudaSource->text << std::endl;
215     if (ptx->text != nullptr) {
216         std::cout << "ptx " << ptx->text << std::endl;
217         Log *infLog = new Log(8192);
218         Log *errLog = new Log(8192);
219         const unsigned int optc = 5;
220         auto jitOptions = new CUjit_option[optc];
221         void **jitOptVals = new void *[optc];
222 
223 
224         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;jitOptVals[0] = (void *) (size_t) infLog->len;
225         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; jitOptVals[1] = infLog->text;
226         jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;jitOptVals[2] = (void *) (size_t) errLog->len;
227         jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; jitOptVals[3] = errLog->text;
228         jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;jitOptVals[4] = (void *)1;
229 
230         WHERE{.f=__FILE__, .l=__LINE__,
231                 .e=cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals),
232                 .t="cuModuleLoadDataEx"
233         }.report();
234         std::cout <<"> PTX JIT inflog:"<<std::endl  << infLog->text << std::endl;
235         std::cout <<"> PTX JIT errlog:"<<std::endl  << errLog->text << std::endl;
236         return new CudaModule(this,  ptx->text,infLog->text,true, module);
237 
238         //delete ptx;
239     } else {
240         std::cout << "no ptx content!" << std::endl;
241         exit(1);
242     }
243 }
244 
245 Backend::CompilationUnit * CudaBackend::compile(int len, char *source) {
246     if (config->traceCalls) {
247         std::cout << "inside compileProgram" << std::endl;
248     }
249     PtxSource *ptx = nullptr;
250     if (config->ptx){
251         if (config->trace) {
252             std::cout << "compiling from ptx " << std::endl;
253         }
254         ptx = new PtxSource(len,source);
255     }else {
256         ptx = PtxSource::nvcc(source, len);
257         if (config->traceCalls) {
258             std::cout << "compiling from cuda c99 "<<std::endl;
259         }
260         if (config->showCode){
261             std::cout << "cuda " << source << std::endl;
262         }
263 
264     }
265     if (config->showCode){
266         std::cout << "ptx " << ptx->text << std::endl;
267     }
268         CUmodule module;
269 
270 
271         if (ptx->text != nullptr) {
272 
273             // in this branch we use compilation with parameters
274             const unsigned int jitNumOptions = 2;
275             auto jitOptions = new CUjit_option[jitNumOptions];
276             void **jitOptVals = new void *[jitNumOptions];
277 
278             // set up size of compilation log buffer
279             jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
280             int jitLogBufferSize = 8192;
281             jitOptVals[0] = (void *) (size_t) jitLogBufferSize;
282 
283             // set up pointer to the compilation log buffer
284             jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
285             char *jitLogBuffer = new char[jitLogBufferSize];
286             jitOptVals[1] = jitLogBuffer;
287             cuCtxSetCurrent(context);
288 
289             WHERE{.f=__FILE__, .l=__LINE__,
290                     .e=cuModuleLoadDataEx(&module, ptx->text, jitNumOptions, jitOptions, (void **) jitOptVals),
291                     .t="cuModuleLoadDataEx"
292             }.report();
293             std::cout <<"PTX log:"<< jitLogBuffer << std::endl;
294             return dynamic_cast<Backend::CompilationUnit *>(new CudaModule(this, ptx->text, jitLogBuffer, true,
295                                                                            module));
296             //      return reinterpret_cast<long>(new CudaModule(this,  ptx->text,jitLogBuffer,true, module));
297 //
298             //delete
299         } else {
300             std::cout << "no ptx content!" << std::endl;
301             exit(1);
302         }
303 
304 }
305 extern "C" long getBackend(int mode) {
306     long backendHandle= reinterpret_cast<long>(new CudaBackend(mode));
307   //  std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
308     return backendHandle;
309 }
310 
// Callback that logs the start of a compute phase to stderr.
// NOTE(review): not referenced anywhere in this file — possibly dead code
// or registered from code outside this view; confirm before removing.
void clCallback(void *){
    std::cerr<<"start of compute"<<std::endl;
}
314 
315 
316 
// Mark the end of a compute phase by forwarding to the queue.
void CudaBackend::computeEnd(){
    queue->computeEnd();

}
// Mark the start of a compute phase by forwarding to the queue.
void CudaBackend::computeStart(){
    queue->computeStart();
}
324 bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength){
325     if (config->traceCalls){
326         std::cout << "getBufferFromDeviceIfDirty(" <<std::hex << (long)memorySegment << "," << std::dec<< memorySegmentLength <<"){"<<std::endl;
327     }
328     if (config->minimizeCopies){
329         BufferState * bufferState = BufferState::of(memorySegment,memorySegmentLength);
330         if (bufferState->state == BufferState::DEVICE_OWNED){
331             queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
332             if (config->traceEnqueues | config->traceCopies){
333                 std::cout << "copying buffer from device (from java access) "<< std::endl;
334             }
335             queue->wait();
336             queue->release();
337         }else{
338             std::cout << "HOW DID WE GET HERE 1 attempting  to get buffer but buffer is not device dirty"<<std::endl;
339             std::exit(1);
340         }
341     }else{
342         std::cerr << "HOW DID WE GET HERE ? java side should avoid calling getBufferFromDeviceIfDirty as we are not minimising buffers!"<<std::endl;
343         std::exit(1);
344     }
345     if (config->traceCalls){
346         std::cout << "}getBufferFromDeviceIfDirty()"<<std::endl;
347     }
348     return true;
349 
350 }
351 
// Recover the CudaBackend* from the opaque handle returned by getBackend().
CudaBackend * CudaBackend::of(long backendHandle){
    return reinterpret_cast<CudaBackend *>(backendHandle);
}
// Checked downcast from the generic Backend; nullptr if it is not a CudaBackend.
CudaBackend * CudaBackend::of(Backend *backend){
    return dynamic_cast<CudaBackend *>(backend);
}
358 
359 CudaBackend::CudaBuffer * CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
360     CudaBuffer *cudaBuffer = nullptr;
361     if (bufferState->vendorPtr == 0L || bufferState->state == BufferState::NEW_STATE){
362         cudaBuffer = new CudaBuffer(this,  bufferState);
363         if (config->trace){
364             std::cout << "We allocated arg buffer "<<std::endl;
365         }
366     }else{
367         if (config->trace){
368             std::cout << "Were reusing  buffer  buffer "<<std::endl;
369         }
370         cudaBuffer=  static_cast<CudaBuffer*>(bufferState->vendorPtr);
371     }
372     return cudaBuffer;
373 }