/*
 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#include <sys/wait.h>
#include <unistd.h>
#include <chrono>
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <sstream>
#include <string>
#include <vector>
#include "cuda_backend.h"
#include <iostream>

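// PtxSource and CudaSource are thin wrappers around Text buffers: PtxSource holds PTX
// assembly, CudaSource holds CUDA C source (optionally flagged to request line info).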
PtxSource::PtxSource()
    : Text(0L) {
}

PtxSource::PtxSource(size_t len)
    : Text(len) {
}

PtxSource::PtxSource(char *text)
    : Text(text, false) {
}

PtxSource::PtxSource(size_t len, char *text)
    : Text(len, text, true) {
}

PtxSource::PtxSource(size_t len, char *text, bool isCopy)
    : Text(len, text, isCopy) {
}

CudaSource::CudaSource(size_t len)
    : Text(len) {
}

CudaSource::CudaSource(char *text)
    : Text(text, false) {
}

CudaSource::CudaSource(size_t len, char *text, bool isCopy, bool lineinfo)
    : Text(len, text, isCopy) {
    _lineInfo = lineinfo;
}

CudaSource::CudaSource()
    : Text(0) {
}

bool CudaSource::lineInfo() const {
    return _lineInfo;
}

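// Helpers for the nvcc round trip: temp file names are stamped with the current
// wall-clock time in milliseconds (e.g. ./tmp1712345678901.cu and ./tmp1712345678901.ptx)
// so separate compilations are unlikely to collide.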
uint64_t timeSinceEpochMillisec() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

std::string tmpFileName(uint64_t time, const std::string &suffix) {
    std::stringstream timestamp;
    timestamp << "./tmp" << time << suffix;
    return timestamp.str();
}

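// Backend construction: initialise the CUDA driver API (cuInit), select device 0, create a
// context (cuCtxCreate_v4 on CUDA 12.8+, the older three-argument cuCtxCreate otherwise),
// then initialise the command queue. A failed cuInit is reported through CUDA_CHECK.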
CudaBackend::CudaBackend(int configBits)
    : Backend(new Config(configBits), new CudaQueue(this)), initStatus(cuInit(0)), device(), context() {
    int deviceCount = 0;

    if (initStatus == CUDA_SUCCESS) {
        CUDA_CHECK(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");
        if (config->info) {
            std::cout << "CudaBackend device count = " << deviceCount << std::endl;
        }
        CUDA_CHECK(cuDeviceGet(&device, 0), "cuDeviceGet");
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12080
        CUctxCreateParams ctxCreateParams = {};
        CUDA_CHECK(cuCtxCreate_v4(&context, &ctxCreateParams, 0, device), "cuCtxCreate");
#else
        // Older drivers only provide the three-parameter cuCtxCreate
        CUDA_CHECK(cuCtxCreate(&context, 0, device), "cuCtxCreate");
#endif
        if (config->info) {
            std::cout << "CudaBackend context created ok (id=" << context << ")" << std::endl;
        }
        dynamic_cast<CudaQueue *>(queue)->init();
    } else {
        CUDA_CHECK(initStatus, "cuInit() failed: the runtime library is present but no device is available");
    }
}

CudaBackend::~CudaBackend() {
    if (config->info) {
        std::cout << "freeing context" << std::endl;
    }
    CUDA_CHECK(cuCtxDestroy(context), "cuCtxDestroy");
}

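// Dump the salient properties of device 0: name, compute capability, warp size,
// max threads per block, multiprocessor count and total global memory.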
void CudaBackend::info() {
    char name[100];
    CUDA_CHECK(cuDeviceGetName(name, sizeof(name), device), "cuDeviceGetName");

    std::cout << "> Using device 0: " << name << std::endl;

    // query the compute capability
    int major = 0, minor = 0;
    CUDA_CHECK(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device), "cuDeviceGetAttribute");
    CUDA_CHECK(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has major=" << major << " minor=" << minor << " compute capability" << std::endl;

    int warpSize;
    CUDA_CHECK(cuDeviceGetAttribute(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has warpSize " << warpSize << std::endl;

    int threadsPerBlock;
    CUDA_CHECK(cuDeviceGetAttribute(&threadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has threadsPerBlock " << threadsPerBlock << std::endl;

    int multiProcessorCount;
    CUDA_CHECK(cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device), "cuDeviceGetAttribute");
    std::cout << "> GPU Device has multiProcessorCount " << multiProcessorCount << std::endl;

    size_t totalGlobalMem;
    CUDA_CHECK(cuDeviceTotalMem(&totalGlobalMem, device), "cuDeviceTotalMem");
    std::cout << "  Total amount of global memory: " << (unsigned long long) totalGlobalMem << " bytes" << std::endl;
    std::cout << "  64-bit Memory Address:         " <<
            ((totalGlobalMem > static_cast<unsigned long long>(4) * 1024 * 1024 * 1024L) ? "YES" : "NO") << std::endl;
}

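// Compile CUDA C to PTX by shelling out to nvcc: write the source to a temp .cu file,
// fork, and exec /usr/local/cuda/bin/nvcc -ptx in the child; the parent waits for the
// child to finish and then reads the generated .ptx file back in.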
PtxSource *CudaBackend::nvcc(const CudaSource *cudaSource) {
    const uint64_t time = timeSinceEpochMillisec();
    const std::string ptxPath = tmpFileName(time, ".ptx");
    const std::string cudaPath = tmpFileName(time, ".cu");
    pid_t pid;
    cudaSource->write(cudaPath);
    if ((pid = fork()) == 0) { // child
        const auto path = "/usr/local/cuda/bin/nvcc";
        std::vector<std::string> command;
        command.push_back(path);
        command.push_back("-ptx");
        command.push_back("-Wno-deprecated-gpu-targets");
        command.push_back(cudaPath);
        if (cudaSource->lineInfo()) {
            command.push_back("-lineinfo");
        }
        command.push_back("-o");
        command.push_back(ptxPath);

        // convert to the null-terminated char*[] expected by execvp
        std::vector<const char *> args;
        args.reserve(command.size() + 1);
        for (const auto &arg : command) {
            args.push_back(arg.c_str());
        }
        args.push_back(nullptr);
        // execvp only returns if the exec itself failed
        const int stat = execvp(path, const_cast<char *const *>(args.data()));
        std::cerr << " nvcc stat = " << stat << " errno=" << errno << " '" << std::strerror(errno) << "'" << std::endl;
        std::exit(errno);
    } else if (pid < 0) { // fork failed
        std::cerr << "fork of nvcc failed" << std::endl;
        std::exit(1);
    } else { // parent
        int status;
        wait(&status);
        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
            std::cerr << "nvcc did not exit cleanly, status=" << status << std::endl;
        }
        auto *ptx = new PtxSource();
        ptx->read(ptxPath);
        return ptx;
    }
}

CudaBackend::CudaModule *CudaBackend::compile(const CudaSource &cudaSource) {
    return compile(&cudaSource);
}

CudaBackend::CudaModule *CudaBackend::compile(const CudaSource *cudaSource) {
    const PtxSource *ptxSource = nvcc(cudaSource);
    return compile(ptxSource);
}

CudaBackend::CudaModule *CudaBackend::compile(const PtxSource &ptxSource) {
    return compile(&ptxSource);
}

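// JIT-load a PTX module via cuModuleLoadDataEx. We hand the driver separate info and error
// log buffers, ask it to generate line info, and echo any log output it produced.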
CudaBackend::CudaModule *CudaBackend::compile(const PtxSource *ptx) {
    CUmodule module;
    if (ptx->text != nullptr) {
        const Log *infLog = new Log(8192);
        const Log *errLog = new Log(8192);
        constexpr unsigned int optc = 5;
        const auto jitOptions = new CUjit_option[optc];
        auto jitOptVals = new void *[optc];

        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[0] = reinterpret_cast<void *>(infLog->len);
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        jitOptVals[1] = infLog->text;
        jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
        jitOptVals[2] = reinterpret_cast<void *>(errLog->len);
        jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER;
        jitOptVals[3] = errLog->text;
        jitOptions[4] = CU_JIT_GENERATE_LINE_INFO;
        jitOptVals[4] = reinterpret_cast<void *>(1);

        CUDA_CHECK(cuCtxSetCurrent(context), "cuCtxSetCurrent");
        CUDA_CHECK(cuModuleLoadDataEx(&module, ptx->text, optc, jitOptions, (void **) jitOptVals), "cuModuleLoadDataEx");

        if (*infLog->text != '\0') {
            std::cout << "> PTX JIT inflog:" << std::endl << infLog->text << std::endl;
        }
        if (*errLog->text != '\0') {
            std::cout << "> PTX JIT errlog:" << std::endl << errLog->text << std::endl;
        }
        return new CudaModule(this, ptx->text, infLog->text, true, module);
    } else {
        std::cerr << "no ptx content!" << std::endl;
        std::exit(1);
    }
}

// Entry point from HAT. The config PTX bit tells us whether the incoming source is already
// PTX or is CUDA C that we first need to run through nvcc.

Backend::CompilationUnit *CudaBackend::compile(const int len, char *source) {
    if (config->traceCalls) {
        std::cout << "inside compile" << std::endl;
    }

    if (config->ptx) {
        if (config->trace) {
            std::cout << "compiling from provided ptx " << std::endl;
        }
        PtxSource ptxSource(len, source, false);
        return compile(ptxSource);
    } else {
        if (config->trace) {
            std::cout << "compiling from provided cuda " << std::endl;
        }
        CudaSource cudaSource(len, source, false, config->profileCudaKernel);
        return compile(cudaSource);
    }
}

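// Entry point called from the Java side: construct a backend for the given config bits and
// return it as an opaque handle (the pointer squeezed into a long).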
extern "C" long getBackend(int mode) {
    long backendHandle = reinterpret_cast<long>(new CudaBackend(mode));
    // std::cout << "getBackend() -> backendHandle=" << std::hex << backendHandle << std::dec << std::endl;
    return backendHandle;
}

void clCallback(void *) {
    std::cerr << "start of compute" << std::endl;
}

void CudaBackend::computeEnd() {
    queue->computeEnd();
}

void CudaBackend::computeStart() {
    queue->computeStart();
}

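// Called from the Java side when it is about to read a buffer: if we are minimizing copies
// and the device currently owns the buffer, copy it back to the host first.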
bool CudaBackend::getBufferFromDeviceIfDirty(void *memorySegment, long memorySegmentLength) {
    if (config->traceCalls) {
        std::cout << "getBufferFromDeviceIfDirty(" << std::hex << reinterpret_cast<long>(memorySegment) << "," <<
                std::dec << memorySegmentLength << "){" << std::endl;
    }
    if (config->minimizeCopies) {
        const BufferState *bufferState = BufferState::of(memorySegment, memorySegmentLength);
        if (bufferState->state == BufferState::DEVICE_OWNED) {
            queue->copyFromDevice(static_cast<Backend::Buffer *>(bufferState->vendorPtr));
            if (config->traceEnqueues || config->traceCopies) {
                std::cout << "copying buffer from device (from java access) " << std::endl;
            }
            queue->wait();
            queue->release();
        } else {
            std::cerr << "HOW DID WE GET HERE? Attempting to get buffer, but the buffer is not device owned" << std::endl;
            std::exit(1);
        }
    } else {
        std::cerr <<
                "HOW DID WE GET HERE? The java side should avoid calling getBufferFromDeviceIfDirty when we are not minimizing copies!"
                << std::endl;
        std::exit(1);
    }
    if (config->traceCalls) {
        std::cout << "}getBufferFromDeviceIfDirty()" << std::endl;
    }
    return true;
}

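// Recover the CudaBackend from the opaque handle handed back by getBackend, or downcast a
// generic Backend pointer.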
CudaBackend *CudaBackend::of(const long backendHandle) {
    return reinterpret_cast<CudaBackend *>(backendHandle);
}

CudaBackend *CudaBackend::of(Backend *backend) {
    return dynamic_cast<CudaBackend *>(backend);
}

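// Return the CudaBuffer backing this BufferState, allocating a new one the first time a
// buffer is seen (vendorPtr not yet set, or state is NEW_STATE) and reusing the cached one
// otherwise.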
CudaBackend::CudaBuffer *CudaBackend::getOrCreateBuffer(BufferState *bufferState) {
    CudaBuffer *cudaBuffer = nullptr;
    if (bufferState->vendorPtr == nullptr || bufferState->state == BufferState::NEW_STATE) {
        cudaBuffer = new CudaBuffer(this, bufferState);
        if (config->trace) {
            std::cout << "We allocated arg buffer " << std::endl;
        }
    } else {
        if (config->trace) {
            std::cout << "We're reusing the existing arg buffer " << std::endl;
        }
        cudaBuffer = static_cast<CudaBuffer *>(bufferState->vendorPtr);
    }
    return cudaBuffer;
}