New hat/backends/ffi/cuda/cpp/cuda_backend

  1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 
 26 
 27 #include "cuda_backend.h"
 28 
 29 
 30 CudaBackend::CudaModule::CudaKernel::CudaKernel(Backend::CompilationUnit *program,char * name, CUfunction function)
 31         : Backend::CompilationUnit::Kernel(program, name), function(function) {
 32 }
 33 
 34 CudaBackend::CudaModule::CudaKernel::~CudaKernel() = default;
 35 /*
 36 long CudaBackend::CudaModule::CudaKernel::ndrange(void *argArray) {
 37 
 38     auto cudaBackend = CudaBackend::of(compilationUnit->backend);
 39     if (cudaBackend->cudaConfig.traceCalls) {
 40         std::cout << "ndrange(" <<  ") " << name << std::endl;
 41     }
 42     ArgSled argSled(static_cast<ArgArray_s *>(argArray));
 43     void *argslist[argSled.argc()];
 44 
 45     NDRange *ndrange = nullptr;
 46     for (int i = 0; i < argSled.argc(); i++) {
 47         Arg_s *arg = argSled.arg(i);
 48         switch (arg->variant) {
 49             case '&': {
 50                 if (arg->idx == 0){
 51                     ndrange = static_cast<NDRange *>(arg->value.buffer.memorySegment);
 52                 }
 53                 auto cudaBuffer = new CudaBackend::CudaBuffer(cudaBackend, arg, BufferState_s::of(arg));
 54                 cudaBuffer->copyToDevice();
 55                 argslist[arg->idx] = static_cast<void *>(&cudaBuffer->devicePtr);
 56                 break;
 57             }
 58             case 'I':
 59             case 'F':
 60             case 'J':
 61             case 'D':
 62             case 'C':
 63             case 'S': {
 64                 argslist[arg->idx] = static_cast<void *>(&arg->value);
 65                 break;
 66             }
 67             default: {
 68                 std::cerr << " unhandled variant " << (char) arg->variant << std::endl;
 69                 break;
 70             }
 71         }
 72     }
 73     int range = ndrange->maxX;
 74     int rangediv1024 = range / 1024;
 75     int rangemod1024 = range % 1024;
 76     if (rangemod1024 > 0) {
 77         rangediv1024++;
 78     }
 79    // std::cout << "Running the kernel..." << std::endl;
 80    // std::cout << "   Requested range   = " << range << std::endl;
 81    // std::cout << "   Range mod 1024    = " << rangemod1024 << std::endl;
 82    // std::cout << "   Actual range 1024 = " << (rangediv1024 * 1024) << std::endl;
 83   //  auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaBackend->cudaQueue.cuStream));
 84 
 85   //  cudaBackend->cudaQueue.wait();
 86     auto status= cuLaunchKernel(function,
 87                                    rangediv1024, 1, 1,
 88                                    1024, 1, 1,
 89                                    0, cudaBackend->cudaQueue.cuStream ,
 90                     argslist, nullptr);
 91 
 92     WHERE{.f=__FILE__, .l=__LINE__, .e=status, .t="cuLaunchKernel"}.report();
 93  //   cudaBackend->cudaQueue.wait();
 94 
 95 
 96     for (int i = 0; i < argSled.argc(); i++) {
 97         Arg_s *arg = argSled.arg(i);
 98         if (arg->variant == '&') {
 99             auto bufferState = BufferState_s::of(arg)->vendorPtr;
100             auto cudaBuffer = static_cast<CudaBuffer *>(bufferState);
101             cudaBuffer->copyFromDevice();
102         }
103     }
104     cudaBackend->cudaQueue.wait();
105 
106     for (int i = 0; i < argSled.argc(); i++) {
107         Arg_s *arg = argSled.arg(i);
108         if (arg->variant == '&') {
109             auto bufferState = BufferState_s::of(arg)->vendorPtr;
110             auto cudaBuffer = static_cast<CudaBuffer *>(bufferState);
111             delete cudaBuffer;
112 
113         }
114     }
115 
116     return (long) 0;
117 } */
118 
119 CudaBackend::CudaModule::CudaKernel * CudaBackend::CudaModule::CudaKernel::of(long kernelHandle){
120     return reinterpret_cast<CudaBackend::CudaModule::CudaKernel *>(kernelHandle);
121 }
122 CudaBackend::CudaModule::CudaKernel * CudaBackend::CudaModule::CudaKernel::of(Backend::CompilationUnit::Kernel *kernel){
123     return dynamic_cast<CudaBackend::CudaModule::CudaKernel *>(kernel);
124 }
125 
126 bool CudaBackend::CudaModule::CudaKernel::setArg(KernelArg *arg){
127     argslist[arg->idx] = static_cast<void *>(&arg->value);
128     return true;
129 }
130 bool CudaBackend::CudaModule::CudaKernel::setArg(KernelArg *arg, Buffer *buffer) {
131     argslist[arg->idx] = static_cast<void *>(&dynamic_cast<CudaBuffer *>(buffer)->devicePtr);
132     return true;
133 }