1 /* 2 * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 #include <sys/wait.h> 27 #include <chrono> 28 #include <thread> 29 #include "cuda_backend.h" 30 31 CudaBackend::CudaQueue::CudaQueue(Backend *backend) 32 : Backend::Queue(backend),cuStream(),streamCreationThread() { 33 } 34 void CudaBackend::CudaQueue::init(){ 35 streamCreationThread = std::this_thread::get_id(); 36 if (backend->config->traceCalls){ 37 std::cout << "init() 0x" 38 << " thread=" <<streamCreationThread 39 << std::endl; 40 } 41 42 WHERE{.f=__FILE__ , .l=__LINE__, 43 .e=cuStreamCreate(&cuStream,CU_STREAM_DEFAULT), 44 .t= "cuStreamCreate" 45 }.report(); 46 47 if (backend->config->traceCalls){ 48 std::cout << "exiting init() 0x" 49 << " custream=" <<std::hex<<streamCreationThread <<std::dec 50 << std::endl; 51 } 52 } 53 54 //void CudaBackend::CudaQueue::sync(const char *file, int line) const { 55 56 //} 57 58 59 void CudaBackend::CudaQueue::wait(){ 60 WHERE{.f=__FILE__, .l=__LINE__, 61 .e=cuStreamSynchronize(cuStream), 62 .t= "cuStreamSynchronize" 63 }.report(); 64 } 65 66 67 void CudaBackend::CudaQueue::computeStart(){ 68 wait(); // should be no-op 69 release(); // also ; 70 } 71 72 73 74 void CudaBackend::CudaQueue::computeEnd(){ 75 76 } 77 78 79 80 81 void CudaBackend::CudaQueue::release(){ 82 83 } 84 85 CudaBackend::CudaQueue::~CudaQueue(){ 86 // delete []events; 87 WHERE{.f=__FILE__, .l=__LINE__, 88 .e=cuStreamDestroy(cuStream), 89 .t= "cuStreamDestroy" 90 }.report(); 91 } 92 93 void CudaBackend::CudaQueue::copyToDevice(Buffer *buffer) { 94 //auto cudaBackend = dynamic_cast<CudaBackend*>(backend); 95 auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer); 96 std::thread::id thread_id = std::this_thread::get_id(); 97 if (thread_id != streamCreationThread){ 98 std::cout << "copyToDevice() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl; 99 } 100 if (backend->config->traceCalls) { 101 102 std::cout << "copyToDevice() 0x" 103 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/" 104 << cudaBuffer->bufferState->length << " " 105 << "devptr=" << std::hex<< (long)cudaBuffer->devicePtr <<std::dec 106 << " thread=" <<thread_id 107 << std::endl; 108 } 109 WHERE{.f=__FILE__, .l=__LINE__, 110 .e=cuMemcpyHtoDAsync( 111 cudaBuffer->devicePtr, 112 cudaBuffer->bufferState->ptr, 113 cudaBuffer->bufferState->length, 114 dynamic_cast<CudaQueue*>(backend->queue)->cuStream), 115 .t="cuMemcpyHtoDAsync" 116 }.report(); 117 118 } 119 120 void CudaBackend::CudaQueue::copyFromDevice(Buffer *buffer) { 121 auto *cudaBuffer = dynamic_cast<CudaBuffer *>(buffer); 122 //auto cudaBackend = dynamic_cast<CudaBackend*>(backend); 123 std::thread::id thread_id = std::this_thread::get_id(); 124 if (thread_id != streamCreationThread){ 125 std::cout << "copyFromDevice() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl; 126 } 127 if (backend->config->traceCalls) { 128 129 std::cout << "copyFromDevice() 0x" 130 << std::hex<<cudaBuffer->bufferState->length<<std::dec << "/" 131 << cudaBuffer->bufferState->length << " " 132 << "devptr=" << std::hex<< (long)cudaBuffer->devicePtr <<std::dec 133 << " thread=" <<thread_id 134 << std::endl; 135 } 136 137 138 WHERE{.f=__FILE__, .l=__LINE__, 139 .e=cuMemcpyDtoHAsync( 140 cudaBuffer->bufferState->ptr, 141 cudaBuffer->devicePtr, 142 cudaBuffer->bufferState->length, 143 dynamic_cast<CudaQueue*>(backend->queue)->cuStream), 144 .t="cuMemcpyDtoHAsync" 145 }.report(); 146 147 } 148 149 void CudaBackend::CudaQueue::dispatch(KernelContext *kernelContext, CompilationUnit::Kernel *kernel) { 150 auto cudaKernel = dynamic_cast<CudaModule::CudaKernel *>(kernel); 151 152 int range = kernelContext->maxX; 153 int rangediv1024 = range / 1024; 154 int rangemod1024 = range % 1024; 155 if (rangemod1024 > 0) { 156 rangediv1024++; 157 } 158 // std::cout << "Running the kernel..." << std::endl; 159 // std::cout << " Requested range = " << range << std::endl; 160 // std::cout << " Range mod 1024 = " << rangemod1024 << std::endl; 161 // std::cout << " Actual range 1024 = " << (rangediv1024 * 1024) << std::endl; 162 // auto status= static_cast<CUresult>(cudaStreamSynchronize(cudaBackend->cudaQueue.cuStream)); 163 164 // cudaBackend->cudaQueue.wait(); 165 std::thread::id thread_id = std::this_thread::get_id(); 166 if (thread_id != streamCreationThread){ 167 std::cout << "dispatch() thread=" <<thread_id<< " != "<< streamCreationThread<< std::endl; 168 } 169 170 auto status = cuLaunchKernel(cudaKernel->function, 171 rangediv1024, 1, 1, 172 1024, 1, 1, 173 0, cuStream, 174 cudaKernel->argslist, nullptr); 175 176 WHERE{.f=__FILE__, .l=__LINE__, .e=status, .t="cuLaunchKernel"}.report(); 177 }