1 2 # Minimizing Buffer Transfers 3 4 ---- 5 6 * [Contents](hat-00.md) 7 * House Keeping 8 * [Project Layout](hat-01-01-project-layout.md) 9 * [Building Babylon](hat-01-02-building-babylon.md) 10 * [Building HAT](hat-01-03-building-hat.md) 11 * Programming Model 12 * [Programming Model](hat-03-programming-model.md) 13 * Interface Mapping 14 * [Interface Mapping Overview](hat-04-01-interface-mapping.md) 15 * [Cascade Interface Mapping](hat-04-02-cascade-interface-mapping.md) 16 * Implementation Detail 17 * [Walkthrough Of Accelerator.compute()](hat-accelerator-compute.md) 18 * [How we minimize buffer transfers](hat-minimizing-buffer-transfers.md) 19 20 ---- 21 22 ## Using buffer marking to minimize data transfers 23 24 ### The naive approach 25 The default execution model is that at each kernel 26 dispatch the backend just copies all arg buffers to 27 the device and after the dispatch it copies all arg 28 buffers back. 29 30 ### Using kernel arg buffer access patterns 31 If we knew how each kernel accesses its args (via static analysis of the code model or 32 by marking the args RO, RW or WO with annotations) we could avoid some copies by only 33 copying in if the kernel 'reads' the arg buffer and only copying out if the 34 kernel writes to the arg buffer. 35 36 Let's use the game of life as an example. 37 38 We assume that the UI only needs updating at some 'rate' (say 5 fps), but the kernels can generate 39 generations faster than 5 generations per second. 40 41 So not every generation needs to be copied back from the device. 42 43 We'll ignore the detail regarding the `life` kernel, and we will assume the kernel args 44 are appropriately annotated as RO, RW or WO. 
45 46 ```java 47 @CodeReflection 48 public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) { 49 if (kc.x < kc.maxX) { 50 Compute.lifePerIdx(kc.x, control, cellGrid); 51 } 52 } 53 54 @CodeReflection 55 static public void compute(final @RO ComputeContext cc, 56 Viewer viewer, @RO Control control, @RW CellGrid cellGrid) { 57 var timeOfLastUIUpdate = System.currentTimeMillis(); 58 var msPerFrame = 1000/5; // we want 5 fps 59 while (viewer.state.generation < viewer.state.maxGenerations) { 60 long now = System.currentTimeMillis(); 61 var msSinceLastUpdate = (now - timeOfLastUIUpdate); 62 var updateNeeded = (msSinceLastUpdate > msPerFrame); 63 64 cc.dispatchKernel(cellGrid.width() * cellGrid.height(), 65 kc -> Compute.life(kc, control, cellGrid) 66 ); 67 68 // Here we are swapping from<->to on the control buffer 69 int to = control.from(); 70 control.from(control.to()); 71 control.to(to); 72 73 if (updateNeeded) { 74 viewer.update(now, to, cellGrid); 75 timeOfLastUIUpdate = now; 76 } 77 } 78 } 79 ``` 80 81 First, let's assume there were no automatic transfers; assume we had to define them. 
We would have to explicitly control transfers, so we will insert code to do so. 82 83 What would our code look like? 84 85 86 ```java 87 @CodeReflection 88 public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) { 89 if (kc.x < kc.maxX) { 90 Compute.lifePerIdx(kc.x, control, cellGrid); 91 } 92 } 93 94 @CodeReflection 95 static public void compute(final @RO ComputeContext cc, 96 Viewer viewer, @RO Control control, @RW CellGrid cellGrid) { 97 var timeOfLastUIUpdate = System.currentTimeMillis(); 98 var msPerFrame = 1000/5; // we want 5 fps 99 var cellGridIsJavaDirty = true; 100 var controlIsJavaDirty = true; 101 var cellGridIsDeviceDirty = true; 102 var controlIsDeviceDirty = true; 103 while (true) { 104 long now = System.currentTimeMillis(); 105 var msSinceLastUpdate = (now - timeOfLastUIUpdate); 106 var updateNeeded = (msSinceLastUpdate > msPerFrame); 107 108 if (cellGridIsJavaDirty){ 109 cc.copyToDevice(cellGrid); 110 } 111 if (controlIsJavaDirty){ 112 cc.copyToDevice(control); 113 } 114 cc.dispatchKernel(cellGrid.width() * cellGrid.height(), 115 kc -> Compute.life(kc, control, cellGrid) 116 ); 117 controlIsDeviceDirty = false; // Compute.life marked control as @RO 118 cellGridIsDeviceDirty = true; // Compute.life marked cellGrid as @RW 119 120 // Here we are swapping from<->to on the control buffer 121 if (controlIsDeviceDirty){ 122 cc.copyFromDevice(control); 123 } 124 int to = control.from(); 125 control.from(control.to()); 126 control.to(to); 127 controlIsJavaDirty = true; 128 129 if (updateNeeded) { 130 if (cellGridIsDeviceDirty){ 131 cc.copyFromDevice(cellGrid); 132 } 133 viewer.update(now, to, cellGrid); 134 timeOfLastUIUpdate = now; 135 } 136 } 137 } 138 ``` 139 140 Alternatively, what if the buffers themselves could hold the deviceDirty and javaDirty flags? 
141 142 143 ```java 144 @CodeReflection 145 public static void life(@RO KernelContext kc, @RO Control control, @RW CellGrid cellGrid) { 146 if (kc.x < kc.maxX) { 147 Compute.lifePerIdx(kc.x, control, cellGrid); 148 } 149 } 150 151 @CodeReflection 152 static public void compute(final @RO ComputeContext cc, 153 Viewer viewer, @RO Control control, @RW CellGrid cellGrid) { 154 control.flags =JavaDirty; // not ideal but necessary 155 cellGrid.flags = JavaDirty; // not ideal but necessary 156 157 var timeOfLastUIUpdate = System.currentTimeMillis(); 158 var msPerFrame = 1000/5; // we want 5 fps 159 160 while (true) { 161 long now = System.currentTimeMillis(); 162 var msSinceLastUpdate = (now - timeOfLastUIUpdate); 163 var updateNeeded = (msSinceLastUpdate > msPerFrame); 164 165 if ((cellGrid.flags & JavaDirty) == JavaDirty){ 166 cc.copyToDevice(cellGrid); 167 } 168 if ((control.flags & JavaDirty) == JavaDirty){ 169 cc.copyToDevice(control); 170 } 171 cc.dispatchKernel(cellGrid.width() * cellGrid.height(), 172 kc -> Compute.life(kc, control, cellGrid) 173 ); 174 control.flags = JavaDirty; // Compute.life marked control as @RO 175 cellGrid.flags = DeviceDirty; // Compute.life marked cellGrid as @RW 176 177 // Here we are swapping from<->to on the control buffer 178 if ((control.flags & DeviceDirty)==DeviceDirty){ 179 cc.copyFromDevice(control); 180 } 181 int to = control.from(); 182 control.from(control.to()); 183 control.to(to); 184 control.flags = JavaDirty; 185 186 if (updateNeeded) { 187 if ((cellGrid.flags & DeviceDirty)==DeviceDirty){ 188 cc.copyFromDevice(cellGrid); 189 } 190 viewer.update(now, to, cellGrid); 191 // update does not mutate cellGrid so cellGrid.flags = DeviceDirty 192 timeOfLastUIUpdate = now; 193 } 194 } 195 } 196 ``` 197 198 Essentially, we defer to the kernel dispatch to determine whether buffers are 199 copied to the device and to mark buffers accordingly if the dispatch mutated the buffer. 200 201 Pseudo code for dispatch is 
essentially 202 ```java 203 204 void dispatchKernel(Kernel kernel, KernelContext kc, Arg ... args) { 205 for (int argn = 0; argn<args.length; argn++){ 206 Arg arg = args[argn]; 207 if (((arg.flags &JavaDirty)==JavaDirty) && kernel.readsFrom(arg)) { 208 enqueueCopyToDevice(arg); 209 } 210 } 211 enqueueKernel(kernel); 212 for (int argn = 0; argn<args.length; argn++){ 213 Arg arg = args[argn]; 214 if (kernel.writesTo(arg)) { 215 arg.flags = DeviceDirty; 216 } 217 } 218 } 219 ``` 220 We rely on babylon to mark each buffer passed to the compute method as JavaDirty. 221 222 ```java 223 224 @CodeReflection 225 static public void compute(final @RO ComputeContext cc, 226 Viewer viewer, @RO Control control, @RW CellGrid cellGrid) { 227 control.flags = JavaDirty; 228 cellGrid.flags = JavaDirty; 229 // yada yada 230 } 231 ``` 232 233 We also rely on babylon to inject calls before each buffer access from Java in the compute code. 234 235 So the injected code would look like this. 236 237 238 ```java 239 240 @CodeReflection 241 static public void compute(final @RO ComputeContext cc, 242 Viewer viewer, @RO Control control, @RW CellGrid cellGrid) { 243 control.flags =JavaDirty; // injected by babylon 244 cellGrid.flags = JavaDirty; // injected by babylon 245 246 var timeOfLastUIUpdate = System.currentTimeMillis(); 247 var msPerFrame = 1000/5; // we want 5 fps 248 while (true) { 249 long now = System.currentTimeMillis(); 250 var msSinceLastUpdate = (now - timeOfLastUIUpdate); 251 var updateNeeded = (msSinceLastUpdate > msPerFrame); 252 253 // See the pseudo code above to see how dispatchKernel 254 // only copies buffers that need copying, and marks 255 // buffers it has mutated as dirty 256 cc.dispatchKernel(cellGrid.width() * cellGrid.height(), 257 kc -> Compute.life(kc, control, cellGrid) 258 ); 259 260 // injected by babylon 261 if ((control.flags & DeviceDirty)==DeviceDirty){ 262 cc.copyFromDevice(control); 263 } 264 // Here we are swapping from<->to on the control buffer 265 int to 
= control.from(); 266 267 control.from(control.to()); 268 control.flags = JavaDirty; // injected 269 control.to(to); 270 control.flags = JavaDirty; // injected, but can be avoided 271 272 if (updateNeeded) { 273 // Injected by babylon because cellGrid escapes compute 274 // and because viewer.update marks cellGrid as @RO 275 if ((cellGrid.flags & DeviceDirty)==DeviceDirty){ 276 cc.copyFromDevice(cellGrid); 277 } 278 viewer.update(now, to, cellGrid); 279 // We don't copy cellGrid back after escape because 280 // viewer.update annotates cellGrid access as RO 281 timeOfLastUIUpdate = now; 282 } 283 } 284 } 285 ``` 286 287 288 289 290 291 292 293