1 /*
  2  * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.  Oracle designates this
  8  * particular file as subject to the "Classpath" exception as provided
  9  * by Oracle in the LICENSE file that accompanied this code.
 10  *
 11  * This code is distributed in the hope that it will be useful, but WITHOUT
 12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 14  * version 2 for more details (a copy is included in the LICENSE file that
 15  * accompanied this code).
 16  *
 17  * You should have received a copy of the GNU General Public License version
 18  * 2 along with this work; if not, write to the Free Software Foundation,
 19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 20  *
 21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 22  * or visit www.oracle.com if you need additional information or have any
 23  * questions.
 24  */
 25 package violajones.attic;
 26 
 27 
 28 import hat.Accelerator;
 29 import hat.NDRange;
 30 import hat.backend.java.WorkStealer;
 31 import hat.buffer.F32Array2D;
 32 import org.xml.sax.SAXException;
 33 import violajones.Viewer;
 34 import violajones.XMLHaarCascadeModel;
 35 import hat.buffer.S08x3RGBImage;
 36 import violajones.ifaces.Cascade;
 37 import violajones.ifaces.ResultTable;
 38 import violajones.ifaces.ScaleTable;
 39 
 40 import javax.imageio.ImageIO;
 41 import javax.xml.parsers.ParserConfigurationException;
 42 import java.awt.image.BufferedImage;
 43 import java.io.IOException;
 44 import java.lang.invoke.MethodHandles;
 45 import java.util.Objects;
 46 
 47 public class ViolaJones {
 48 
 49     public static void main(String[] _args) throws IOException, ParserConfigurationException, SAXException {
 50         //  Accelerator accelerator = new Accelerator(MethodHandles.lookup(), Backend::isJava);
 51         Accelerator accelerator = new Accelerator(MethodHandles.lookup());
 52 
 53 
 54         BufferedImage nasa = ImageIO.read(Objects.requireNonNull(ViolaJones.class.getResourceAsStream("/images/Nasa1996.jpg")));
 55         XMLHaarCascadeModel xmlCascade = XMLHaarCascadeModel.load(ViolaJonesRaw.class.getResourceAsStream("/cascades/haarcascade_frontalface_default.xml"));
 56    //     Cascade cascade = Cascade.create(accelerator, xmlHaarCascade);
 57         var cascade = Cascade.createFrom(accelerator,xmlCascade);
 58         var width = nasa.getWidth();
 59         var height = nasa.getHeight();
 60         S08x3RGBImage rgbImage = S08x3RGBImage.create(accelerator,width,height);
 61 
 62 
 63         // harViz.showIntegrals();
 64 
 65         var scaleTable = ScaleTable.createFrom(accelerator,new ScaleTable.Constraints(cascade,width,height));
 66 
 67 
 68         var greyImageF32 = F32Array2D.create(accelerator, width, height);
 69         var integralImageF32 = F32Array2D.create(accelerator, width, height);
 70         var integralSqImageF32 = F32Array2D.create(accelerator, width, height);
 71         var resultTable = ResultTable.create(accelerator, 1000);
 72         CoreJavaViolaJones.rgbToGreyScale(rgbImage, greyImageF32);
 73         CoreJavaViolaJones.createIntegralImage(greyImageF32, integralImageF32, integralSqImageF32);
 74 
 75         Viewer harViz = new Viewer(accelerator, nasa, rgbImage, cascade, integralImageF32, integralSqImageF32);
 76 
 77         harViz.showIntegrals();
 78 
 79 
 80         //   long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
 81         //   long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
 82         //   long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
 83         //   long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
 84 
 85 
 86         // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
 87         // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
 88 
 89 
 90         //   openCLBridge.dump(treeTable, treeTableLayout.layout);
 91         // openCLBridge.dump(stageTable, stageTableLayout.layout);
 92         //   FloatBuffer integralImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
 93         //   FloatBuffer integralSqImage = FloatBuffer.create(accelerator, rgbImageLayout.getElementCount());
 94 
 95         // MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
 96         // MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
 97 
 98         // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
 99 
100 
101         int groupSize = 256;
102         int rangeModGroupSize = scaleTable.rangeModGroupSize(groupSize);
103         //(scaleTable.multiScaleAccumulativeRange() / groupSize) + ((scaleTable.multiScaleAccumulativeRange() % groupSize) == 0 ? 0 : 1)) * groupSize;
104 
105 /*
106         OpenCLCodeBuilder c99 = (OpenCLCodeBuilder) accelerator.getCodeBuilder();
107         c99
108                 .typedef(FeatureTable.Feature.RectTable.Rect.class)
109                 .typedef(FeatureTable.Feature.LinkOrValue.class)
110                 .typedef(FeatureTable.Feature.class)
111                 .typedef(ScaleTable.Scale.class)
112                 .typedef(StageTable.Stage.class)
113                 .typedef(TreeTable.Tree.class)
114                 .typedef(ResultTable.Result.class)
115                 .typedef(Cascade.class)
116                 .append("""
117 
118                 #define SCOPE_START  ndrange_t ndrange;ndrange.id.x=get_global_id(0);ndrange.id.maxX=get_global_size(0);
119                 #ifdef NDRANGE_CUDA
120                 #define atomicInc(p) atomicAdd(p, 1)
121                 #else
122                 #define atomicInc(p) atom_add(p, 1)
123                 #endif
124 
125                 inline int b2i(i4 v){
126                    return v < 0 ? 256 + v : v;
127                 }
128                 inline int rgbToGrey(i4 r, i4 g, i4 b){
129                    return (29 * b2i(r) + 60 * b2i(g) + 11 * b2i(b)) / 100;
130                 }
131                 inline void integralColById(i4 id, __global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
132                    integralSq[id] = integral[id] = 0.0f;
133                    for (s32_t y = 1; y < cascadeContext->imageHeight; y++) {
134                        s32_t monoOffset = (y * cascadeContext->imageWidth) + id;
135                        f32_t lastSq = integralSq[monoOffset - cascadeContext->imageWidth];
136                        f32_t last = integral[monoOffset - cascadeContext->imageWidth];
137                        char r = rgb[monoOffset * 3 + 0];
138                        char g = rgb[monoOffset * 3 + 1];
139                        char b = rgb[monoOffset * 3 + 2];
140                        f32_t greyValue = rgbToGrey(r, g, b);
141                        f32_t greyValueSq = greyValue * greyValue;
142                        integralSq[monoOffset] = greyValueSq + lastSq;
143                        integral[monoOffset] = greyValue + last;
144                    }
145                 }
146                 __kernel void integralColKernel(__global cascade_t *cascadeContext, __global b1 *rgb, __global f4 *integral, __global f4 *integralSq){
147                      SCOPE_START
148                      integralColById(ndrange.id.x, cascadeContext, rgb,  integral, integralSq);
149                 }
150                 inline void integralRowById(i4 id, __global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
151                      for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
152                         s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
153                         integral[monoOffset] = integral[monoOffset] + integral[monoOffset - 1];
154                      }
155                      for (s32_t x = 1; x < cascadeContext->imageWidth; x++) {
156                         s32_t monoOffset = (id * cascadeContext->imageWidth) + x;
157                         integralSq[monoOffset] = integralSq[monoOffset] + integralSq[monoOffset - 1];
158                      }
159                 }
160                 __kernel void integralRowKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq){
161                    SCOPE_START
162                    integralRowById(ndrange.id.x,  cascadeContext, integral, integralSq);
163                 }
164                 __kernel void floatToShortKernel(__global cascade_t *cascadeContext, __global f4 *fromIntegral, __global s2 *toIntegral, __global f4 *fromIntegralSq, __global s2 *toIntegralSq){
165                    SCOPE_START
166                    toIntegral[ndrange.id.x] = (s16_t)(fromIntegral[ndrange.id.x]*(65536/fromIntegral[ndrange.id.maxX-1]));
167                    toIntegralSq[ndrange.id.x] = (s16_t)(fromIntegralSq[ndrange.id.x]*(65536/fromIntegralSq[ndrange.id.maxX-1]));
168                 }
169 
170 
171                 /
172                       A +-------+ B
173                         |       |       D-B-C+A
174                       C +-------+ D
175                 /
176                 inline float gradient(__global f4 *image, i4 imageWidth, i4 x, i4 y, i4 width, i4 height){
177                    f32_t A = image[(y * imageWidth) + x];
178                    f32_t D = image[((y + height) * imageWidth) + x + width];
179                    f32_t C = image[((y + height) * imageWidth) + x];
180                    f32_t B = image[(y * imageWidth) + x + width];
181                    return D-B-C+A;
182                 }
183                 inline boolean isAFaceStage(__global cascade_t *cascadeContext, __global scale_t *scale, i4 x, i4 y, f4 vnorm, __global f4 *integral, __global stage_t *stagePtr, __global tree_t *treeTable, __global feature_t *featureTable){
184                    f32_t sumOfThisStage = 0;
185                    for (s32_t treeId = stagePtr->firstTreeId; treeId < (stagePtr->firstTreeId+stagePtr->treeCount); treeId++) {
186                        // featureId from 0 to how many roots there are.... we use -1 for none! hence s32_t
187                        const __global tree_t *treePtr = &treeTable[treeId];
188                        s32_t featureId = treePtr->firstFeatureId;
189                        while (featureId >= 0) {
190                            const __global feature_t *featurePtr = &featureTable[featureId];
191                            f32_t featureGradientSum = .0f;
192                            for (s32_t i = 0; i < 3; i++) {
193                                const __global rect_t  *rect = &featurePtr->rects[i];
194                                featureGradientSum +=   featurePtr->rects[i].weight *
195                                    gradient(integral,  cascadeContext->imageWidth,
196                                        x + (int) (rect->x * scale->scaleValue),
197                                        y + (int) (rect->y * scale->scaleValue),
198                                        (int) (rect->width * scale->scaleValue),
199                                        (int) (rect->height * scale->scaleValue)
200                                    ) ;
201                            }
202                            if ((featureGradientSum * scale->invArea) < (featurePtr->threshold * vnorm)) {//left
203                               if (featurePtr->left.hasValue) {
204                                   sumOfThisStage += featurePtr->left.anon.value;
205                                   featureId = -1;
206                               } else {
207                                   featureId = treePtr->firstFeatureId+featurePtr->left.anon.featureId;
208                               }
209                            }else{ // right
210                               if (featurePtr->right.hasValue) {
211                                   sumOfThisStage += featurePtr->right.anon.value;
212                                   featureId = -1;
213                               } else {
214                                   featureId = treePtr->firstFeatureId+featurePtr->right.anon.featureId;
215                               }
216                            }
217                        }
218                    }
219                    return sumOfThisStage > stagePtr->threshold;
220                 }
221                 __kernel void singlePassCascadeKernel(__global cascade_t *cascadeContext, __global f4 *integral, __global f4 *integralSq, __global scale_t *scaleTable, __global result_t *resultTable, __global stage_t *stageTable, __global tree_t *treeTable, __global feature_t *featureTable){
222                    SCOPE_START
223 
224                    size_t gid = ndrange.id.x;
225                    if (gid < cascadeContext->multiScaleAccumulativeRange){
226                       s32_t i;
227                       // This is where we select the scale to use.
228                       for (i=0; gid >=scaleTable[i].accumGridSizeMax; i++)
229                          ;
230 
231                       __global scale_t *scale = &scaleTable[i];
232 
233                       s16_t x = (s16_t)(((gid-scale->accumGridSizeMin) % scale->gridWidth) * scale->scaledXInc);
234                       s16_t y = (s16_t)(((gid-scale->accumGridSizeMin) / scale->gridWidth) * scale->scaledYInc);
235 
236                       f32_t integralGradient = gradient(integral, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
237                       f32_t integralSqGradient = gradient(integralSq, cascadeContext->imageWidth, x, y, scale->scaledFeatureWidth, scale->scaledFeatureHeight) * scale->invArea;
238 
239                       f32_t vnorm = integralSqGradient - integralGradient * integralGradient;
240                       vnorm =  (vnorm > 1) ? sqrt(vnorm) : 1;
241 
242                       bool stillLooksLikeAFace = true;
243 
244                       for (s32_t stageId = 0; stillLooksLikeAFace && (stageId < cascadeContext->stageCount); stageId++) {
245                          __global stage_t *stagePtr = &stageTable[stageId];
246                          stillLooksLikeAFace =isAFaceStage(cascadeContext, scale,  x, y,  vnorm,  integral,  stagePtr, treeTable, featureTable);
247                       }
248                       if (stillLooksLikeAFace) {
249                          s32_t index = atomicInc(&cascadeContext->atomicResultTableCount);
250                          if (index<cascadeContext->maxResults){
251                             resultTable[index].x = x;
252                             resultTable[index].y = y;
253                             resultTable[index].width = scale->scaledFeatureWidth;
254                             resultTable[index].height = scale->scaledFeatureHeight;
255                          }
256                        }
257                    }
258                 }
259                 """
260              );
261 
262 
263 
264         long progHandle = accelerator.bridge.compileProgram(c99.toString());
265         if (accelerator.bridge.programOK(progHandle)) {
266             long floatToShortKernel = accelerator.bridge.getKernel(progHandle, "floatToShortKernel");
267             long integralColKernel = accelerator.bridge.getKernel(progHandle, "integralColKernel");
268             long integralRowKernel = accelerator.bridge.getKernel(progHandle, "integralRowKernel");
269             long singlePassCascadeKernel = accelerator.bridge.getKernel(progHandle, "singlePassCascadeKernel");
270 
271 
272             // openCLBridge.dump(cascadeMemorySegment, ((OpenCLStructLayout.Tools)cascadeInterface).getLayout());
273             // openCLBridge.dump(scaleTable,scaleTableLayout.layout);
274 
275 
276             //   openCLBridge.dump(treeTable, treeTableLayout.layout);
277             // openCLBridge.dump(stageTable, stageTableLayout.layout);
278             MemorySegment integralImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
279             MemorySegment integralSqImageMemorySegment = arena.allocateArray(JAVA_FLOAT, rgbImageLayout.getElementCount());
280 
281             // openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
282 
283             int groupSize = 256;
284             int range = ((multiScaleTable.multiScaleAccumulativeRange / groupSize) + ((multiScaleTable.multiScaleAccumulativeRange % groupSize) == 0 ? 0 : 1)) * groupSize;
285 
286             ImageLayout integralImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
287             ImageLayout integralSqImage = new ImageLayout(new BufferedImage(rgbImageLayout.getWidth(), rgbImageLayout.getHeight(), BufferedImage.TYPE_USHORT_GRAY));
288             ImageLayout.Instance integralImageInstance = integralImage.instance(arena);
289             ImageLayout.Instance integralSqImageInstance = integralSqImage.instance(arena);
290 
291             ImageLayout.Instance rgbImageLayoutInstance = rgbImageLayout.instance(arena);
292 
293             HaarVisualizer harViz = new HaarVisualizer(rgbImageLayoutInstance, haarCascade, integralImageInstance, integralSqImageInstance);
294 
295             accelerator.bridge.ndrange(integralColKernel, rgbImageLayout.getWidth(),
296                     DeviceArgs.of()
297                             .s08_1dRO(cascade.segment())
298                             .s08_1dRO(rgbImageLayoutInstance.memorySegment)
299                             .f32_1dWO(integralImageMemorySegment)
300                             .f32_1dWO(integralSqImageMemorySegment)
301             );
302 
303             accelerator.bridge.ndrange(integralRowKernel, rgbImageLayout.getHeight(),
304                     DeviceArgs.of()
305                             .s08_1dRO(cascade.segment())
306                             .f32_1dRW(integralImageMemorySegment)
307                             .f32_1dRW(integralSqImageMemorySegment)
308             );
309 
310             // This allows us to visualize the integral or integralSq image.
311             // We map the integral + integralSq floats to a grey image
312             accelerator.bridge.ndrange(floatToShortKernel,
313                     rgbImageLayout.getElementCount(),
314                     DeviceArgs.of()
315                             .s08_1dRO(cascade.segment())
316                             .f32_1dRO(integralImageMemorySegment)
317                             .u16_1dWO(integralImageInstance.memorySegment)
318                             .f32_1dRO(integralSqImageMemorySegment)
319                             .u16_1dWO(integralSqImageInstance.memorySegment)
320             );
321             harViz.showIntegrals();
322 
323 
324             String mode = System.getProperty("mode", "bridge");
325             System.out.println("Mode =" + mode);
326 
327             long start = System.currentTimeMillis();
328 */
329 
330         if (true) {
331             long start = System.currentTimeMillis();
332             WorkStealer.usingAllProcessors()
333                     .forEachInRange(accelerator.range(NDRange.of(NDRange.Global1D.of(scaleTable.multiScaleAccumulativeRange()))), kc -> {
334                         ReferenceJavaViolaJones.findFeatures(
335                                 kc.gix,
336                                 xmlCascade,//cascade,//haarCascade, //or cascade
337                                 integralImageF32,
338                                 integralSqImageF32,
339                                 scaleTable,
340                                 resultTable);
341                     });
342             long ms = (System.currentTimeMillis() - start);
343             System.out.println("done " + ms + "ms");
344             harViz.showResults(resultTable, null, null, ms);
345         }
346         //   } else if (mode.equals("javaSegments")) {
347 
348       /*  WorkStealer.of(1)
349                 .forEachInRange(multiScaleTable.multiScaleAccumulativeRange, gid -> {
350                     ReferenceJavaViolaJones.findFeatures(
351                             gid,
352                             cascade,
353                             harViz,
354                             null,
355                             integralImageInstance.memorySegment,
356                             integralSqImageInstance.memorySegment,
357                             scaleTable,
358                             resultTable,
359                             stageTable,
360                             treeTable,
361                             featureTable);
362                 }); */
363 
364 
365                 /*
366             } else {
367                 accelerator.bridge.ndrange(singlePassCascadeKernel, range,
368                         DeviceArgs.of()
369                                 .s08_1dRW(cascade.segment()) // RW only  for atomicResult counter
370                                 .f32_1dRO(integralImageMemorySegment)
371                                 .f32_1dRO(integralSqImageMemorySegment)
372                                 .s08_1dRO(scaleTable.segment())
373                                 .s08_1dRW(resultTable.segment())
374                                 .s08_1dRO(stageTable.segment())
375                                 .s08_1dRO(treeTable.segment())
376                                 .s08_1dRO(featureTable.segment())
377 
378                 );
379             }
380             System.out.println("ms = " + (System.currentTimeMillis() - start));
381             //  openCLBridge.dump(cascadeMemorySegment, cascadeLayout.layout);
382             //  harViz.showResults(cascadeInstance.getAtomicResultTableCount(),cascadeInstance.getMaxResults(), resultTable, resultTableLayout);
383             harViz.showResults(cascade.getAtomicResultTableCount(), cascade.getMaxResults(), resultTable);
384 
385             accelerator.bridge.releaseKernel(integralColKernel);
386             accelerator.bridge.releaseKernel(integralRowKernel);
387             accelerator.bridge.releaseKernel(floatToShortKernel);
388             accelerator.bridge.releaseKernel(singlePassCascadeKernel);
389 
390             accelerator.bridge.releaseProgram(progHandle);
391         }
392         accelerator.bridge.release();
393 */
394 
395     }
396 }