Project

General

Profile

Statistics
| Branch: | Revision:

root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCL.cpp @ 9240aaa3

History | View | Annotate | Download (63.2 KB)

1
//////////////////////////////////////////////////////////////////////////////
2
//        File:                ProgramCL.cpp
3
//        Author:                Changchang Wu
4
//        Description :        implementation of CL related class.
5
//                            class ProgramCL                        A simple wrapper of Cg programs
6
//
7
//        Copyright (c) 2007 University of North Carolina at Chapel Hill
8
//        All Rights Reserved
9
//
10
//        Permission to use, copy, modify and distribute this software and its
11
//        documentation for educational, research and non-profit purposes, without
12
//        fee, and without a written agreement is hereby granted, provided that the
13
//        above copyright notice and the following paragraph appear in all copies.
14
//        
15
//        The University of North Carolina at Chapel Hill make no representations
16
//        about the suitability of this software for any purpose. It is provided
17
//        'as is' without express or implied warranty. 
18
//
19
//        Please send BUG REPORTS to ccwu@cs.unc.edu
20
//
21
////////////////////////////////////////////////////////////////////////////
22

    
23
#if defined(CL_SIFTGPU_ENABLED) 
24

    
25
#include <CL/opencl.h>
26
#include <GL/glew.h>
27

    
28
#include <iostream>
29
#include <iomanip>
30
#include <vector>
31
#include <strstream>
32
#include <algorithm>
33
#include <stdlib.h>
34
#include <math.h>
35
#include <string.h>
36
using namespace std;
37

    
38
#include "GlobalUtil.h"
39
#include "CLTexImage.h"
40
#include "ProgramCL.h"
41
#include "SiftGPU.h"
42

    
43

    
44
#if  defined(_WIN32) 
45
        #pragma comment (lib, "OpenCL.lib")
46
#endif
47

    
48
#ifndef _INC_WINDOWS
49
#ifndef WIN32_LEAN_AND_MEAN
50
        #define WIN32_LEAN_AND_MEAN
51
#endif
52
#include <windows.h>
53
#endif 
54

    
55
//////////////////////////////////////////////////////////////////////
56
// Construction/Destruction
57
//////////////////////////////////////////////////////////////////////
58

    
59
ProgramCL::ProgramCL()
60
{
61
        _program = NULL;
62
    _kernel = NULL;
63
    _valid = 0;
64
}
65

    
66
ProgramCL::~ProgramCL()
67
{
68
    if(_kernel)  clReleaseKernel(_kernel);
69
    if(_program) clReleaseProgram(_program);
70
}
71

    
72
ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1)
73
{
74
    const char * src[1] = {code};     cl_int status;
75

    
76
    _program = clCreateProgramWithSource(context, 1, src, NULL, &status);
77
    if(status != CL_SUCCESS) _valid = 0;
78

    
79
    status = clBuildProgram(_program, 0, NULL, 
80
        GlobalUtil::_debug ? 
81
        "-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" : 
82
        "-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL);
83

    
84
    if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;}
85
    else if(GlobalUtil::_debug) PrintBuildLog(device, 0); 
86

    
87
    _kernel = clCreateKernel(_program, name, &status); 
88
    if(status != CL_SUCCESS) _valid = 0;
89
}
90

    
91
void ProgramCL::PrintBuildLog(cl_device_id device, int all)
92
{
93
    char buffer[10240] = "\0"; 
94
    cl_int status = clGetProgramBuildInfo(
95
        _program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
96
    if(all )  
97
    {
98
        std::cerr << buffer  << endl; 
99
    }else
100
    {
101
        const char * pos = strstr(buffer, "ptxas");
102
        if(pos) std::cerr << pos << endl; 
103
    }
104
}
105

    
106
///////////////////////////////////////////////////////////////////////////////////
107
/////////////////////////////////PACKED VERSION?///////////////////////////////////
108

    
109
ProgramBagCL::ProgramBagCL()
110
{
111
    ////////////////////////////////////
112
    _context = NULL;   _queue = NULL;
113
    s_gray = s_sampling = NULL;
114
    s_packup = s_zero_pass = NULL;
115
    s_gray_pack = s_unpack = NULL;
116
    s_sampling_u = NULL;
117
    s_dog_pass   = NULL;
118
    s_grad_pass  = NULL;
119
    s_grad_pass2 = NULL;
120
    s_unpack_dog = NULL;
121
    s_unpack_grd = NULL;
122
    s_unpack_key = NULL;
123
    s_keypoint = NULL;
124
    f_gaussian_skip0 = NULL;
125
    f_gaussian_skip1 = NULL;
126
    f_gaussian_step = 0;
127

    
128
    ////////////////////////////////
129
        GlobalUtil::StartTimer("Initialize OpenCL");
130
    if(!InitializeContext()) return;
131
    GlobalUtil::StopTimer();
132

    
133
}
134

    
135

    
136

    
137
ProgramBagCL::~ProgramBagCL()
138
{
139
    if(s_gray)      delete s_gray;
140
        if(s_sampling)  delete s_sampling;
141
        if(s_zero_pass) delete s_zero_pass;
142
    if(s_packup)    delete s_packup;
143
    if(s_unpack)    delete s_unpack;
144
    if(s_gray_pack) delete s_gray_pack;
145
    if(s_sampling_u)  delete s_sampling_u;
146
    if(s_dog_pass)  delete s_dog_pass;
147
    if(s_grad_pass)  delete s_grad_pass;
148
    if(s_grad_pass2)  delete s_grad_pass2;
149
    if(s_unpack_dog) delete s_unpack_dog;
150
    if(s_unpack_grd) delete s_unpack_grd;
151
    if(s_unpack_key) delete s_unpack_key;
152
    if(s_keypoint)   delete s_keypoint;
153

    
154
    if(f_gaussian_skip1) delete f_gaussian_skip1;
155

    
156
    for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
157
    {
158
            if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i];
159
    }
160
    if(f_gaussian_step && _gaussian_step_num > 0) 
161
    {
162
            for(int i = 0; i< _gaussian_step_num; i++)
163
            {
164
                    delete f_gaussian_step[i];
165
            }
166
            delete[] f_gaussian_step;
167
    }
168

    
169
    //////////////////////////////////////
170
    if(_context) clReleaseContext(_context);
171
    if(_queue) clReleaseCommandQueue(_queue);
172
}
173

    
174
bool ProgramBagCL::InitializeContext()
175
{
176
    cl_uint num_platform, num_device;
177
    cl_int status;
178
    // Get OpenCL platform count
179
    status = clGetPlatformIDs (0, NULL, &num_platform);
180
    if (status != CL_SUCCESS || num_platform == 0) return false; 
181

    
182
    cl_platform_id platforms[16];
183
    if(num_platform > 16 ) num_platform = 16;
184
    status = clGetPlatformIDs (num_platform, platforms, NULL);
185
    _platform = platforms[0];
186

    
187
    ///////////////////////////////
188
    status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device);
189
    if(status != CL_SUCCESS || num_device == 0) return false;
190

    
191
    // Create the device list
192
    cl_device_id* devices = new cl_device_id [num_device];
193
    status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
194
    _device = (status == CL_SUCCESS? devices[0] : 0);  delete[] devices;   
195
    if(status != CL_SUCCESS)  return false;  
196

    
197

    
198
    if(GlobalUtil::_verbose)
199
    {
200
        cl_device_mem_cache_type is_gcache; 
201
        clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL);
202
        if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n";
203
        //else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
204
        //else std::cout << "Read/Write cache for global memory\n";
205
    }
206

    
207
    //context;
208
    if(GlobalUtil::_UseSiftGPUEX)
209
    {
210
        cl_context_properties prop[] = {
211
        CL_CONTEXT_PLATFORM, (cl_context_properties)_platform,
212
        CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
213
        CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),  0 };
214
        _context = clCreateContext(prop, 1, &_device, NULL, NULL, &status);    
215
        if(status != CL_SUCCESS) return false;
216
    }else
217
    {
218
        _context = clCreateContext(0, 1, &_device, NULL, NULL, &status);    
219
        if(status != CL_SUCCESS) return false;
220
    }
221

    
222
    //command queue
223
    _queue = clCreateCommandQueue(_context, _device, 0, &status);
224
    return status == CL_SUCCESS;
225
}
226

    
227
void ProgramBagCL::InitProgramBag(SiftParam&param)
228
{
229
        GlobalUtil::StartTimer("Load Programs");
230
    LoadFixedShaders();
231
    LoadDynamicShaders(param);
232
    if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
233
    GlobalUtil::StopTimer();
234
}
235

    
236

    
237
void ProgramBagCL::UnloadProgram()
238
{
239

    
240
}
241

    
242
void ProgramBagCL::FinishCL()
243
{
244
    clFinish(_queue);
245
}
246

    
247
void ProgramBagCL::LoadFixedShaders()
248
{
249

    
250
 
251
    s_gray = new ProgramCL( "gray", 
252
        "__kernel void gray(__read_only  image2d_t input, __write_only image2d_t output) {\n"
253
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
254
        "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
255
        "float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
256
        "float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
257
            "float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
258
        "write_imagef(output, coord, result); }", _context, _device        );
259

    
260

    
261
        s_sampling = new ProgramCL("sampling",
262
        "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output,\n"
263
        "                   int width, int height) {\n"
264
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
265
        "int x = get_global_id(0), y =  get_global_id(1); \n"
266
        "if( x >= width || y >= height) return;\n"
267
        "int xa = x + x,   ya = y + y; \n"
268
        "int xb = xa + 1,  yb = ya + 1; \n"
269
        "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
270
        "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
271
        "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
272
        "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
273
        "float4 result = (float4) (v1, v2, v3, v4);"
274
        "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
275

    
276
        s_sampling_k = new ProgramCL("sampling_k",
277
        "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
278
        "                   int width, int height,\n"
279
        "                   int step,  int halfstep) {\n"
280
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
281
        "int x = get_global_id(0), y =  get_global_id(1); \n"
282
        "if( x >= width || y >= height) return;\n"
283
        "int xa = x * step,   ya = y *step; \n"
284
        "int xb = xa + halfstep,  yb = ya + halfstep; \n"
285
        "float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
286
        "float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
287
        "float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
288
        "float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
289
        "float4 result = (float4) (v1, v2, v3, v4);"
290
        "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
291

    
292

    
293
        s_sampling_u = new ProgramCL("sampling_u",
294
        "__kernel void sampling_u(__read_only  image2d_t input, \n"
295
        "                   __write_only image2d_t output,\n"
296
        "                   int width, int height,\n"
297
        "                   float step, float halfstep) {\n"
298
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
299
        "int x = get_global_id(0), y =  get_global_id(1); \n"
300
        "if( x >= width || y >= height) return;\n"
301
        "float xa = x * step,       ya = y *step; \n"
302
        "float xb = xa + halfstep,  yb = ya + halfstep; \n"
303
        "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
304
        "float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
305
        "float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
306
        "float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
307
        "float4 result = (float4) (v1, v2, v3, v4);"
308
        "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
309

    
310

    
311
    s_zero_pass = new ProgramCL("zero_pass",
312
        "__kernel void zero_pass(__write_only image2d_t output){\n"
313
        "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
314
        "write_imagef(output, coord, (float4)(0.0));}", _context, _device);
315

    
316
    s_packup = new ProgramCL("packup",
317
        "__kernel void packup(__global float* input, __write_only image2d_t output,\n"
318
        "                   int twidth, int theight, int width){\n"
319
        "int2 coord = (int2)(get_global_id(0),  get_global_id(1));\n"
320
        "if(coord.x >= twidth || coord.y >= theight) return;\n"
321
        "int index0 = (coord.y + coord.y) * width; \n"
322
        "int index1 = index0 + coord.x;\n"
323
        "int x2 = min(width -1, coord.x); \n"
324
        "float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
325
        "int index2 = index1 + width; \n"
326
        "float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
327
        "write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
328

    
329
   s_dog_pass = new ProgramCL("dog_pass",
330
        "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
331
        "                   __write_only image2d_t dog, int width, int height) {\n"
332
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
333
        "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
334
        "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
335
        "if( coord.x >= width || coord.y >= height) return;\n"
336
        "float4 cc = read_imagef(tex , sampler, coord); \n"
337
        "float4 cp = read_imagef(texp, sampler, coord);\n"
338
        "write_imagef(dog, coord, cc - cp); }\n", _context, _device); 
339

    
340
   s_grad_pass = new ProgramCL("grad_pass",
341
        "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
342
        "                   __write_only image2d_t dog, int width, int height,\n"
343
        "                    __write_only image2d_t grad, __write_only image2d_t rot) {\n"
344
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
345
        "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
346
        "int x = get_global_id(0), y =  get_global_id(1); \n"
347
        "if( x >= width || y >= height) return;\n"
348
        "int2 coord = (int2) (x, y);\n"
349
        "float4 cc = read_imagef(tex , sampler, coord); \n"
350
        "float4 cp = read_imagef(texp, sampler, coord);\n"
351
        "float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
352
        "float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
353
            "float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
354
        "float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
355
        "write_imagef(dog, coord, cc - cp); \n"
356
        "float4 dx = (float4)(cc.y - cl.x,  cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
357
        "float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
358
        "write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
359
        "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device); 
360

    
361
    s_grad_pass2 = new ProgramCL("grad_pass2",
362
        "#define BLOCK_DIMX 32\n"
363
        "#define BLOCK_DIMY 14\n"
364
        "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
365
        "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
366
        "                   __write_only image2d_t dog, int width, int height,\n"
367
        "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
368
        "__local float block[BLOCK_SIZE * 4]; \n"
369
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
370
        "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
371
        "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
372
        "                       get_global_id(1) - get_group_id(1) * 2- 1); \n"
373
        "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
374
        "float4 cc = read_imagef(tex, sampler, coord);\n"
375
        "block[idx                 ] = cc.x;\n"
376
        "block[idx + BLOCK_SIZE    ] = cc.y;\n"
377
        "block[idx + BLOCK_SIZE * 2] = cc.z;\n"
378
        "block[idx + BLOCK_SIZE * 3] = cc.w;\n"
379
        "barrier(CLK_LOCAL_MEM_FENCE);\n"
380
        "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
381
        "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
382
        "if( coord.x >= width) return; \n"
383
        "if( coord.y >= height) return;\n"
384
        "float4 cp = read_imagef(texp, sampler, coord);\n"
385
        "float4 dx = (float4)(  cc.y - block[idx - 1 + BLOCK_SIZE], \n"
386
        "                       block[idx + 1] - cc.x, \n"
387
        "                       cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
388
        "                       block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
389
        "float4 dy = (float4)(  cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
390
        "                       cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
391
        //"                       cc.zw - block[idx - BLOCK_DIMX].zw, \n"
392
        "                       block[idx + BLOCK_DIMX] - cc.x,\n "
393
        "                       block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
394
        //"                       block[idx + BLOCK_DIMX].xy - cc.xy);\n"
395
        "write_imagef(dog, coord, cc - cp); \n"
396
        "write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
397
        "write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device); 
398
}
399

    
400
void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
401
{
402
    LoadKeypointShader();
403
    LoadGenListShader(param._dog_level_num, 0);
404
    CreateGaussianFilters(param);
405
}
406

    
407

    
408
void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam&param)
409
{
410
    float sigma = param.GetInitialSmoothSigma(octave_min);
411
    if(sigma == 0) 
412
    {
413
       f_gaussian_skip0 = NULL; 
414
    }else
415
    {
416
            for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++)
417
            {
418
                    if(f_gaussian_skip0_v[i]->_id == octave_min)
419
                    {
420
                            f_gaussian_skip0 = f_gaussian_skip0_v[i];
421
                            return ;
422
                    }
423
            }
424
            FilterCL * filter = CreateGaussianFilter(sigma); 
425
            filter->_id = octave_min;
426
            f_gaussian_skip0_v.push_back(filter);
427
            f_gaussian_skip0 = filter; 
428
    }
429

    
430
}
431

    
432
void ProgramBagCL::CreateGaussianFilters(SiftParam&param)
433
{
434
        if(param._sigma_skip0>0.0f) 
435
        {
436
                f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0);
437
                f_gaussian_skip0->_id = GlobalUtil::_octave_min_default; 
438
                f_gaussian_skip0_v.push_back(f_gaussian_skip0);
439
        }
440
        if(param._sigma_skip1>0.0f) 
441
        {
442
                f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1);
443
        }
444

    
445
        f_gaussian_step = new FilterCL*[param._sigma_num];
446
        for(int i = 0; i< param._sigma_num; i++)
447
        {
448
                f_gaussian_step[i] =  CreateGaussianFilter(param._sigma[i]);
449
        }
450
    _gaussian_step_num = param._sigma_num;
451
}
452

    
453

    
454
FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
455
{
456
        //pixel inside 3*sigma box
457
        int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
458
        int width = 2*sz + 1;
459

    
460
        //filter size truncation
461
        if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth)
462
        {
463
                std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl;
464
                sz = GlobalUtil::_MaxFilterWidth>>1;
465
                width = 2 * sz + 1;
466
        }
467

    
468
        int i;
469
        float * kernel = new float[width];
470
        float   rv = 1.0f/(sigma*sigma);
471
        float   v, ksum =0; 
472

    
473
        // pre-compute filter
474
        for( i = -sz ; i <= sz ; ++i) 
475
        {
476
                kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
477
                ksum += v;
478
        }
479

    
480
        //normalize the kernel
481
        rv = 1.0f / ksum;
482
        for(i = 0; i< width ;i++) kernel[i]*=rv;
483

    
484
    FilterCL * filter = CreateFilter(kernel, width);
485
    delete [] kernel;
486
    if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl;
487
    return filter;
488
}
489

    
490
FilterCL*  ProgramBagCL::CreateFilter(float kernel[], int width)
491
{
492
    FilterCL * filter = new FilterCL;
493
    filter->s_shader_h = CreateFilterH(kernel, width); 
494
    filter->s_shader_v = CreateFilterV(kernel, width);
495
    filter->_size = width; 
496
    filter->_id  = 0;
497
    return filter;
498
}
499

    
500
ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width)
501
{
502
        int halfwidth  = width >>1;
503
        float * pf = kernel + halfwidth;
504
        int nhpixel = (halfwidth+1)>>1;        //how many neighbour pixels need to be looked up
505
        int npixel  = (nhpixel<<1)+1;//
506
        float weight[3];
507

    
508
    ////////////////////////////
509
        char buffer[10240];
510
        ostrstream out(buffer, 10240);
511
        out<<setprecision(8);
512

    
513

    
514
    //CL_DEVICE_IMAGE2D_MAX_WIDTH
515
        out<<
516
          "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
517
          "__kernel void filter_h(__read_only  image2d_t input, \n"
518
          "          __write_only image2d_t output, int width_, int height_) {\n"
519
          "int x = get_global_id(0);\n"
520
          "int y = get_global_id(1); \n"
521
          "if( x > width_ || y > height_) return; \n"
522
          "float4 pc; int2 coord; \n"
523
          "float4 result = (float4)(0.0);\n";
524
    for(int i = 0 ; i < npixel ; i++)
525
        {
526
                out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n";
527
                out<<"pc= read_imagef(input, sampler, coord);\n";
528
                if(GlobalUtil::_PreciseBorder)        
529
        out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
530
                //for each sub-pixel j  in center, the weight of sub-pixel k 
531
                int xw = (i - nhpixel)*2;
532
                for(int j = 0; j < 3; j++)
533
                {
534
                        int xwn = xw  + j  -1;
535
                        weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
536
                }
537
                if(weight[1] == 0.0)
538
                {
539
                        out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n";
540
                }
541
                else
542
                {
543
                        out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n";
544
                        out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n";
545
                }        
546
        }
547
    out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
548
        return new ProgramCL("filter_h", buffer, _context, _device); 
549
}
550

    
551

    
552

    
553
ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width)
554
{
555

    
556
        int halfwidth  = width >>1;
557
        float * pf = kernel + halfwidth;
558
        int nhpixel = (halfwidth+1)>>1;        //how many neighbour pixels need to be looked up
559
        int npixel  = (nhpixel<<1)+1;//
560
        float weight[3];
561

    
562
    ////////////////////////////
563
        char buffer[10240];
564
        ostrstream out(buffer, 10240);
565
        out<<setprecision(8);
566

    
567

    
568
    //CL_DEVICE_IMAGE2D_MAX_WIDTH
569
        out<< 
570
          "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
571
          "__kernel void filter_v(__read_only  image2d_t input, \n"
572
          "          __write_only image2d_t output, int width_, int height_) {\n"
573
          "int x = get_global_id(0);\n"
574
          "int y = get_global_id(1); \n"
575
          "if( x > width_ || y >= height_) return; \n"
576
          "float4 pc; int2 coord; \n"
577
          "float4 result = (float4)(0.0);\n";
578
    for(int i = 0 ; i < npixel ; i++)
579
        {
580
                out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n";
581
                out<<"pc= read_imagef(input, sampler, coord);\n";
582
                if(GlobalUtil::_PreciseBorder)        
583
        out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
584
                //for each sub-pixel j  in center, the weight of sub-pixel k 
585
                int xw = (i - nhpixel)*2;
586
                for(int j = 0; j < 3; j++)
587
                {
588
                        int xwn = xw  + j  -1;
589
                        weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
590
                }
591
                if(weight[1] == 0.0)
592
                {
593
                        out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n";
594
                }
595
                else
596
                {
597
                        out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n";
598
                        out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n";
599
                }        
600
        }
601
    out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0';
602
        return new ProgramCL("filter_v", buffer, _context, _device); 
603

    
604
}
605

    
606
void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
607
{
608
    cl_kernel kernelh = filter->s_shader_h->_kernel;
609
    cl_kernel kernelv = filter->s_shader_v->_kernel;
610
    //////////////////////////////////////////////////////////////////
611

    
612
    cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
613
    cl_int w_ = w - 1, h_ = h - 1; 
614

    
615
    size_t dim0 = 16, dim1 = 16;
616
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
617

    
618
    clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
619
    clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
620
    clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
621
    clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
622
    status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL);
623
    CheckErrorCL(status, "ProgramBagCL::FilterImageH");
624
    if(status != CL_SUCCESS) return;
625

    
626
    clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
627
    clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
628
    clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
629
    clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
630
    size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0};
631
    status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
632
    CheckErrorCL(status, "ProgramBagCL::FilterImageV");
633
    //clReleaseEvent(event);
634
}
635

    
636
void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale)
637
{
638
    cl_kernel  kernel= s_sampling_u->_kernel; 
639
    float scale  = 1.0f / (1 << log_scale);
640
    float offset = scale * 0.5f; 
641
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
642
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
643
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
644
    clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
645
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
646
    clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale));
647
    clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset));
648

    
649
    size_t dim0 = 16, dim1 = 16;
650
    //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
651
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
652
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
653
    CheckErrorCL(status, "ProgramBagCL::SampleImageU");
654
}
655

    
656
void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
657
{
658
    cl_kernel  kernel; 
659
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); 
660
    if(log_scale == 1)
661
    {
662
        kernel = s_sampling->_kernel;
663
        clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
664
        clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
665
        clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
666
        clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
667
    }else
668
    {
669
        cl_int fullstep = (1 << log_scale);
670
        cl_int halfstep = fullstep >> 1;
671
        kernel = s_sampling_k->_kernel;
672
        clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
673
        clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
674
        clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
675
        clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
676
        clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
677
        clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep));
678
    }
679
    size_t dim0 = 128, dim1 = 1;
680
    //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
681
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
682
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
683
    CheckErrorCL(status, "ProgramBagCL::SampleImageD");
684
}
685

    
686
void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
687
{
688
    if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
689
}
690

    
691
void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
692
{
693
    if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
694
}
695

    
696
void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
697
{
698
    int margin = 0, use_gm2 = 1; 
699
    bool both_grad_dog = rot->_clData && grad->_clData;
700
    cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight();
701
    cl_kernel  kernel ; size_t dim0, dim1; 
702
    if(!both_grad_dog)  {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; }
703
    else if(use_gm2)    {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; }
704
    else                {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; }
705
    size_t gsz[2] = {   (w + dim0 - 1 - margin) / (dim0 - margin) * dim0,
706
                        (h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
707
    size_t lsz[2] = {dim0, dim1};
708
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData));
709
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData));
710
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData));
711
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
712
    clSetKernelArg(kernel, 4, sizeof(cl_int), &(h)); 
713
    if(both_grad_dog)
714
    {
715
        clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData));
716
        clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData));
717
    }
718
    ///////////////////////////////////////////////////////
719
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
720
    CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
721
}
722

    
723

    
724
void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge)
725
{
726
    cl_kernel  kernel = s_keypoint->_kernel; 
727
    cl_int w = key->GetImgWidth(), h = key->GetImgHeight();
728
        float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f);
729
        float threshold1 = Tdog;
730
        float threshold2 = (Tedge+1)*(Tedge+1)/Tedge;
731
        
732
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
733
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData));
734
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData));
735
    clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData));
736
    clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0));
737
    clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1));
738
    clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2));
739
    clSetKernelArg(kernel, 7, sizeof(cl_int), &(w));
740
    clSetKernelArg(kernel, 8, sizeof(cl_int), &(h)); 
741

    
742
    size_t dim0 = 8, dim1 = 8;
743
    //if( w * h / dim0 / dim1 < 16) dim1 /= 2; 
744
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
745
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
746
    CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
747
}
748

    
749
void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
750
{
751
    cl_kernel  kernel = s_unpack->_kernel; 
752
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
753
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
754
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
755
    clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
756
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
757
    const size_t dim0 = 16, dim1 = 16;
758
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
759
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
760

    
761
    CheckErrorCL(status, "ProgramBagCL::UnpackImage");
762
    FinishCL();
763

    
764
}
765

    
766
void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
767
{
768
    if(s_unpack_dog == NULL) return; 
769
    cl_kernel  kernel = s_unpack_dog->_kernel; 
770
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
771
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
772
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
773
    clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
774
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
775
    const size_t dim0 = 16, dim1 = 16;
776
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
777
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
778

    
779
    CheckErrorCL(status, "ProgramBagCL::UnpackImage");
780
    FinishCL();
781
}
782

    
783
void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
784
{
785
    if(s_unpack_grd == NULL) return; 
786
    cl_kernel  kernel = s_unpack_grd->_kernel; 
787
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
788
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
789
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
790
    clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
791
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
792
    const size_t dim0 = 16, dim1 = 16;
793
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
794
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
795

    
796
    CheckErrorCL(status, "ProgramBagCL::UnpackImage");
797
    FinishCL();
798
}
799
void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
800
{
801
    if(s_unpack_key == NULL) return;
802
    cl_kernel  kernel = s_unpack_key->_kernel; 
803
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight();
804
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData));
805
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData));
806
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData));
807
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(w));
808
    clSetKernelArg(kernel, 4, sizeof(cl_int), &(h));
809
    const size_t dim0 = 16, dim1 = 16;
810
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
811
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
812

    
813
    CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
814
    FinishCL();
815
}
816
void ProgramBagCL::LoadDescriptorShader()
817
{
818
        GlobalUtil::_DescriptorPPT = 16;
819
        LoadDescriptorShaderF2();
820
}
821

    
822
void ProgramBagCL::LoadDescriptorShaderF2()
823
{
824

    
825
}
826

    
827
void ProgramBagCL::LoadOrientationShader(void)
828
{
829

    
830
}
831

    
832
void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev)
833
{
834

    
835
}
836

    
837
void ProgramBagCL::LoadKeypointShader()
838
{
839
    int i;    char buffer[20240];
840
        ostrstream out(buffer, 20240);
841
        streampos pos;
842

    
843
        //tex(X)(Y)
844
        //X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)  
845
        //Y: (CDU) (CENTER 0, DOWN -1, UP    +1) 
846
        out<<
847
    "__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
848
    "           __read_only image2d_t texD, __write_only image2d_t texK,\n"
849
    "          float THRESHOLD0, float THRESHOLD1, \n"
850
    "          float THRESHOLD2, int width, int height)\n"
851
        "{\n"
852
    "   sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
853
    "         CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
854
    "   int x = get_global_id(0), y = get_global_id(1);\n"
855
    "   if(x  >= width || y >= height) return; \n"
856
    "   int  xp = x - 1, xn = x + 1;\n"
857
    "   int  yp = y - 1, yn = y + 1;\n"
858
    "   int2 coord0 = (int2) (x, y); \n"
859
    "   int2 coord1 = (int2) (xp, y); \n"
860
    "   int2 coord2 = (int2) (xn, y); \n"
861
    "   int2 coord3 = (int2) (x, yp); \n"
862
    "   int2 coord4 = (int2) (x, yn); \n"
863
    "   int2 coord5 = (int2) (xp, yp); \n"
864
    "   int2 coord6 = (int2) (xp, yn); \n"
865
    "   int2 coord7 = (int2) (xn, yp); \n"
866
    "   int2 coord8 = (int2) (xn, yn); \n"
867
    "        float4 ccc = read_imagef(tex, sampler,coord0);\n"
868
        "        float4 clc = read_imagef(tex, sampler,coord1);\n"
869
        "        float4 crc = read_imagef(tex, sampler,coord2);\n"
870
        "        float4 ccd = read_imagef(tex, sampler,coord3);\n"
871
        "        float4 ccu = read_imagef(tex, sampler,coord4);\n"
872
        "        float4 cld = read_imagef(tex, sampler,coord5);\n"
873
        "        float4 clu = read_imagef(tex, sampler,coord6);\n"
874
        "        float4 crd = read_imagef(tex, sampler,coord7);\n"
875
        "        float4 cru = read_imagef(tex, sampler,coord8);\n"
876
    "        float4   cc = ccc;\n"
877
        "        float4  v1[4], v2[4];\n"
878
        "        v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
879
        "        v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
880
        "        v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
881
        "        v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
882
        "        v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
883
        "        v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
884
        "        v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
885
        "        v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
886
    "   float4 key4 = (float4)(0); \n";
887
        //test against 8 neighbours
888
        //use variable to identify type of extremum
889
        //1.0 for local maximum and -1.0 for minimum
890
    for(i = 0; i < 4; ++i)
891
        out<<
892
    "   if(cc.s"<<i<<" > THRESHOLD0){ \n"
893
    "           if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n"
894
    "        }else if(cc.s"<<i<<" < -THRESHOLD0){ \n"
895
    "           if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n"
896
    "   }";
897

    
898
        out<<
899
    "   if(x ==0) {key4.x =  key4.z= 0; }\n"
900
    "   else if(x + 1 == width) {key4.y =  key4.w = 0;}\n"
901
    "   if(y ==0) {key4.x =   key4.y = 0; }\n"
902
    "   else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
903
    "   float4 ak = fabs(key4); \n"
904
    "   float keysum = ak.x + ak.y + ak.z + ak.w; \n"
905
        "        float4 result = (float4)(0.0);\n"
906
        "        if(keysum == 1.0) {\n"
907
        "        float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
908
        
909
    //do edge supression first.. 
910
        //vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
911
        //vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
912
    for(i = 0; i < 4; ++i)
913
        out <<
914
        "        if(key4.s"<<i<<" != 0)\n"
915
        "        {\n"
916
        "                float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n"
917
        "                float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n"
918
        "                float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n"
919
        "                fx["<<i<<"] = D5.x;        fy["<<i<<"] = D5.y ;\n"
920
        "                fxx["<<i<<"] = D2.x + D2.y;\n"
921
        "                fyy["<<i<<"] = D2.z + D2.w;\n"
922
        "                fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n"
923
        "                float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n"
924
        "                float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
925
        "                float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n"
926
        "                if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
927
        "        }\n";
928

    
929
    out << 
930
        "        if(keysum == 1) {\n";
931
        ////////////////////////////////////////////////
932
        //read 9 pixels of upper/lower level
933
        out<<
934
        "        float4  v4[4], v5[4], v6[4];\n"
935
        "        ccc = read_imagef(texU, sampler,coord0);\n"
936
        "        clc = read_imagef(texU, sampler,coord1);\n"
937
        "        crc = read_imagef(texU, sampler,coord2);\n"
938
        "        ccd = read_imagef(texU, sampler,coord3);\n"
939
        "        ccu = read_imagef(texU, sampler,coord4);\n"
940
        "        cld = read_imagef(texU, sampler,coord5);\n"
941
        "        clu = read_imagef(texU, sampler,coord6);\n"
942
        "        crd = read_imagef(texU, sampler,coord7);\n"
943
        "        cru = read_imagef(texU, sampler,coord8);\n"
944
    "        float4 cu = ccc;\n"
945
        "        v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
946
        "        v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
947
        "        v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
948
        "        v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
949
        "        v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
950
        "        v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
951
        "        v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
952
        "        v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
953

    
954
    for(i = 0; i < 4; ++i)
955
        out <<
956
        "        if(key4.s"<<i<<" == 1.0)\n"
957
        "        {\n"
958
        "                if(cc.s"<<i<<" < cu.s"<<i<<" || \n"
959
    "           any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
960
        "        }else if(key4.s"<<i<<" == -1.0)\n"
961
        "        {\n"
962
        "                if(cc.s"<<i<<" > cu.s"<<i<<" || \n"
963
    "           any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n"
964
        "        }\n";
965

    
966
    out <<
967
        "        if(keysum == 1.0) { \n";
968
    out <<
969
        "        ccc = read_imagef(texD, sampler,coord0);\n"
970
        "        clc = read_imagef(texD, sampler,coord1);\n"
971
        "        crc = read_imagef(texD, sampler,coord2);\n"
972
        "        ccd = read_imagef(texD, sampler,coord3);\n"
973
        "        ccu = read_imagef(texD, sampler,coord4);\n"
974
        "        cld = read_imagef(texD, sampler,coord5);\n"
975
        "        clu = read_imagef(texD, sampler,coord6);\n"
976
        "        crd = read_imagef(texD, sampler,coord7);\n"
977
        "        cru = read_imagef(texD, sampler,coord8);\n"
978
    "        float4 cd = ccc;\n"
979
        "        v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
980
        "        v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
981
        "        v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
982
        "        v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
983
        "        v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
984
        "        v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
985
        "        v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
986
        "        v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
987
    for(i = 0; i < 4; ++i)
988
    out <<
989
        "        if(key4.s"<<i<<" == 1.0)\n"
990
        "        {\n"
991
        "                if(cc.s"<<i<<" < cd.s"<<i<<" ||\n"
992
    "           any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
993
        "        }else if(key4.s"<<i<<" == -1.0)\n"
994
        "        {\n"
995
        "                if(cc.s"<<i<<" > cd.s"<<i<<" ||\n"
996
    "           any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n"
997
        "        }\n";
998

    
999
    out << 
1000
        "        if(keysum==1.0) {\n";
1001
        //////////////////////////////////////////////////////////////////////
1002
        if(GlobalUtil::_SubpixelLocalization)
1003
    {
1004
            out <<
1005
            "        float4 offset = (float4)(0); \n";
1006
        for(i = 1; i < 4; ++i)
1007
        out <<
1008
            "        if(key4.s"<<i<<" != 0) \n"
1009
            "        {\n"
1010
            "                cu.s0 = cu.s"<<i<<";        cd.s0 = cd.s"<<i<<";        cc.s0 = cc.s"<<i<<";        \n"
1011
            "                v4[0] = v4["<<i<<"];        v5[0] = v5["<<i<<"];                                                \n"
1012
            "                fxy[0] = fxy["<<i<<"];        fxx[0] = fxx["<<i<<"];        fyy[0] = fyy["<<i<<"];        \n"
1013
            "                fx[0] = fx["<<i<<"];        fy[0] = fy["<<i<<"];                                                \n"
1014
            "        }\n";
1015

    
1016
        out <<        
1017
            "        float fs = 0.5*( cu.s0 - cd.s0 );                                \n"
1018
            "        float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
1019
            "        float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
1020
            "        float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
1021
            "        float4 A0, A1, A2 ;                        \n"
1022
            "        A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]);        \n"
1023
            "        A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]);        \n"
1024
            "        A2 = (float4)(fxs, fys, fss, -fs);        \n"
1025
        "        float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0));                \n"
1026
            "        float maxa = max(max(x3.x, x3.y), x3.z);        \n"
1027
            "        if(maxa >= 1e-10 ) \n"
1028
            "        {                                                                                                \n"
1029
            "                if(x3.y ==maxa )                                                        \n"
1030
            "                {                                                                                        \n"
1031
            "                        float4 TEMP = A1; A1 = A0; A0 = TEMP;        \n"
1032
            "                }else if( x3.z == maxa )                                        \n"
1033
            "                {                                                                                        \n"
1034
            "                        float4 TEMP = A2; A2 = A0; A0 = TEMP;        \n"
1035
            "                }                                                                                        \n"
1036
            "                A0 /= A0.x;                                                                        \n"
1037
            "                A1 -= A1.x * A0;                                                        \n"
1038
            "                A2 -= A2.x * A0;                                                        \n"
1039
        "                float2 x2 = fabs((float2)(A1.y, A2.y));                \n"
1040
            "                if( x2.y > x2.x )                                                        \n"
1041
            "                {                                                                                        \n"
1042
            "                        float4 TEMP = A2.yzwx;                                        \n"
1043
            "                        A2.yzw = A1.yzw;                                                \n"
1044
            "                        A1.yzw = TEMP.xyz;                                                        \n"
1045
            "                        x2.x = x2.y;                                                        \n"
1046
            "                }                                                                                        \n"
1047
            "                if(x2.x >= 1e-10) {                                                                \n"
1048
            "                        A1.yzw /= A1.y;                                                                \n"
1049
            "                        A2.yzw -= A2.y * A1.yzw;                                        \n"
1050
            "                        if(fabs(A2.z) >= 1e-10) {\n"
1051
            "                                offset.z = A2.w /A2.z;                                    \n"
1052
            "                                offset.y = A1.w - offset.z*A1.z;                            \n"
1053
            "                                offset.x = A0.w - offset.z*A0.z - offset.y*A0.y;        \n"
1054
        "                                if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
1055
        "                   || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
1056
            "                        }\n"
1057
            "                }\n"
1058
            "        }\n"
1059
            <<"\n"
1060
        "        float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1061
            "        result = (float4)(keyv,  offset.xyz);\n"
1062
            "        }}}}\n"
1063
        "   write_imagef(texK, coord0, result);\n "
1064
            "}\n"        <<'\0';
1065
    }
1066
        else 
1067
    {
1068
        out << "\n"
1069
        "        float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
1070
        "        result =  (float4)(keyv, 0, 0, 0);\n"
1071
        "        }}}}\n"
1072
        "   write_imagef(texK, coord0, result);\n "
1073
        "}\n"        <<'\0';
1074
    }
1075

    
1076
    s_keypoint = new ProgramCL("keypoint", buffer, _context, _device);
1077
}
1078

    
1079
void ProgramBagCL::LoadDisplayShaders()
1080
{
1081
    //"uniform sampler2DRect tex; void main(){\n"
1082
    //"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy);        bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
1083
    //"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
1084
        s_unpack = new ProgramCL("main", 
1085
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1086
    "                   int width, int height) {\n"
1087
    "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1088
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1089
    "if(x >= width || y >= height) return;\n"
1090
    "int xx = x / 2, yy = y / 2; \n"
1091
    "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1092
    "float v1 = (x & 1 ? vv.w : vv.z); \n"
1093
    "float v2 = (x & 1 ? vv.y : vv.x);\n"
1094
    "float v = y & 1 ? v1 : v2;\n"
1095
    "float4 result = (float4) (v, v, v, 1);"
1096
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1097

    
1098
        s_unpack_dog = new ProgramCL("main", 
1099
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1100
    "                   int width, int height) {\n"
1101
    "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1102
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1103
    "if(x >= width || y >= height) return;\n"
1104
    "int xx = x / 2, yy = y / 2; \n"
1105
    "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1106
    "float v1 = (x & 1 ? vv.w : vv.z); \n"
1107
    "float v2 = (x & 1 ? vv.y : vv.x);\n"
1108
    "float v0 = y & 1 ? v1 : v2;\n"
1109
    "float v = 0.5 + 20.0 * v0;\n "
1110
    "float4 result = (float4) (v, v, v, 1);"
1111
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1112
        
1113
    s_unpack_grd = new ProgramCL("main", 
1114
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1115
    "                   int width, int height) {\n"
1116
    "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1117
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1118
    "if(x >= width || y >= height) return;\n"
1119
    "int xx = x / 2, yy = y / 2; \n"
1120
    "float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
1121
    "float v1 = (x & 1 ? vv.w : vv.z); \n"
1122
    "float v2 = (x & 1 ? vv.y : vv.x);\n"
1123
    "float v0 = y & 1 ? v1 : v2;\n"
1124
    "float v = 5.0 * v0;\n "
1125
    "float4 result = (float4) (v, v, v, 1);"
1126
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1127

    
1128
        s_unpack_key = new ProgramCL("main", 
1129
    "__kernel void main(__read_only  image2d_t dog,\n"
1130
    "                   __read_only image2d_t key,\n"
1131
    "                   __write_only image2d_t output,\n"
1132
    "                   int width, int height) {\n"
1133
    "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1134
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1135
    "if(x >= width || y >= height) return;\n"
1136
    "int xx = x / 2, yy = y / 2; \n"
1137
    "float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
1138
    "int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
1139
    "int k1 = (x & 1 ? cc.w : cc.z); \n"
1140
    "int k2 = (x & 1 ? cc.y : cc.x);\n"
1141
    "int k0 = y & 1 ? k1 : k2;\n"
1142
    "float4 result;\n"
1143
    "if(k0 != 0){\n"
1144
    "   //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
1145
    "   result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0,  0.0, 1.0)); \n"
1146
    "}else{"
1147
        "float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
1148
        "float v1 = (x & 1 ? vv.w : vv.z); \n"
1149
        "float v2 = (x & 1 ? vv.y : vv.x);\n"
1150
        "float v0 = y & 1 ? v1 : v2;\n"
1151
        "float v = 0.5 + 20.0 * v0;\n "
1152
        "result = (float4) (v, v, v, 1);"
1153
    "}\n"
1154
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1155
}
1156

    
1157

    
1158
void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax)
1159
{
1160

    
1161
}
1162

    
1163
void ProgramBagCL::SetGradPassParam(int texP)
1164
{
1165

    
1166
}
1167

    
1168
void ProgramBagCL::SetGenListEndParam(int ktex)
1169
{
1170

    
1171
}
1172

    
1173
void ProgramBagCL::SetDogTexParam(int texU, int texD)
1174
{
1175

    
1176
}
1177

    
1178
void ProgramBagCL::SetGenListInitParam(int w, int h)
1179
{
1180
        float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f,  (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f};
1181

    
1182
}
1183

    
1184

    
1185
void ProgramBagCL::SetGenListStartParam(float width, int tex0)
1186
{
1187

    
1188
}
1189

    
1190

    
1191

    
1192
void ProgramBagCL::SetGenListStepParam(int tex, int tex0)
1193
{
1194

    
1195
}
1196

    
1197
void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size)
1198
{
1199

    
1200
}
1201

    
1202
void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step)
1203
{
1204

    
1205
}
1206

    
1207

    
1208
void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step)
1209
{
1210

    
1211

    
1212
}
1213

    
1214
void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth,  float width, float height, float sigma)
1215
{
1216

    
1217
}
1218

    
1219

    
1220

    
1221
const char* ProgramBagCL::GetErrorString(cl_int error)
1222
{
1223
    static const char* errorString[] = {
1224
        "CL_SUCCESS",
1225
        "CL_DEVICE_NOT_FOUND",
1226
        "CL_DEVICE_NOT_AVAILABLE",
1227
        "CL_COMPILER_NOT_AVAILABLE",
1228
        "CL_MEM_OBJECT_ALLOCATION_FAILURE",
1229
        "CL_OUT_OF_RESOURCES",
1230
        "CL_OUT_OF_HOST_MEMORY",
1231
        "CL_PROFILING_INFO_NOT_AVAILABLE",
1232
        "CL_MEM_COPY_OVERLAP",
1233
        "CL_IMAGE_FORMAT_MISMATCH",
1234
        "CL_IMAGE_FORMAT_NOT_SUPPORTED",
1235
        "CL_BUILD_PROGRAM_FAILURE",
1236
        "CL_MAP_FAILURE",
1237
        "",
1238
        "",
1239
        "",
1240
        "",
1241
        "",
1242
        "",
1243
        "",
1244
        "",
1245
        "",
1246
        "",
1247
        "",
1248
        "",
1249
        "",
1250
        "",
1251
        "",
1252
        "",
1253
        "",
1254
        "CL_INVALID_VALUE",
1255
        "CL_INVALID_DEVICE_TYPE",
1256
        "CL_INVALID_PLATFORM",
1257
        "CL_INVALID_DEVICE",
1258
        "CL_INVALID_CONTEXT",
1259
        "CL_INVALID_QUEUE_PROPERTIES",
1260
        "CL_INVALID_COMMAND_QUEUE",
1261
        "CL_INVALID_HOST_PTR",
1262
        "CL_INVALID_MEM_OBJECT",
1263
        "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
1264
        "CL_INVALID_IMAGE_SIZE",
1265
        "CL_INVALID_SAMPLER",
1266
        "CL_INVALID_BINARY",
1267
        "CL_INVALID_BUILD_OPTIONS",
1268
        "CL_INVALID_PROGRAM",
1269
        "CL_INVALID_PROGRAM_EXECUTABLE",
1270
        "CL_INVALID_KERNEL_NAME",
1271
        "CL_INVALID_KERNEL_DEFINITION",
1272
        "CL_INVALID_KERNEL",
1273
        "CL_INVALID_ARG_INDEX",
1274
        "CL_INVALID_ARG_VALUE",
1275
        "CL_INVALID_ARG_SIZE",
1276
        "CL_INVALID_KERNEL_ARGS",
1277
        "CL_INVALID_WORK_DIMENSION",
1278
        "CL_INVALID_WORK_GROUP_SIZE",
1279
        "CL_INVALID_WORK_ITEM_SIZE",
1280
        "CL_INVALID_GLOBAL_OFFSET",
1281
        "CL_INVALID_EVENT_WAIT_LIST",
1282
        "CL_INVALID_EVENT",
1283
        "CL_INVALID_OPERATION",
1284
        "CL_INVALID_GL_OBJECT",
1285
        "CL_INVALID_BUFFER_SIZE",
1286
        "CL_INVALID_MIP_LEVEL",
1287
        "CL_INVALID_GLOBAL_WORK_SIZE",
1288
    };
1289

    
1290
    const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
1291

    
1292
    const int index = -error;
1293

    
1294
    return (index >= 0 && index < errorCount) ? errorString[index] : "";
1295
}
1296

    
1297
bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location)
1298
{
1299
    if(error == CL_SUCCESS) return true;
1300
        const char *errstr = GetErrorString(error);
1301
        if(errstr && errstr[0]) std::cerr << errstr; 
1302
        else std::cerr  << "Error " << error;
1303
        if(location) std::cerr  << " at " << location;                
1304
        std::cerr  << "\n";
1305
    exit(0);
1306
    return false;
1307

    
1308
}
1309

    
1310

    
1311
////////////////////////////////////////////////////////////////////////
1312
////////////////////////////////////////////////////////////////////////
1313

    
1314
void ProgramBagCLN::LoadFixedShaders()
1315
{
1316
           s_sampling = new ProgramCL("sampling",
1317
        "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1318
        "__kernel void sampling(__read_only  image2d_t input, __write_only image2d_t output, "
1319
        "                   int width, int height) {\n"
1320
        "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1321
        "if( coord.x >= width || coord.y >= height) return;\n"
1322
        "write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }"  , _context, _device); 
1323
    
1324
    s_sampling_k = new ProgramCL("sampling_k",
1325
        "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1326
        "__kernel void sampling_k(__read_only  image2d_t input, __write_only image2d_t output, "
1327
        "                   int width, int height, int step) {\n"
1328
        "int x = get_global_id(0), y =  get_global_id(1); \n"
1329
        "if( x >= width || y >= height) return;\n"
1330
        "int xa = x * step,   ya = y *step; \n"
1331
        "float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
1332
        "write_imagef(output, (int2) (x, y), v1); }"  , _context, _device);
1333

    
1334

    
1335
        s_sampling_u = new ProgramCL("sampling_u",
1336
        "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
1337
        "__kernel void sampling_u(__read_only  image2d_t input, \n"
1338
        "                   __write_only image2d_t output,\n"
1339
        "                   int width, int height, float step) {\n"
1340
        "int x = get_global_id(0), y =  get_global_id(1); \n"
1341
        "if( x >= width || y >= height) return;\n"
1342
        "float xa = x * step,       ya = y *step; \n"
1343
        "float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
1344
        "write_imagef(output, (int2) (x, y), (float4)(v1)); }"  , _context, _device);
1345

    
1346
   s_dog_pass = new ProgramCL("dog_pass",
1347
        "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1348
        "__kernel void dog_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1349
        "                   __write_only image2d_t dog, int width, int height) {\n"
1350
        "int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
1351
        "if( coord.x >= width || coord.y >= height) return;\n"
1352
        "float cc = read_imagef(tex , sampler, coord).x; \n"
1353
        "float cp = read_imagef(texp, sampler, coord).x;\n"
1354
        "write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device); 
1355

    
1356
   s_grad_pass = new ProgramCL("grad_pass",
1357
        "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1358
        "__kernel void grad_pass(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1359
        "                   __write_only image2d_t dog, int width, int height, \n"
1360
        "                   __write_only image2d_t grad, __write_only image2d_t rot) {\n"
1361
        "int x = get_global_id(0), y =  get_global_id(1); \n"
1362
        "if( x >= width || y >= height) return;\n"
1363
        "int2 coord = (int2) (x, y);\n"
1364
        "float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
1365
        "float cc = read_imagef(tex , sampler, coord).x; \n"
1366
        "float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
1367
        "float cp = read_imagef(texp, sampler, coord).x;\n"
1368
        "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1369
            "float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
1370
        "float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
1371
        "float dx = cr - cl, dy = cu - cd; \n"
1372
            "float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
1373
        "write_imagef(grad, coord, (float4)(gg));\n"
1374
        "float oo = atan2(dy, dx + FLT_MIN);\n"
1375
        "write_imagef(rot, coord, (float4)(oo));}\n", _context, _device); 
1376

    
1377
   s_grad_pass2 = new ProgramCL("grad_pass2",
1378
        "#define BLOCK_DIMX 32\n"
1379
        "#define BLOCK_DIMY 14\n"
1380
        "#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
1381
        "__kernel void grad_pass2(__read_only image2d_t tex,  __read_only image2d_t texp,\n"
1382
        "                   __write_only image2d_t dog, int width, int height,\n"
1383
        "                   __write_only image2d_t grd, __write_only image2d_t rot){\n"//,  __local float* block) {\n"
1384
        "__local float block[BLOCK_SIZE]; \n"
1385
        "sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1386
        "                    CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 
1387
        "int2 coord = (int2) (  get_global_id(0) - get_group_id(0) * 2 - 1, \n"
1388
        "                       get_global_id(1) - get_group_id(1) * 2 - 1); \n"
1389
        "int idx =  mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
1390
        "float cc = read_imagef(tex, sampler, coord).x;\n"
1391
        "block[idx] = cc;\n"
1392
        "barrier(CLK_LOCAL_MEM_FENCE);\n"
1393
        "if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
1394
        "if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
1395
        "if( coord.x >= width) return; \n"
1396
        "if( coord.y >= height) return;\n"
1397
        "float cp = read_imagef(texp, sampler, coord).x;\n"
1398
        "float dx = block[idx + 1] - block[idx - 1];\n"
1399
        "float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
1400
        "write_imagef(dog, coord, (float4)(cc - cp)); \n"
1401
        "write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
1402
        "write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device); 
1403
}
1404

    
1405
void ProgramBagCLN::LoadDisplayShaders()
1406
{
1407
        s_unpack = new ProgramCL("main", 
1408
    "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1409
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1410
    "                   int width, int height) {\n"
1411
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1412
    "if(x >= width || y >= height) return;\n"
1413
    "float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1414
    "float4 result = (float4) (v, v, v, 1);"
1415
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1416

    
1417
    s_unpack_grd = new ProgramCL("main", 
1418
    "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
1419
    "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1420
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1421
    "                   int width, int height) {\n"
1422
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1423
    "if(x >= width || y >= height) return;\n"
1424
    "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1425
    "float v = 5.0 * v0;  float4 result = (float4) (v, v, v, 1);"
1426
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1427

    
1428
        s_unpack_dog = new ProgramCL("main", 
1429
    "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1430
    "__kernel void main(__read_only  image2d_t input, __write_only image2d_t output,\n"
1431
    "                   int width, int height) {\n"
1432
    "int x = get_global_id(0), y =  get_global_id(1); \n"
1433
    "if(x >= width || y >= height) return;\n"
1434
    "float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
1435
    "float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
1436
    "write_imagef(output, (int2) (x, y), result); }"  , _context, _device);
1437
}
1438

    
1439
ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width)
1440
{
1441
    ////////////////////////////
1442
        char buffer[10240];
1443
        ostrstream out(buffer, 10240);
1444
    out <<  "#define KERNEL_WIDTH " << width << "\n"
1445
        <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" 
1446
            "#define BLOCK_WIDTH 128\n"
1447
            "#define BLOCK_HEIGHT 1\n"
1448
            "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1449
            "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1450
            "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1451
            "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1452
            "__kernel void filter_h(__read_only  image2d_t input, \n"
1453
            "          __write_only image2d_t output, int width_, int height_, \n"
1454
            "          __constant float* weight) {\n"
1455
            "__local float data[CACHE_WIDTH]; \n"
1456
            "int x = get_global_id(0), y = get_global_id(1);\n"
1457
            "#pragma unroll\n"
1458
                "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1459
                "{\n"
1460
                    "    if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
1461
                    "    {\n"
1462
                        "        int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
1463
            "        data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
1464
                    "    }\n"
1465
                "}\n"
1466
            "barrier(CLK_LOCAL_MEM_FENCE); \n"
1467
            "if( x > width_ || y > height_) return; \n"
1468
            "float result = 0; \n"
1469
            "#pragma unroll\n"
1470
            "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1471
            "{\n"
1472
            "   result += data[get_local_id(0) + i] * weight[i];\n"
1473
            "}\n"
1474
         << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1475
        return new ProgramCL("filter_h", buffer, _context, _device); 
1476
}
1477

    
1478

    
1479

    
1480
ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width)
1481
{
1482
    ////////////////////////////
1483
        char buffer[10240];
1484
        ostrstream out(buffer, 10240);
1485
    out <<  "#define KERNEL_WIDTH " << width << "\n"
1486
        <<  "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" 
1487
            "#define BLOCK_WIDTH 128\n"
1488
            "#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
1489
            "#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
1490
            "#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
1491
            "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
1492
            "           CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
1493
            "__kernel void filter_v(__read_only  image2d_t input, \n"
1494
            "          __write_only image2d_t output, int width_, int height_, \n"
1495
            "          __constant float* weight) {\n"
1496
            "__local float data[CACHE_WIDTH]; \n"
1497
            "int x = get_global_id(0), y = get_global_id(1);\n"
1498
            "#pragma unroll\n"
1499
                "for(int j = 0; j < CACHE_COUNT; ++j)\n"
1500
                "{\n"
1501
                    "    if(get_local_id(1) + j * BLOCK_WIDTH  < CACHE_WIDTH)\n"
1502
                    "    {\n"
1503
                        "        int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
1504
            "        data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
1505
                    "    }\n"
1506
                "}\n"
1507
            "barrier(CLK_LOCAL_MEM_FENCE); \n"
1508
            "if( x > width_ || y > height_) return; \n"
1509
            "float result = 0; \n"
1510
            "#pragma unroll\n"
1511
            "for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
1512
            "{\n"
1513
            "   result += data[get_local_id(1) + i] * weight[i];\n"
1514
            "}\n"
1515
         << "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0';
1516
        
1517
        return new ProgramCL("filter_v", buffer, _context, _device); 
1518
}
1519

    
1520
FilterCL*  ProgramBagCLN::CreateFilter(float kernel[], int width)
1521
{
1522
    FilterCL * filter = new FilterCL;
1523
    filter->s_shader_h = CreateFilterH(kernel, width); 
1524
    filter->s_shader_v = CreateFilterV(kernel, width);
1525
    filter->_weight = new CLTexImage(_context, _queue);
1526
    filter->_weight->InitBufferTex(width, 1, 1);
1527
    filter->_weight->CopyFromHost(kernel);
1528
    filter->_size = width; 
1529
    return filter;
1530
}
1531

    
1532

    
1533
void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
1534
{
1535
    cl_kernel kernelh = filter->s_shader_h->_kernel;
1536
    cl_kernel kernelv = filter->s_shader_v->_kernel;
1537
    //////////////////////////////////////////////////////////////////
1538

    
1539
    cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight();
1540
    cl_mem weight = (cl_mem) filter->_weight->_clData;
1541
    cl_int w_ = w - 1, h_ = h - 1; 
1542

    
1543

    
1544
    clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData);
1545
    clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData);
1546
    clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_);
1547
    clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_);
1548
    clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight);
1549

    
1550
    size_t dim00 = 128, dim01 = 1;
1551
    size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01};
1552
    status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL);
1553
    CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
1554
    if(status != CL_SUCCESS) return;
1555

    
1556

    
1557
    clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData);
1558
    clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData);
1559
    clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_);
1560
    clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_);
1561
    clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight); 
1562

    
1563
    size_t dim10 = 1, dim11 = 128;
1564
    size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11};
1565
    status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL);
1566
    CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
1567
    //clReleaseEvent(event);
1568
}
1569

    
1570
void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale)
1571
{
1572
    cl_kernel  kernel; 
1573
    cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); 
1574

    
1575
    cl_int fullstep = (1 << log_scale);
1576
    kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
1577
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData));
1578
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData));
1579
    clSetKernelArg(kernel, 2, sizeof(cl_int), &(w));
1580
    clSetKernelArg(kernel, 3, sizeof(cl_int), &(h));
1581
    if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep));
1582

    
1583
    size_t dim0 = 128, dim1 = 1;
1584
    //while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2; 
1585
    size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1};
1586
    cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL);
1587
    CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
1588
}
1589

    
1590

    
1591
#endif
1592