root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCL.cpp @ 9240aaa3
History | View | Annotate | Download (63.2 KB)
1 |
//////////////////////////////////////////////////////////////////////////////
|
---|---|
2 |
// File: ProgramCL.cpp
|
3 |
// Author: Changchang Wu
|
4 |
// Description : implementation of CL related class.
|
5 |
// class ProgramCL A simple wrapper of Cg programs
|
6 |
//
|
7 |
// Copyright (c) 2007 University of North Carolina at Chapel Hill
|
8 |
// All Rights Reserved
|
9 |
//
|
10 |
// Permission to use, copy, modify and distribute this software and its
|
11 |
// documentation for educational, research and non-profit purposes, without
|
12 |
// fee, and without a written agreement is hereby granted, provided that the
|
13 |
// above copyright notice and the following paragraph appear in all copies.
|
14 |
//
|
15 |
// The University of North Carolina at Chapel Hill make no representations
|
16 |
// about the suitability of this software for any purpose. It is provided
|
17 |
// 'as is' without express or implied warranty.
|
18 |
//
|
19 |
// Please send BUG REPORTS to ccwu@cs.unc.edu
|
20 |
//
|
21 |
////////////////////////////////////////////////////////////////////////////
|
22 |
|
23 |
#if defined(CL_SIFTGPU_ENABLED)
|
24 |
|
25 |
#include <CL/opencl.h> |
26 |
#include <GL/glew.h> |
27 |
|
28 |
#include <iostream> |
29 |
#include <iomanip> |
30 |
#include <vector> |
31 |
#include <strstream> |
32 |
#include <algorithm> |
33 |
#include <stdlib.h> |
34 |
#include <math.h> |
35 |
#include <string.h> |
36 |
using namespace std; |
37 |
|
38 |
#include "GlobalUtil.h" |
39 |
#include "CLTexImage.h" |
40 |
#include "ProgramCL.h" |
41 |
#include "SiftGPU.h" |
42 |
|
43 |
|
44 |
#if defined(_WIN32)
|
45 |
#pragma comment (lib, "OpenCL.lib") |
46 |
#endif
|
47 |
|
48 |
#ifndef _INC_WINDOWS
|
49 |
#ifndef WIN32_LEAN_AND_MEAN
|
50 |
#define WIN32_LEAN_AND_MEAN
|
51 |
#endif
|
52 |
#include <windows.h> |
53 |
#endif
|
54 |
|
55 |
//////////////////////////////////////////////////////////////////////
|
56 |
// Construction/Destruction
|
57 |
//////////////////////////////////////////////////////////////////////
|
58 |
|
59 |
ProgramCL::ProgramCL() |
60 |
{ |
61 |
_program = NULL;
|
62 |
_kernel = NULL;
|
63 |
_valid = 0;
|
64 |
} |
65 |
|
66 |
ProgramCL::~ProgramCL() |
67 |
{ |
68 |
if(_kernel) clReleaseKernel(_kernel);
|
69 |
if(_program) clReleaseProgram(_program);
|
70 |
} |
71 |
|
72 |
ProgramCL::ProgramCL(const char* name, const char * code, cl_context context, cl_device_id device) : _valid(1) |
73 |
{ |
74 |
const char * src[1] = {code}; cl_int status; |
75 |
|
76 |
_program = clCreateProgramWithSource(context, 1, src, NULL, &status); |
77 |
if(status != CL_SUCCESS) _valid = 0; |
78 |
|
79 |
status = clBuildProgram(_program, 0, NULL, |
80 |
GlobalUtil::_debug ? |
81 |
"-cl-fast-relaxed-math -cl-single-precision-constant -cl-nv-verbose" :
|
82 |
"-cl-fast-relaxed-math -cl-single-precision-constant", NULL, NULL); |
83 |
|
84 |
if(status != CL_SUCCESS) {PrintBuildLog(device, 1); _valid = 0;} |
85 |
else if(GlobalUtil::_debug) PrintBuildLog(device, 0); |
86 |
|
87 |
_kernel = clCreateKernel(_program, name, &status); |
88 |
if(status != CL_SUCCESS) _valid = 0; |
89 |
} |
90 |
|
91 |
void ProgramCL::PrintBuildLog(cl_device_id device, int all) |
92 |
{ |
93 |
char buffer[10240] = "\0"; |
94 |
cl_int status = clGetProgramBuildInfo( |
95 |
_program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); |
96 |
if(all )
|
97 |
{ |
98 |
std::cerr << buffer << endl; |
99 |
}else
|
100 |
{ |
101 |
const char * pos = strstr(buffer, "ptxas"); |
102 |
if(pos) std::cerr << pos << endl;
|
103 |
} |
104 |
} |
105 |
|
106 |
///////////////////////////////////////////////////////////////////////////////////
|
107 |
/////////////////////////////////PACKED VERSION?///////////////////////////////////
|
108 |
|
109 |
ProgramBagCL::ProgramBagCL() |
110 |
{ |
111 |
////////////////////////////////////
|
112 |
_context = NULL; _queue = NULL; |
113 |
s_gray = s_sampling = NULL;
|
114 |
s_packup = s_zero_pass = NULL;
|
115 |
s_gray_pack = s_unpack = NULL;
|
116 |
s_sampling_u = NULL;
|
117 |
s_dog_pass = NULL;
|
118 |
s_grad_pass = NULL;
|
119 |
s_grad_pass2 = NULL;
|
120 |
s_unpack_dog = NULL;
|
121 |
s_unpack_grd = NULL;
|
122 |
s_unpack_key = NULL;
|
123 |
s_keypoint = NULL;
|
124 |
f_gaussian_skip0 = NULL;
|
125 |
f_gaussian_skip1 = NULL;
|
126 |
f_gaussian_step = 0;
|
127 |
|
128 |
////////////////////////////////
|
129 |
GlobalUtil::StartTimer("Initialize OpenCL");
|
130 |
if(!InitializeContext()) return; |
131 |
GlobalUtil::StopTimer(); |
132 |
|
133 |
} |
134 |
|
135 |
|
136 |
|
137 |
ProgramBagCL::~ProgramBagCL() |
138 |
{ |
139 |
if(s_gray) delete s_gray; |
140 |
if(s_sampling) delete s_sampling; |
141 |
if(s_zero_pass) delete s_zero_pass; |
142 |
if(s_packup) delete s_packup; |
143 |
if(s_unpack) delete s_unpack; |
144 |
if(s_gray_pack) delete s_gray_pack; |
145 |
if(s_sampling_u) delete s_sampling_u; |
146 |
if(s_dog_pass) delete s_dog_pass; |
147 |
if(s_grad_pass) delete s_grad_pass; |
148 |
if(s_grad_pass2) delete s_grad_pass2; |
149 |
if(s_unpack_dog) delete s_unpack_dog; |
150 |
if(s_unpack_grd) delete s_unpack_grd; |
151 |
if(s_unpack_key) delete s_unpack_key; |
152 |
if(s_keypoint) delete s_keypoint; |
153 |
|
154 |
if(f_gaussian_skip1) delete f_gaussian_skip1; |
155 |
|
156 |
for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++) |
157 |
{ |
158 |
if(f_gaussian_skip0_v[i]) delete f_gaussian_skip0_v[i]; |
159 |
} |
160 |
if(f_gaussian_step && _gaussian_step_num > 0) |
161 |
{ |
162 |
for(int i = 0; i< _gaussian_step_num; i++) |
163 |
{ |
164 |
delete f_gaussian_step[i];
|
165 |
} |
166 |
delete[] f_gaussian_step;
|
167 |
} |
168 |
|
169 |
//////////////////////////////////////
|
170 |
if(_context) clReleaseContext(_context);
|
171 |
if(_queue) clReleaseCommandQueue(_queue);
|
172 |
} |
173 |
|
174 |
bool ProgramBagCL::InitializeContext()
|
175 |
{ |
176 |
cl_uint num_platform, num_device; |
177 |
cl_int status; |
178 |
// Get OpenCL platform count
|
179 |
status = clGetPlatformIDs (0, NULL, &num_platform); |
180 |
if (status != CL_SUCCESS || num_platform == 0) return false; |
181 |
|
182 |
cl_platform_id platforms[16];
|
183 |
if(num_platform > 16 ) num_platform = 16; |
184 |
status = clGetPlatformIDs (num_platform, platforms, NULL);
|
185 |
_platform = platforms[0];
|
186 |
|
187 |
///////////////////////////////
|
188 |
status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, 0, NULL, &num_device); |
189 |
if(status != CL_SUCCESS || num_device == 0) return false; |
190 |
|
191 |
// Create the device list
|
192 |
cl_device_id* devices = new cl_device_id [num_device];
|
193 |
status = clGetDeviceIDs(_platform, CL_DEVICE_TYPE_GPU, num_device, devices, NULL);
|
194 |
_device = (status == CL_SUCCESS? devices[0] : 0); delete[] devices; |
195 |
if(status != CL_SUCCESS) return false; |
196 |
|
197 |
|
198 |
if(GlobalUtil::_verbose)
|
199 |
{ |
200 |
cl_device_mem_cache_type is_gcache; |
201 |
clGetDeviceInfo(_device, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(is_gcache), &is_gcache, NULL); |
202 |
if(is_gcache == CL_NONE) std::cout << "No cache for global memory\n"; |
203 |
//else if(is_gcache == CL_READ_ONLY_CACHE) std::cout << "Read only cache for global memory\n";
|
204 |
//else std::cout << "Read/Write cache for global memory\n";
|
205 |
} |
206 |
|
207 |
//context;
|
208 |
if(GlobalUtil::_UseSiftGPUEX)
|
209 |
{ |
210 |
cl_context_properties prop[] = { |
211 |
CL_CONTEXT_PLATFORM, (cl_context_properties)_platform, |
212 |
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(), |
213 |
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(), 0 };
|
214 |
_context = clCreateContext(prop, 1, &_device, NULL, NULL, &status); |
215 |
if(status != CL_SUCCESS) return false; |
216 |
}else
|
217 |
{ |
218 |
_context = clCreateContext(0, 1, &_device, NULL, NULL, &status); |
219 |
if(status != CL_SUCCESS) return false; |
220 |
} |
221 |
|
222 |
//command queue
|
223 |
_queue = clCreateCommandQueue(_context, _device, 0, &status);
|
224 |
return status == CL_SUCCESS;
|
225 |
} |
226 |
|
227 |
void ProgramBagCL::InitProgramBag(SiftParam¶m)
|
228 |
{ |
229 |
GlobalUtil::StartTimer("Load Programs");
|
230 |
LoadFixedShaders(); |
231 |
LoadDynamicShaders(param); |
232 |
if(GlobalUtil::_UseSiftGPUEX) LoadDisplayShaders();
|
233 |
GlobalUtil::StopTimer(); |
234 |
} |
235 |
|
236 |
|
237 |
void ProgramBagCL::UnloadProgram()
|
238 |
{ |
239 |
|
240 |
} |
241 |
|
242 |
void ProgramBagCL::FinishCL()
|
243 |
{ |
244 |
clFinish(_queue); |
245 |
} |
246 |
|
247 |
void ProgramBagCL::LoadFixedShaders()
|
248 |
{ |
249 |
|
250 |
|
251 |
s_gray = new ProgramCL( "gray", |
252 |
"__kernel void gray(__read_only image2d_t input, __write_only image2d_t output) {\n"
|
253 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
254 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
|
255 |
"float4 weight = (float4)(0.299, 0.587, 0.114, 0.0);\n"
|
256 |
"float intensity = dot(weight, read_imagef(input,sampler, coord ));\n"
|
257 |
"float4 result= (float4)(intensity, intensity, intensity, 1.0);\n"
|
258 |
"write_imagef(output, coord, result); }", _context, _device );
|
259 |
|
260 |
|
261 |
s_sampling = new ProgramCL("sampling", |
262 |
"__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output,\n"
|
263 |
" int width, int height) {\n"
|
264 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
265 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
266 |
"if( x >= width || y >= height) return;\n"
|
267 |
"int xa = x + x, ya = y + y; \n"
|
268 |
"int xb = xa + 1, yb = ya + 1; \n"
|
269 |
"float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
|
270 |
"float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
|
271 |
"float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
|
272 |
"float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
|
273 |
"float4 result = (float4) (v1, v2, v3, v4);"
|
274 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
275 |
|
276 |
s_sampling_k = new ProgramCL("sampling_k", |
277 |
"__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
|
278 |
" int width, int height,\n"
|
279 |
" int step, int halfstep) {\n"
|
280 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
281 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
282 |
"if( x >= width || y >= height) return;\n"
|
283 |
"int xa = x * step, ya = y *step; \n"
|
284 |
"int xb = xa + halfstep, yb = ya + halfstep; \n"
|
285 |
"float v1 = read_imagef(input, sampler, (int2) (xa, ya)).x; \n"
|
286 |
"float v2 = read_imagef(input, sampler, (int2) (xb, ya)).x; \n"
|
287 |
"float v3 = read_imagef(input, sampler, (int2) (xa, yb)).x; \n"
|
288 |
"float v4 = read_imagef(input, sampler, (int2) (xb, yb)).x; \n"
|
289 |
"float4 result = (float4) (v1, v2, v3, v4);"
|
290 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
291 |
|
292 |
|
293 |
s_sampling_u = new ProgramCL("sampling_u", |
294 |
"__kernel void sampling_u(__read_only image2d_t input, \n"
|
295 |
" __write_only image2d_t output,\n"
|
296 |
" int width, int height,\n"
|
297 |
" float step, float halfstep) {\n"
|
298 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
|
299 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
300 |
"if( x >= width || y >= height) return;\n"
|
301 |
"float xa = x * step, ya = y *step; \n"
|
302 |
"float xb = xa + halfstep, yb = ya + halfstep; \n"
|
303 |
"float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
|
304 |
"float v2 = read_imagef(input, sampler, (float2) (xb, ya)).x; \n"
|
305 |
"float v3 = read_imagef(input, sampler, (float2) (xa, yb)).x; \n"
|
306 |
"float v4 = read_imagef(input, sampler, (float2) (xb, yb)).x; \n"
|
307 |
"float4 result = (float4) (v1, v2, v3, v4);"
|
308 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
309 |
|
310 |
|
311 |
s_zero_pass = new ProgramCL("zero_pass", |
312 |
"__kernel void zero_pass(__write_only image2d_t output){\n"
|
313 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
|
314 |
"write_imagef(output, coord, (float4)(0.0));}", _context, _device);
|
315 |
|
316 |
s_packup = new ProgramCL("packup", |
317 |
"__kernel void packup(__global float* input, __write_only image2d_t output,\n"
|
318 |
" int twidth, int theight, int width){\n"
|
319 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1));\n"
|
320 |
"if(coord.x >= twidth || coord.y >= theight) return;\n"
|
321 |
"int index0 = (coord.y + coord.y) * width; \n"
|
322 |
"int index1 = index0 + coord.x;\n"
|
323 |
"int x2 = min(width -1, coord.x); \n"
|
324 |
"float v1 = input[index1 + coord.x], v2 = input[index1 + x2]; \n"
|
325 |
"int index2 = index1 + width; \n"
|
326 |
"float v3 = input[index2 + coord.x], v4 = input[index2 + x2]; \n "
|
327 |
"write_imagef(output, coord, (float4) (v1, v2, v3, v4));}", _context, _device);
|
328 |
|
329 |
s_dog_pass = new ProgramCL("dog_pass", |
330 |
"__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
331 |
" __write_only image2d_t dog, int width, int height) {\n"
|
332 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
|
333 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
334 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
|
335 |
"if( coord.x >= width || coord.y >= height) return;\n"
|
336 |
"float4 cc = read_imagef(tex , sampler, coord); \n"
|
337 |
"float4 cp = read_imagef(texp, sampler, coord);\n"
|
338 |
"write_imagef(dog, coord, cc - cp); }\n", _context, _device);
|
339 |
|
340 |
s_grad_pass = new ProgramCL("grad_pass", |
341 |
"__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
342 |
" __write_only image2d_t dog, int width, int height,\n"
|
343 |
" __write_only image2d_t grad, __write_only image2d_t rot) {\n"
|
344 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
|
345 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
346 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
347 |
"if( x >= width || y >= height) return;\n"
|
348 |
"int2 coord = (int2) (x, y);\n"
|
349 |
"float4 cc = read_imagef(tex , sampler, coord); \n"
|
350 |
"float4 cp = read_imagef(texp, sampler, coord);\n"
|
351 |
"float2 cl = read_imagef(tex, sampler, (int2)(x - 1, y)).yw;\n"
|
352 |
"float2 cr = read_imagef(tex, sampler, (int2)(x + 1, y)).xz;\n"
|
353 |
"float2 cd = read_imagef(tex, sampler, (int2)(x, y - 1)).zw;\n"
|
354 |
"float2 cu = read_imagef(tex, sampler, (int2)(x, y + 1)).xy;\n"
|
355 |
"write_imagef(dog, coord, cc - cp); \n"
|
356 |
"float4 dx = (float4)(cc.y - cl.x, cr.x - cc.x, cc.w - cl.y, cr.y - cc.z);\n"
|
357 |
"float4 dy = (float4)(cc.zw - cd.xy, cu.xy - cc.xy);\n"
|
358 |
"write_imagef(grad, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
|
359 |
"write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
|
360 |
|
361 |
s_grad_pass2 = new ProgramCL("grad_pass2", |
362 |
"#define BLOCK_DIMX 32\n"
|
363 |
"#define BLOCK_DIMY 14\n"
|
364 |
"#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
|
365 |
"__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
366 |
" __write_only image2d_t dog, int width, int height,\n"
|
367 |
" __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n" |
368 |
"__local float block[BLOCK_SIZE * 4]; \n"
|
369 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
|
370 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
371 |
"int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
|
372 |
" get_global_id(1) - get_group_id(1) * 2- 1); \n"
|
373 |
"int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
|
374 |
"float4 cc = read_imagef(tex, sampler, coord);\n"
|
375 |
"block[idx ] = cc.x;\n"
|
376 |
"block[idx + BLOCK_SIZE ] = cc.y;\n"
|
377 |
"block[idx + BLOCK_SIZE * 2] = cc.z;\n"
|
378 |
"block[idx + BLOCK_SIZE * 3] = cc.w;\n"
|
379 |
"barrier(CLK_LOCAL_MEM_FENCE);\n"
|
380 |
"if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
|
381 |
"if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
|
382 |
"if( coord.x >= width) return; \n"
|
383 |
"if( coord.y >= height) return;\n"
|
384 |
"float4 cp = read_imagef(texp, sampler, coord);\n"
|
385 |
"float4 dx = (float4)( cc.y - block[idx - 1 + BLOCK_SIZE], \n"
|
386 |
" block[idx + 1] - cc.x, \n"
|
387 |
" cc.w - block[idx - 1 + 3 * BLOCK_SIZE], \n"
|
388 |
" block[idx + 1 + 2 * BLOCK_SIZE] - cc.z);\n"
|
389 |
"float4 dy = (float4)( cc.z - block[idx - BLOCK_DIMX + 2 * BLOCK_SIZE], \n"
|
390 |
" cc.w - block[idx - BLOCK_DIMX + 3 * BLOCK_SIZE],"
|
391 |
//" cc.zw - block[idx - BLOCK_DIMX].zw, \n"
|
392 |
" block[idx + BLOCK_DIMX] - cc.x,\n "
|
393 |
" block[idx + BLOCK_DIMX + BLOCK_SIZE] - cc.y);\n"
|
394 |
//" block[idx + BLOCK_DIMX].xy - cc.xy);\n"
|
395 |
"write_imagef(dog, coord, cc - cp); \n"
|
396 |
"write_imagef(grd, coord, 0.5 * sqrt(dx*dx + dy * dy));\n"
|
397 |
"write_imagef(rot, coord, atan2(dy, dx + (float4) (FLT_MIN)));}\n", _context, _device);
|
398 |
} |
399 |
|
400 |
void ProgramBagCL::LoadDynamicShaders(SiftParam& param)
|
401 |
{ |
402 |
LoadKeypointShader(); |
403 |
LoadGenListShader(param._dog_level_num, 0);
|
404 |
CreateGaussianFilters(param); |
405 |
} |
406 |
|
407 |
|
408 |
void ProgramBagCL::SelectInitialSmoothingFilter(int octave_min, SiftParam¶m) |
409 |
{ |
410 |
float sigma = param.GetInitialSmoothSigma(octave_min);
|
411 |
if(sigma == 0) |
412 |
{ |
413 |
f_gaussian_skip0 = NULL;
|
414 |
}else
|
415 |
{ |
416 |
for(unsigned int i = 0; i < f_gaussian_skip0_v.size(); i++) |
417 |
{ |
418 |
if(f_gaussian_skip0_v[i]->_id == octave_min)
|
419 |
{ |
420 |
f_gaussian_skip0 = f_gaussian_skip0_v[i]; |
421 |
return ;
|
422 |
} |
423 |
} |
424 |
FilterCL * filter = CreateGaussianFilter(sigma); |
425 |
filter->_id = octave_min; |
426 |
f_gaussian_skip0_v.push_back(filter); |
427 |
f_gaussian_skip0 = filter; |
428 |
} |
429 |
|
430 |
} |
431 |
|
432 |
void ProgramBagCL::CreateGaussianFilters(SiftParam¶m)
|
433 |
{ |
434 |
if(param._sigma_skip0>0.0f) |
435 |
{ |
436 |
f_gaussian_skip0 = CreateGaussianFilter(param._sigma_skip0); |
437 |
f_gaussian_skip0->_id = GlobalUtil::_octave_min_default; |
438 |
f_gaussian_skip0_v.push_back(f_gaussian_skip0); |
439 |
} |
440 |
if(param._sigma_skip1>0.0f) |
441 |
{ |
442 |
f_gaussian_skip1 = CreateGaussianFilter(param._sigma_skip1); |
443 |
} |
444 |
|
445 |
f_gaussian_step = new FilterCL*[param._sigma_num];
|
446 |
for(int i = 0; i< param._sigma_num; i++) |
447 |
{ |
448 |
f_gaussian_step[i] = CreateGaussianFilter(param._sigma[i]); |
449 |
} |
450 |
_gaussian_step_num = param._sigma_num; |
451 |
} |
452 |
|
453 |
|
454 |
FilterCL* ProgramBagCL::CreateGaussianFilter(float sigma)
|
455 |
{ |
456 |
//pixel inside 3*sigma box
|
457 |
int sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;// |
458 |
int width = 2*sz + 1; |
459 |
|
460 |
//filter size truncation
|
461 |
if(GlobalUtil::_MaxFilterWidth >0 && width > GlobalUtil::_MaxFilterWidth) |
462 |
{ |
463 |
std::cout<<"Filter size truncated from "<<width<<" to "<<GlobalUtil::_MaxFilterWidth<<endl; |
464 |
sz = GlobalUtil::_MaxFilterWidth>>1;
|
465 |
width = 2 * sz + 1; |
466 |
} |
467 |
|
468 |
int i;
|
469 |
float * kernel = new float[width]; |
470 |
float rv = 1.0f/(sigma*sigma); |
471 |
float v, ksum =0; |
472 |
|
473 |
// pre-compute filter
|
474 |
for( i = -sz ; i <= sz ; ++i)
|
475 |
{ |
476 |
kernel[i+sz] = v = exp(-0.5f * i * i *rv) ; |
477 |
ksum += v; |
478 |
} |
479 |
|
480 |
//normalize the kernel
|
481 |
rv = 1.0f / ksum; |
482 |
for(i = 0; i< width ;i++) kernel[i]*=rv; |
483 |
|
484 |
FilterCL * filter = CreateFilter(kernel, width); |
485 |
delete [] kernel;
|
486 |
if(GlobalUtil::_verbose && GlobalUtil::_timingL) std::cout<<"Filter: sigma = "<<sigma<<", size = "<<width<<"x"<<width<<endl; |
487 |
return filter;
|
488 |
} |
489 |
|
490 |
FilterCL* ProgramBagCL::CreateFilter(float kernel[], int width) |
491 |
{ |
492 |
FilterCL * filter = new FilterCL;
|
493 |
filter->s_shader_h = CreateFilterH(kernel, width); |
494 |
filter->s_shader_v = CreateFilterV(kernel, width); |
495 |
filter->_size = width; |
496 |
filter->_id = 0;
|
497 |
return filter;
|
498 |
} |
499 |
|
500 |
ProgramCL* ProgramBagCL::CreateFilterH(float kernel[], int width) |
501 |
{ |
502 |
int halfwidth = width >>1; |
503 |
float * pf = kernel + halfwidth;
|
504 |
int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up |
505 |
int npixel = (nhpixel<<1)+1;// |
506 |
float weight[3]; |
507 |
|
508 |
////////////////////////////
|
509 |
char buffer[10240]; |
510 |
ostrstream out(buffer, 10240);
|
511 |
out<<setprecision(8);
|
512 |
|
513 |
|
514 |
//CL_DEVICE_IMAGE2D_MAX_WIDTH
|
515 |
out<< |
516 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
|
517 |
"__kernel void filter_h(__read_only image2d_t input, \n"
|
518 |
" __write_only image2d_t output, int width_, int height_) {\n"
|
519 |
"int x = get_global_id(0);\n"
|
520 |
"int y = get_global_id(1); \n"
|
521 |
"if( x > width_ || y > height_) return; \n"
|
522 |
"float4 pc; int2 coord; \n"
|
523 |
"float4 result = (float4)(0.0);\n";
|
524 |
for(int i = 0 ; i < npixel ; i++) |
525 |
{ |
526 |
out<<"coord = (int2)(x + ("<< (i - nhpixel) << "), y);\n"; |
527 |
out<<"pc= read_imagef(input, sampler, coord);\n";
|
528 |
if(GlobalUtil::_PreciseBorder)
|
529 |
out<<"if(coord.x < 0) pc = pc.xxzz; else if (coord.x > width_) pc = pc.yyww; \n";
|
530 |
//for each sub-pixel j in center, the weight of sub-pixel k
|
531 |
int xw = (i - nhpixel)*2; |
532 |
for(int j = 0; j < 3; j++) |
533 |
{ |
534 |
int xwn = xw + j -1; |
535 |
weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
|
536 |
} |
537 |
if(weight[1] == 0.0) |
538 |
{ |
539 |
out<<"result += (float4)("<<weight[2]<<","<<weight[0]<<","<<weight[2]<<","<<weight[0]<<") * pc.yxwz;\n"; |
540 |
} |
541 |
else
|
542 |
{ |
543 |
out<<"result += (float4)("<<weight[1]<<", "<<weight[0]<<", "<<weight[1]<<", "<<weight[0]<<") * pc.xxzz;\n"; |
544 |
out<<"result += (float4)("<<weight[2]<<", "<<weight[1]<<", "<<weight[2]<<", "<<weight[1]<<") * pc.yyww;\n"; |
545 |
} |
546 |
} |
547 |
out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0'; |
548 |
return new ProgramCL("filter_h", buffer, _context, _device); |
549 |
} |
550 |
|
551 |
|
552 |
|
553 |
ProgramCL* ProgramBagCL::CreateFilterV(float kernel[], int width) |
554 |
{ |
555 |
|
556 |
int halfwidth = width >>1; |
557 |
float * pf = kernel + halfwidth;
|
558 |
int nhpixel = (halfwidth+1)>>1; //how many neighbour pixels need to be looked up |
559 |
int npixel = (nhpixel<<1)+1;// |
560 |
float weight[3]; |
561 |
|
562 |
////////////////////////////
|
563 |
char buffer[10240]; |
564 |
ostrstream out(buffer, 10240);
|
565 |
out<<setprecision(8);
|
566 |
|
567 |
|
568 |
//CL_DEVICE_IMAGE2D_MAX_WIDTH
|
569 |
out<< |
570 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
|
571 |
"__kernel void filter_v(__read_only image2d_t input, \n"
|
572 |
" __write_only image2d_t output, int width_, int height_) {\n"
|
573 |
"int x = get_global_id(0);\n"
|
574 |
"int y = get_global_id(1); \n"
|
575 |
"if( x > width_ || y >= height_) return; \n"
|
576 |
"float4 pc; int2 coord; \n"
|
577 |
"float4 result = (float4)(0.0);\n";
|
578 |
for(int i = 0 ; i < npixel ; i++) |
579 |
{ |
580 |
out<<"coord = (int2)(x, y + ("<< (i - nhpixel) << "));\n"; |
581 |
out<<"pc= read_imagef(input, sampler, coord);\n";
|
582 |
if(GlobalUtil::_PreciseBorder)
|
583 |
out<<"if(coord.y < 0) pc = pc.xyxy; else if (coord.y > height_) pc = pc.zwzw; \n";
|
584 |
//for each sub-pixel j in center, the weight of sub-pixel k
|
585 |
int xw = (i - nhpixel)*2; |
586 |
for(int j = 0; j < 3; j++) |
587 |
{ |
588 |
int xwn = xw + j -1; |
589 |
weight[j] = xwn < -halfwidth || xwn > halfwidth? 0 : pf[xwn];
|
590 |
} |
591 |
if(weight[1] == 0.0) |
592 |
{ |
593 |
out<<"result += (float4)("<<weight[2]<<","<<weight[2]<<","<<weight[0]<<","<<weight[0]<<") * pc.zwxy;\n"; |
594 |
} |
595 |
else
|
596 |
{ |
597 |
out<<"result += (float4)("<<weight[1]<<", "<<weight[1]<<", "<<weight[0]<<", "<<weight[0]<<") * pc.xyxy;\n"; |
598 |
out<<"result += (float4)("<<weight[2]<<", "<<weight[2]<<", "<<weight[1]<<", "<<weight[1]<<") * pc.zwzw;\n"; |
599 |
} |
600 |
} |
601 |
out << "write_imagef(output, (int2)(x, y), result); }\n" << '\0'; |
602 |
return new ProgramCL("filter_v", buffer, _context, _device); |
603 |
|
604 |
} |
605 |
|
606 |
void ProgramBagCL::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
|
607 |
{ |
608 |
cl_kernel kernelh = filter->s_shader_h->_kernel; |
609 |
cl_kernel kernelv = filter->s_shader_v->_kernel; |
610 |
//////////////////////////////////////////////////////////////////
|
611 |
|
612 |
cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
613 |
cl_int w_ = w - 1, h_ = h - 1; |
614 |
|
615 |
size_t dim0 = 16, dim1 = 16; |
616 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
617 |
|
618 |
clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData); |
619 |
clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData); |
620 |
clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_); |
621 |
clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_); |
622 |
status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz, lsz, 0, NULL, NULL); |
623 |
CheckErrorCL(status, "ProgramBagCL::FilterImageH");
|
624 |
if(status != CL_SUCCESS) return; |
625 |
|
626 |
clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData); |
627 |
clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData); |
628 |
clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_); |
629 |
clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_); |
630 |
size_t gsz2[2] = {(w + dim1 - 1) / dim1 * dim1, (h + dim0 - 1) / dim0 * dim0}, lsz2[2] = {dim1, dim0}; |
631 |
status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL); |
632 |
CheckErrorCL(status, "ProgramBagCL::FilterImageV");
|
633 |
//clReleaseEvent(event);
|
634 |
} |
635 |
|
636 |
void ProgramBagCL::SampleImageU(CLTexImage *dst, CLTexImage *src, int log_scale) |
637 |
{ |
638 |
cl_kernel kernel= s_sampling_u->_kernel; |
639 |
float scale = 1.0f / (1 << log_scale); |
640 |
float offset = scale * 0.5f; |
641 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
642 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
643 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
644 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
645 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
646 |
clSetKernelArg(kernel, 4, sizeof(cl_float), &(scale)); |
647 |
clSetKernelArg(kernel, 5, sizeof(cl_float), &(offset)); |
648 |
|
649 |
size_t dim0 = 16, dim1 = 16; |
650 |
//while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
|
651 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
652 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
653 |
CheckErrorCL(status, "ProgramBagCL::SampleImageU");
|
654 |
} |
655 |
|
656 |
void ProgramBagCL::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale) |
657 |
{ |
658 |
cl_kernel kernel; |
659 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
660 |
if(log_scale == 1) |
661 |
{ |
662 |
kernel = s_sampling->_kernel; |
663 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
664 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
665 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
666 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
667 |
}else
|
668 |
{ |
669 |
cl_int fullstep = (1 << log_scale);
|
670 |
cl_int halfstep = fullstep >> 1;
|
671 |
kernel = s_sampling_k->_kernel; |
672 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
673 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
674 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
675 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
676 |
clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep)); |
677 |
clSetKernelArg(kernel, 5, sizeof(cl_int), &(halfstep)); |
678 |
} |
679 |
size_t dim0 = 128, dim1 = 1; |
680 |
//while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
|
681 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
682 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
683 |
CheckErrorCL(status, "ProgramBagCL::SampleImageD");
|
684 |
} |
685 |
|
686 |
void ProgramBagCL::FilterInitialImage(CLTexImage* tex, CLTexImage* buf)
|
687 |
{ |
688 |
if(f_gaussian_skip0) FilterImage(f_gaussian_skip0, tex, tex, buf);
|
689 |
} |
690 |
|
691 |
void ProgramBagCL::FilterSampledImage(CLTexImage* tex, CLTexImage* buf)
|
692 |
{ |
693 |
if(f_gaussian_skip1) FilterImage(f_gaussian_skip1, tex, tex, buf);
|
694 |
} |
695 |
|
696 |
void ProgramBagCL::ComputeDOG(CLTexImage*tex, CLTexImage* texp, CLTexImage* dog, CLTexImage* grad, CLTexImage* rot)
|
697 |
{ |
698 |
int margin = 0, use_gm2 = 1; |
699 |
bool both_grad_dog = rot->_clData && grad->_clData;
|
700 |
cl_int w = tex->GetImgWidth(), h = tex->GetImgHeight(); |
701 |
cl_kernel kernel ; size_t dim0, dim1; |
702 |
if(!both_grad_dog) {kernel = s_dog_pass->_kernel; dim0 = 16; dim1 = 12; } |
703 |
else if(use_gm2) {kernel = s_grad_pass2->_kernel; dim0 = 32; dim1 = 14; margin = 2; } |
704 |
else {kernel = s_grad_pass->_kernel; dim0 = 16; dim1 = 20; } |
705 |
size_t gsz[2] = { (w + dim0 - 1 - margin) / (dim0 - margin) * dim0, |
706 |
(h + dim1 - 1 - margin) / (dim1 - margin) * dim1};
|
707 |
size_t lsz[2] = {dim0, dim1};
|
708 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(tex->_clData)); |
709 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(texp->_clData)); |
710 |
clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dog->_clData)); |
711 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(w)); |
712 |
clSetKernelArg(kernel, 4, sizeof(cl_int), &(h)); |
713 |
if(both_grad_dog)
|
714 |
{ |
715 |
clSetKernelArg(kernel, 5, sizeof(cl_mem), &(grad->_clData)); |
716 |
clSetKernelArg(kernel, 6, sizeof(cl_mem), &(rot->_clData)); |
717 |
} |
718 |
///////////////////////////////////////////////////////
|
719 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
720 |
CheckErrorCL(status, "ProgramBagCL::ComputeDOG");
|
721 |
} |
722 |
|
723 |
|
724 |
void ProgramBagCL::ComputeKEY(CLTexImage*dog, CLTexImage* key, float Tdog, float Tedge) |
725 |
{ |
726 |
cl_kernel kernel = s_keypoint->_kernel; |
727 |
cl_int w = key->GetImgWidth(), h = key->GetImgHeight(); |
728 |
float threshold0 = Tdog* (GlobalUtil::_SubpixelLocalization?0.8f:1.0f); |
729 |
float threshold1 = Tdog;
|
730 |
float threshold2 = (Tedge+1)*(Tedge+1)/Tedge; |
731 |
|
732 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData)); |
733 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &((dog + 1)->_clData)); |
734 |
clSetKernelArg(kernel, 2, sizeof(cl_mem), &((dog - 1)->_clData)); |
735 |
clSetKernelArg(kernel, 3, sizeof(cl_mem), &(key->_clData)); |
736 |
clSetKernelArg(kernel, 4, sizeof(cl_float), &(threshold0)); |
737 |
clSetKernelArg(kernel, 5, sizeof(cl_float), &(threshold1)); |
738 |
clSetKernelArg(kernel, 6, sizeof(cl_float), &(threshold2)); |
739 |
clSetKernelArg(kernel, 7, sizeof(cl_int), &(w)); |
740 |
clSetKernelArg(kernel, 8, sizeof(cl_int), &(h)); |
741 |
|
742 |
size_t dim0 = 8, dim1 = 8; |
743 |
//if( w * h / dim0 / dim1 < 16) dim1 /= 2;
|
744 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
745 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
746 |
CheckErrorCL(status, "ProgramBagCL::ComputeKEY");
|
747 |
} |
748 |
|
749 |
void ProgramBagCL::UnpackImage(CLTexImage*src, CLTexImage* dst)
|
750 |
{ |
751 |
cl_kernel kernel = s_unpack->_kernel; |
752 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
753 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
754 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
755 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
756 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
757 |
const size_t dim0 = 16, dim1 = 16; |
758 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
759 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
760 |
|
761 |
CheckErrorCL(status, "ProgramBagCL::UnpackImage");
|
762 |
FinishCL(); |
763 |
|
764 |
} |
765 |
|
766 |
void ProgramBagCL::UnpackImageDOG(CLTexImage*src, CLTexImage* dst)
|
767 |
{ |
768 |
if(s_unpack_dog == NULL) return; |
769 |
cl_kernel kernel = s_unpack_dog->_kernel; |
770 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
771 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
772 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
773 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
774 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
775 |
const size_t dim0 = 16, dim1 = 16; |
776 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
777 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
778 |
|
779 |
CheckErrorCL(status, "ProgramBagCL::UnpackImage");
|
780 |
FinishCL(); |
781 |
} |
782 |
|
783 |
void ProgramBagCL::UnpackImageGRD(CLTexImage*src, CLTexImage* dst)
|
784 |
{ |
785 |
if(s_unpack_grd == NULL) return; |
786 |
cl_kernel kernel = s_unpack_grd->_kernel; |
787 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
788 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
789 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
790 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
791 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
792 |
const size_t dim0 = 16, dim1 = 16; |
793 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
794 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
795 |
|
796 |
CheckErrorCL(status, "ProgramBagCL::UnpackImage");
|
797 |
FinishCL(); |
798 |
} |
799 |
void ProgramBagCL::UnpackImageKEY(CLTexImage*src, CLTexImage* dog, CLTexImage* dst)
|
800 |
{ |
801 |
if(s_unpack_key == NULL) return; |
802 |
cl_kernel kernel = s_unpack_key->_kernel; |
803 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
804 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(dog->_clData)); |
805 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(src->_clData)); |
806 |
clSetKernelArg(kernel, 2, sizeof(cl_mem), &(dst->_clData)); |
807 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(w)); |
808 |
clSetKernelArg(kernel, 4, sizeof(cl_int), &(h)); |
809 |
const size_t dim0 = 16, dim1 = 16; |
810 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
811 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
812 |
|
813 |
CheckErrorCL(status, "ProgramBagCL::UnpackImageKEY");
|
814 |
FinishCL(); |
815 |
} |
816 |
void ProgramBagCL::LoadDescriptorShader()
|
817 |
{ |
818 |
GlobalUtil::_DescriptorPPT = 16;
|
819 |
LoadDescriptorShaderF2(); |
820 |
} |
821 |
|
822 |
void ProgramBagCL::LoadDescriptorShaderF2()
|
823 |
{ |
824 |
|
825 |
} |
826 |
|
827 |
void ProgramBagCL::LoadOrientationShader(void) |
828 |
{ |
829 |
|
830 |
} |
831 |
|
832 |
void ProgramBagCL::LoadGenListShader(int ndoglev,int nlev) |
833 |
{ |
834 |
|
835 |
} |
836 |
|
837 |
void ProgramBagCL::LoadKeypointShader()
|
838 |
{ |
839 |
int i; char buffer[20240]; |
840 |
ostrstream out(buffer, 20240);
|
841 |
streampos pos; |
842 |
|
843 |
//tex(X)(Y)
|
844 |
//X: (CLR) (CENTER 0, LEFT -1, RIGHT +1)
|
845 |
//Y: (CDU) (CENTER 0, DOWN -1, UP +1)
|
846 |
out<< |
847 |
"__kernel void keypoint(__read_only image2d_t tex, __read_only image2d_t texU,\n"
|
848 |
" __read_only image2d_t texD, __write_only image2d_t texK,\n"
|
849 |
" float THRESHOLD0, float THRESHOLD1, \n"
|
850 |
" float THRESHOLD2, int width, int height)\n"
|
851 |
"{\n"
|
852 |
" sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
|
853 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;"
|
854 |
" int x = get_global_id(0), y = get_global_id(1);\n"
|
855 |
" if(x >= width || y >= height) return; \n"
|
856 |
" int xp = x - 1, xn = x + 1;\n"
|
857 |
" int yp = y - 1, yn = y + 1;\n"
|
858 |
" int2 coord0 = (int2) (x, y); \n"
|
859 |
" int2 coord1 = (int2) (xp, y); \n"
|
860 |
" int2 coord2 = (int2) (xn, y); \n"
|
861 |
" int2 coord3 = (int2) (x, yp); \n"
|
862 |
" int2 coord4 = (int2) (x, yn); \n"
|
863 |
" int2 coord5 = (int2) (xp, yp); \n"
|
864 |
" int2 coord6 = (int2) (xp, yn); \n"
|
865 |
" int2 coord7 = (int2) (xn, yp); \n"
|
866 |
" int2 coord8 = (int2) (xn, yn); \n"
|
867 |
" float4 ccc = read_imagef(tex, sampler,coord0);\n"
|
868 |
" float4 clc = read_imagef(tex, sampler,coord1);\n"
|
869 |
" float4 crc = read_imagef(tex, sampler,coord2);\n"
|
870 |
" float4 ccd = read_imagef(tex, sampler,coord3);\n"
|
871 |
" float4 ccu = read_imagef(tex, sampler,coord4);\n"
|
872 |
" float4 cld = read_imagef(tex, sampler,coord5);\n"
|
873 |
" float4 clu = read_imagef(tex, sampler,coord6);\n"
|
874 |
" float4 crd = read_imagef(tex, sampler,coord7);\n"
|
875 |
" float4 cru = read_imagef(tex, sampler,coord8);\n"
|
876 |
" float4 cc = ccc;\n"
|
877 |
" float4 v1[4], v2[4];\n"
|
878 |
" v1[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
|
879 |
" v1[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
|
880 |
" v1[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
|
881 |
" v1[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
|
882 |
" v2[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
|
883 |
" v2[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
|
884 |
" v2[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
|
885 |
" v2[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n"
|
886 |
" float4 key4 = (float4)(0); \n";
|
887 |
//test against 8 neighbours
|
888 |
//use variable to identify type of extremum
|
889 |
//1.0 for local maximum and -1.0 for minimum
|
890 |
for(i = 0; i < 4; ++i) |
891 |
out<< |
892 |
" if(cc.s"<<i<<" > THRESHOLD0){ \n" |
893 |
" if(all(isgreater((float4)(cc.s"<<i<<"), max(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = 1.0;\n" |
894 |
" }else if(cc.s"<<i<<" < -THRESHOLD0){ \n" |
895 |
" if(all(isless((float4)(cc.s"<<i<<"), min(v1["<<i<<"], v2["<<i<<"]))))key4.s"<<i<<" = -1.0;\n" |
896 |
" }";
|
897 |
|
898 |
out<< |
899 |
" if(x ==0) {key4.x = key4.z= 0; }\n"
|
900 |
" else if(x + 1 == width) {key4.y = key4.w = 0;}\n"
|
901 |
" if(y ==0) {key4.x = key4.y = 0; }\n"
|
902 |
" else if(y + 1 == height) {key4.z = key4.w = 0;}\n"
|
903 |
" float4 ak = fabs(key4); \n"
|
904 |
" float keysum = ak.x + ak.y + ak.z + ak.w; \n"
|
905 |
" float4 result = (float4)(0.0);\n"
|
906 |
" if(keysum == 1.0) {\n"
|
907 |
" float fxx[4], fyy[4], fxy[4], fx[4], fy[4];\n";
|
908 |
|
909 |
//do edge supression first..
|
910 |
//vector v1 is < (-1, 0), (1, 0), (0,-1), (0, 1)>
|
911 |
//vector v2 is < (-1,-1), (-1,1), (1,-1), (1, 1)>
|
912 |
for(i = 0; i < 4; ++i) |
913 |
out << |
914 |
" if(key4.s"<<i<<" != 0)\n" |
915 |
" {\n"
|
916 |
" float4 D2 = v1["<<i<<"].xyzw - cc.s"<<i<<";\n" |
917 |
" float2 D4 = v2["<<i<<"].xw - v2["<<i<<"].yz;\n" |
918 |
" float2 D5 = 0.5*(v1["<<i<<"].yw-v1["<<i<<"].xz); \n" |
919 |
" fx["<<i<<"] = D5.x; fy["<<i<<"] = D5.y ;\n" |
920 |
" fxx["<<i<<"] = D2.x + D2.y;\n" |
921 |
" fyy["<<i<<"] = D2.z + D2.w;\n" |
922 |
" fxy["<<i<<"] = 0.25*(D4.x + D4.y);\n" |
923 |
" float fxx_plus_fyy = fxx["<<i<<"] + fyy["<<i<<"];\n" |
924 |
" float score_up = fxx_plus_fyy*fxx_plus_fyy; \n"
|
925 |
" float score_down = (fxx["<<i<<"]*fyy["<<i<<"] - fxy["<<i<<"]*fxy["<<i<<"]);\n" |
926 |
" if( score_down <= 0 || score_up > THRESHOLD2 * score_down)keysum = 0;\n"
|
927 |
" }\n";
|
928 |
|
929 |
out << |
930 |
" if(keysum == 1) {\n";
|
931 |
////////////////////////////////////////////////
|
932 |
//read 9 pixels of upper/lower level
|
933 |
out<< |
934 |
" float4 v4[4], v5[4], v6[4];\n"
|
935 |
" ccc = read_imagef(texU, sampler,coord0);\n"
|
936 |
" clc = read_imagef(texU, sampler,coord1);\n"
|
937 |
" crc = read_imagef(texU, sampler,coord2);\n"
|
938 |
" ccd = read_imagef(texU, sampler,coord3);\n"
|
939 |
" ccu = read_imagef(texU, sampler,coord4);\n"
|
940 |
" cld = read_imagef(texU, sampler,coord5);\n"
|
941 |
" clu = read_imagef(texU, sampler,coord6);\n"
|
942 |
" crd = read_imagef(texU, sampler,coord7);\n"
|
943 |
" cru = read_imagef(texU, sampler,coord8);\n"
|
944 |
" float4 cu = ccc;\n"
|
945 |
" v4[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
|
946 |
" v4[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
|
947 |
" v4[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
|
948 |
" v4[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
|
949 |
" v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
|
950 |
" v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
|
951 |
" v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
|
952 |
" v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
|
953 |
|
954 |
for(i = 0; i < 4; ++i) |
955 |
out << |
956 |
" if(key4.s"<<i<<" == 1.0)\n" |
957 |
" {\n"
|
958 |
" if(cc.s"<<i<<" < cu.s"<<i<<" || \n" |
959 |
" any(isless((float4)(cc.s"<<i<<"), max(v4["<<i<<"], v6["<<i<<"]))))keysum = 0; \n" |
960 |
" }else if(key4.s"<<i<<" == -1.0)\n" |
961 |
" {\n"
|
962 |
" if(cc.s"<<i<<" > cu.s"<<i<<" || \n" |
963 |
" any(isgreater((float4)(cc.s"<<i<<"), min(v4["<<i<<"], v6["<<i<<"]))) )keysum = 0; \n" |
964 |
" }\n";
|
965 |
|
966 |
out << |
967 |
" if(keysum == 1.0) { \n";
|
968 |
out << |
969 |
" ccc = read_imagef(texD, sampler,coord0);\n"
|
970 |
" clc = read_imagef(texD, sampler,coord1);\n"
|
971 |
" crc = read_imagef(texD, sampler,coord2);\n"
|
972 |
" ccd = read_imagef(texD, sampler,coord3);\n"
|
973 |
" ccu = read_imagef(texD, sampler,coord4);\n"
|
974 |
" cld = read_imagef(texD, sampler,coord5);\n"
|
975 |
" clu = read_imagef(texD, sampler,coord6);\n"
|
976 |
" crd = read_imagef(texD, sampler,coord7);\n"
|
977 |
" cru = read_imagef(texD, sampler,coord8);\n"
|
978 |
" float4 cd = ccc;\n"
|
979 |
" v5[0] = (float4)(clc.y, ccc.y, ccd.z, ccc.z);\n"
|
980 |
" v5[1] = (float4)(ccc.x, crc.x, ccd.w, ccc.w);\n"
|
981 |
" v5[2] = (float4)(clc.w, ccc.w, ccc.x, ccu.x);\n"
|
982 |
" v5[3] = (float4)(ccc.z, crc.z, ccc.y, ccu.y);\n"
|
983 |
" v6[0] = (float4)(cld.w, clc.w, ccd.w, ccc.w);\n"
|
984 |
" v6[1] = (float4)(ccd.z, ccc.z, crd.z, crc.z);\n"
|
985 |
" v6[2] = (float4)(clc.y, clu.y, ccc.y, ccu.y);\n"
|
986 |
" v6[3] = (float4)(ccc.x, ccu.x, crc.x, cru.x);\n";
|
987 |
for(i = 0; i < 4; ++i) |
988 |
out << |
989 |
" if(key4.s"<<i<<" == 1.0)\n" |
990 |
" {\n"
|
991 |
" if(cc.s"<<i<<" < cd.s"<<i<<" ||\n" |
992 |
" any(isless((float4)(cc.s"<<i<<"), max(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n" |
993 |
" }else if(key4.s"<<i<<" == -1.0)\n" |
994 |
" {\n"
|
995 |
" if(cc.s"<<i<<" > cd.s"<<i<<" ||\n" |
996 |
" any(isgreater((float4)(cc.s"<<i<<"), min(v5["<<i<<"], v6["<<i<<"]))))keysum = 0; \n" |
997 |
" }\n";
|
998 |
|
999 |
out << |
1000 |
" if(keysum==1.0) {\n";
|
1001 |
//////////////////////////////////////////////////////////////////////
|
1002 |
if(GlobalUtil::_SubpixelLocalization)
|
1003 |
{ |
1004 |
out << |
1005 |
" float4 offset = (float4)(0); \n";
|
1006 |
for(i = 1; i < 4; ++i) |
1007 |
out << |
1008 |
" if(key4.s"<<i<<" != 0) \n" |
1009 |
" {\n"
|
1010 |
" cu.s0 = cu.s"<<i<<"; cd.s0 = cd.s"<<i<<"; cc.s0 = cc.s"<<i<<"; \n" |
1011 |
" v4[0] = v4["<<i<<"]; v5[0] = v5["<<i<<"]; \n" |
1012 |
" fxy[0] = fxy["<<i<<"]; fxx[0] = fxx["<<i<<"]; fyy[0] = fyy["<<i<<"]; \n" |
1013 |
" fx[0] = fx["<<i<<"]; fy[0] = fy["<<i<<"]; \n" |
1014 |
" }\n";
|
1015 |
|
1016 |
out << |
1017 |
" float fs = 0.5*( cu.s0 - cd.s0 ); \n"
|
1018 |
" float fss = cu.s0 + cd.s0 - cc.s0 - cc.s0;\n"
|
1019 |
" float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);\n"
|
1020 |
" float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);\n"
|
1021 |
" float4 A0, A1, A2 ; \n"
|
1022 |
" A0 = (float4)(fxx[0], fxy[0], fxs, -fx[0]); \n"
|
1023 |
" A1 = (float4)(fxy[0], fyy[0], fys, -fy[0]); \n"
|
1024 |
" A2 = (float4)(fxs, fys, fss, -fs); \n"
|
1025 |
" float4 x3 = fabs((float4)(fxx[0], fxy[0], fxs, 0)); \n"
|
1026 |
" float maxa = max(max(x3.x, x3.y), x3.z); \n"
|
1027 |
" if(maxa >= 1e-10 ) \n"
|
1028 |
" { \n"
|
1029 |
" if(x3.y ==maxa ) \n"
|
1030 |
" { \n"
|
1031 |
" float4 TEMP = A1; A1 = A0; A0 = TEMP; \n"
|
1032 |
" }else if( x3.z == maxa ) \n"
|
1033 |
" { \n"
|
1034 |
" float4 TEMP = A2; A2 = A0; A0 = TEMP; \n"
|
1035 |
" } \n"
|
1036 |
" A0 /= A0.x; \n"
|
1037 |
" A1 -= A1.x * A0; \n"
|
1038 |
" A2 -= A2.x * A0; \n"
|
1039 |
" float2 x2 = fabs((float2)(A1.y, A2.y)); \n"
|
1040 |
" if( x2.y > x2.x ) \n"
|
1041 |
" { \n"
|
1042 |
" float4 TEMP = A2.yzwx; \n"
|
1043 |
" A2.yzw = A1.yzw; \n"
|
1044 |
" A1.yzw = TEMP.xyz; \n"
|
1045 |
" x2.x = x2.y; \n"
|
1046 |
" } \n"
|
1047 |
" if(x2.x >= 1e-10) { \n"
|
1048 |
" A1.yzw /= A1.y; \n"
|
1049 |
" A2.yzw -= A2.y * A1.yzw; \n"
|
1050 |
" if(fabs(A2.z) >= 1e-10) {\n"
|
1051 |
" offset.z = A2.w /A2.z; \n"
|
1052 |
" offset.y = A1.w - offset.z*A1.z; \n"
|
1053 |
" offset.x = A0.w - offset.z*A0.z - offset.y*A0.y; \n"
|
1054 |
" if(fabs(cc.s0 + 0.5*dot((float4)(fx[0], fy[0], fs, 0), offset ))<=THRESHOLD1\n"
|
1055 |
" || any( isgreater(fabs(offset), (float4)(1.0)))) key4 = (float4)(0.0);\n"
|
1056 |
" }\n"
|
1057 |
" }\n"
|
1058 |
" }\n"
|
1059 |
<<"\n"
|
1060 |
" float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
|
1061 |
" result = (float4)(keyv, offset.xyz);\n"
|
1062 |
" }}}}\n"
|
1063 |
" write_imagef(texK, coord0, result);\n "
|
1064 |
"}\n" <<'\0'; |
1065 |
} |
1066 |
else
|
1067 |
{ |
1068 |
out << "\n"
|
1069 |
" float keyv = dot(key4, (float4)(1.0, 2.0, 3.0, 4.0));\n"
|
1070 |
" result = (float4)(keyv, 0, 0, 0);\n"
|
1071 |
" }}}}\n"
|
1072 |
" write_imagef(texK, coord0, result);\n "
|
1073 |
"}\n" <<'\0'; |
1074 |
} |
1075 |
|
1076 |
s_keypoint = new ProgramCL("keypoint", buffer, _context, _device); |
1077 |
} |
1078 |
|
1079 |
void ProgramBagCL::LoadDisplayShaders()
|
1080 |
{ |
1081 |
//"uniform sampler2DRect tex; void main(){\n"
|
1082 |
//"vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));\n"
|
1083 |
//"float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}");
|
1084 |
s_unpack = new ProgramCL("main", |
1085 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1086 |
" int width, int height) {\n"
|
1087 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1088 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1089 |
"if(x >= width || y >= height) return;\n"
|
1090 |
"int xx = x / 2, yy = y / 2; \n"
|
1091 |
"float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
|
1092 |
"float v1 = (x & 1 ? vv.w : vv.z); \n"
|
1093 |
"float v2 = (x & 1 ? vv.y : vv.x);\n"
|
1094 |
"float v = y & 1 ? v1 : v2;\n"
|
1095 |
"float4 result = (float4) (v, v, v, 1);"
|
1096 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1097 |
|
1098 |
s_unpack_dog = new ProgramCL("main", |
1099 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1100 |
" int width, int height) {\n"
|
1101 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1102 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1103 |
"if(x >= width || y >= height) return;\n"
|
1104 |
"int xx = x / 2, yy = y / 2; \n"
|
1105 |
"float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
|
1106 |
"float v1 = (x & 1 ? vv.w : vv.z); \n"
|
1107 |
"float v2 = (x & 1 ? vv.y : vv.x);\n"
|
1108 |
"float v0 = y & 1 ? v1 : v2;\n"
|
1109 |
"float v = 0.5 + 20.0 * v0;\n "
|
1110 |
"float4 result = (float4) (v, v, v, 1);"
|
1111 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1112 |
|
1113 |
s_unpack_grd = new ProgramCL("main", |
1114 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1115 |
" int width, int height) {\n"
|
1116 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1117 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1118 |
"if(x >= width || y >= height) return;\n"
|
1119 |
"int xx = x / 2, yy = y / 2; \n"
|
1120 |
"float4 vv = read_imagef(input, sampler, (int2) (xx, yy)); \n"
|
1121 |
"float v1 = (x & 1 ? vv.w : vv.z); \n"
|
1122 |
"float v2 = (x & 1 ? vv.y : vv.x);\n"
|
1123 |
"float v0 = y & 1 ? v1 : v2;\n"
|
1124 |
"float v = 5.0 * v0;\n "
|
1125 |
"float4 result = (float4) (v, v, v, 1);"
|
1126 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1127 |
|
1128 |
s_unpack_key = new ProgramCL("main", |
1129 |
"__kernel void main(__read_only image2d_t dog,\n"
|
1130 |
" __read_only image2d_t key,\n"
|
1131 |
" __write_only image2d_t output,\n"
|
1132 |
" int width, int height) {\n"
|
1133 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1134 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1135 |
"if(x >= width || y >= height) return;\n"
|
1136 |
"int xx = x / 2, yy = y / 2; \n"
|
1137 |
"float4 kk = read_imagef(key, sampler, (int2) (xx, yy));\n"
|
1138 |
"int4 cc = isequal(fabs(kk.xxxx), (float4)(1.0, 2.0, 3.0, 4.0));\n"
|
1139 |
"int k1 = (x & 1 ? cc.w : cc.z); \n"
|
1140 |
"int k2 = (x & 1 ? cc.y : cc.x);\n"
|
1141 |
"int k0 = y & 1 ? k1 : k2;\n"
|
1142 |
"float4 result;\n"
|
1143 |
"if(k0 != 0){\n"
|
1144 |
" //result = kk.x > 0 ? ((float4)(1.0, 0, 0, 1.0)) : ((float4) (0.0, 1.0, 0.0, 1.0)); \n"
|
1145 |
" result = kk.x < 0 ? ((float4)(0, 1.0, 0, 1.0)) : ((float4) (1.0, 0.0, 0.0, 1.0)); \n"
|
1146 |
"}else{"
|
1147 |
"float4 vv = read_imagef(dog, sampler, (int2) (xx, yy));\n"
|
1148 |
"float v1 = (x & 1 ? vv.w : vv.z); \n"
|
1149 |
"float v2 = (x & 1 ? vv.y : vv.x);\n"
|
1150 |
"float v0 = y & 1 ? v1 : v2;\n"
|
1151 |
"float v = 0.5 + 20.0 * v0;\n "
|
1152 |
"result = (float4) (v, v, v, 1);"
|
1153 |
"}\n"
|
1154 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1155 |
} |
1156 |
|
1157 |
|
1158 |
void ProgramBagCL::SetMarginCopyParam(int xmax, int ymax) |
1159 |
{ |
1160 |
|
1161 |
} |
1162 |
|
1163 |
void ProgramBagCL::SetGradPassParam(int texP) |
1164 |
{ |
1165 |
|
1166 |
} |
1167 |
|
1168 |
void ProgramBagCL::SetGenListEndParam(int ktex) |
1169 |
{ |
1170 |
|
1171 |
} |
1172 |
|
1173 |
void ProgramBagCL::SetDogTexParam(int texU, int texD) |
1174 |
{ |
1175 |
|
1176 |
} |
1177 |
|
1178 |
void ProgramBagCL::SetGenListInitParam(int w, int h) |
1179 |
{ |
1180 |
float bbox[4] = {(w -1.0f) * 0.5f +0.25f, (w-1.0f) * 0.5f - 0.25f, (h - 1.0f) * 0.5f + 0.25f, (h-1.0f) * 0.5f - 0.25f}; |
1181 |
|
1182 |
} |
1183 |
|
1184 |
|
1185 |
void ProgramBagCL::SetGenListStartParam(float width, int tex0) |
1186 |
{ |
1187 |
|
1188 |
} |
1189 |
|
1190 |
|
1191 |
|
1192 |
void ProgramBagCL::SetGenListStepParam(int tex, int tex0) |
1193 |
{ |
1194 |
|
1195 |
} |
1196 |
|
1197 |
void ProgramBagCL::SetGenVBOParam(float width, float fwidth, float size) |
1198 |
{ |
1199 |
|
1200 |
} |
1201 |
|
1202 |
void ProgramBagCL::SetSimpleOrientationInput(int oTex, float sigma, float sigma_step) |
1203 |
{ |
1204 |
|
1205 |
} |
1206 |
|
1207 |
|
1208 |
void ProgramBagCL::SetFeatureOrientationParam(int gtex, int width, int height, float sigma, int otex, float step) |
1209 |
{ |
1210 |
|
1211 |
|
1212 |
} |
1213 |
|
1214 |
void ProgramBagCL::SetFeatureDescirptorParam(int gtex, int otex, float dwidth, float fwidth, float width, float height, float sigma) |
1215 |
{ |
1216 |
|
1217 |
} |
1218 |
|
1219 |
|
1220 |
|
1221 |
const char* ProgramBagCL::GetErrorString(cl_int error) |
1222 |
{ |
1223 |
static const char* errorString[] = { |
1224 |
"CL_SUCCESS",
|
1225 |
"CL_DEVICE_NOT_FOUND",
|
1226 |
"CL_DEVICE_NOT_AVAILABLE",
|
1227 |
"CL_COMPILER_NOT_AVAILABLE",
|
1228 |
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
1229 |
"CL_OUT_OF_RESOURCES",
|
1230 |
"CL_OUT_OF_HOST_MEMORY",
|
1231 |
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
1232 |
"CL_MEM_COPY_OVERLAP",
|
1233 |
"CL_IMAGE_FORMAT_MISMATCH",
|
1234 |
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
1235 |
"CL_BUILD_PROGRAM_FAILURE",
|
1236 |
"CL_MAP_FAILURE",
|
1237 |
"",
|
1238 |
"",
|
1239 |
"",
|
1240 |
"",
|
1241 |
"",
|
1242 |
"",
|
1243 |
"",
|
1244 |
"",
|
1245 |
"",
|
1246 |
"",
|
1247 |
"",
|
1248 |
"",
|
1249 |
"",
|
1250 |
"",
|
1251 |
"",
|
1252 |
"",
|
1253 |
"",
|
1254 |
"CL_INVALID_VALUE",
|
1255 |
"CL_INVALID_DEVICE_TYPE",
|
1256 |
"CL_INVALID_PLATFORM",
|
1257 |
"CL_INVALID_DEVICE",
|
1258 |
"CL_INVALID_CONTEXT",
|
1259 |
"CL_INVALID_QUEUE_PROPERTIES",
|
1260 |
"CL_INVALID_COMMAND_QUEUE",
|
1261 |
"CL_INVALID_HOST_PTR",
|
1262 |
"CL_INVALID_MEM_OBJECT",
|
1263 |
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
1264 |
"CL_INVALID_IMAGE_SIZE",
|
1265 |
"CL_INVALID_SAMPLER",
|
1266 |
"CL_INVALID_BINARY",
|
1267 |
"CL_INVALID_BUILD_OPTIONS",
|
1268 |
"CL_INVALID_PROGRAM",
|
1269 |
"CL_INVALID_PROGRAM_EXECUTABLE",
|
1270 |
"CL_INVALID_KERNEL_NAME",
|
1271 |
"CL_INVALID_KERNEL_DEFINITION",
|
1272 |
"CL_INVALID_KERNEL",
|
1273 |
"CL_INVALID_ARG_INDEX",
|
1274 |
"CL_INVALID_ARG_VALUE",
|
1275 |
"CL_INVALID_ARG_SIZE",
|
1276 |
"CL_INVALID_KERNEL_ARGS",
|
1277 |
"CL_INVALID_WORK_DIMENSION",
|
1278 |
"CL_INVALID_WORK_GROUP_SIZE",
|
1279 |
"CL_INVALID_WORK_ITEM_SIZE",
|
1280 |
"CL_INVALID_GLOBAL_OFFSET",
|
1281 |
"CL_INVALID_EVENT_WAIT_LIST",
|
1282 |
"CL_INVALID_EVENT",
|
1283 |
"CL_INVALID_OPERATION",
|
1284 |
"CL_INVALID_GL_OBJECT",
|
1285 |
"CL_INVALID_BUFFER_SIZE",
|
1286 |
"CL_INVALID_MIP_LEVEL",
|
1287 |
"CL_INVALID_GLOBAL_WORK_SIZE",
|
1288 |
}; |
1289 |
|
1290 |
const int errorCount = sizeof(errorString) / sizeof(errorString[0]); |
1291 |
|
1292 |
const int index = -error; |
1293 |
|
1294 |
return (index >= 0 && index < errorCount) ? errorString[index] : ""; |
1295 |
} |
1296 |
|
1297 |
bool ProgramBagCL::CheckErrorCL(cl_int error, const char* location) |
1298 |
{ |
1299 |
if(error == CL_SUCCESS) return true; |
1300 |
const char *errstr = GetErrorString(error); |
1301 |
if(errstr && errstr[0]) std::cerr << errstr; |
1302 |
else std::cerr << "Error " << error; |
1303 |
if(location) std::cerr << " at " << location; |
1304 |
std::cerr << "\n";
|
1305 |
exit(0);
|
1306 |
return false; |
1307 |
|
1308 |
} |
1309 |
|
1310 |
|
1311 |
////////////////////////////////////////////////////////////////////////
|
1312 |
////////////////////////////////////////////////////////////////////////
|
1313 |
|
1314 |
void ProgramBagCLN::LoadFixedShaders()
|
1315 |
{ |
1316 |
s_sampling = new ProgramCL("sampling", |
1317 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1318 |
"__kernel void sampling(__read_only image2d_t input, __write_only image2d_t output, "
|
1319 |
" int width, int height) {\n"
|
1320 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
|
1321 |
"if( coord.x >= width || coord.y >= height) return;\n"
|
1322 |
"write_imagef(output, coord, read_imagef(input, sampler, coord << 1)); }" , _context, _device);
|
1323 |
|
1324 |
s_sampling_k = new ProgramCL("sampling_k", |
1325 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1326 |
"__kernel void sampling_k(__read_only image2d_t input, __write_only image2d_t output, "
|
1327 |
" int width, int height, int step) {\n"
|
1328 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1329 |
"if( x >= width || y >= height) return;\n"
|
1330 |
"int xa = x * step, ya = y *step; \n"
|
1331 |
"float4 v1 = read_imagef(input, sampler, (int2) (xa, ya)); \n"
|
1332 |
"write_imagef(output, (int2) (x, y), v1); }" , _context, _device);
|
1333 |
|
1334 |
|
1335 |
s_sampling_u = new ProgramCL("sampling_u", |
1336 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n"
|
1337 |
"__kernel void sampling_u(__read_only image2d_t input, \n"
|
1338 |
" __write_only image2d_t output,\n"
|
1339 |
" int width, int height, float step) {\n"
|
1340 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1341 |
"if( x >= width || y >= height) return;\n"
|
1342 |
"float xa = x * step, ya = y *step; \n"
|
1343 |
"float v1 = read_imagef(input, sampler, (float2) (xa, ya)).x; \n"
|
1344 |
"write_imagef(output, (int2) (x, y), (float4)(v1)); }" , _context, _device);
|
1345 |
|
1346 |
s_dog_pass = new ProgramCL("dog_pass", |
1347 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1348 |
"__kernel void dog_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
1349 |
" __write_only image2d_t dog, int width, int height) {\n"
|
1350 |
"int2 coord = (int2)(get_global_id(0), get_global_id(1)); \n"
|
1351 |
"if( coord.x >= width || coord.y >= height) return;\n"
|
1352 |
"float cc = read_imagef(tex , sampler, coord).x; \n"
|
1353 |
"float cp = read_imagef(texp, sampler, coord).x;\n"
|
1354 |
"write_imagef(dog, coord, (float4)(cc - cp)); }\n", _context, _device);
|
1355 |
|
1356 |
s_grad_pass = new ProgramCL("grad_pass", |
1357 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1358 |
"__kernel void grad_pass(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
1359 |
" __write_only image2d_t dog, int width, int height, \n"
|
1360 |
" __write_only image2d_t grad, __write_only image2d_t rot) {\n"
|
1361 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1362 |
"if( x >= width || y >= height) return;\n"
|
1363 |
"int2 coord = (int2) (x, y);\n"
|
1364 |
"float cl = read_imagef(tex, sampler, (int2)(x - 1, y)).x;\n"
|
1365 |
"float cc = read_imagef(tex , sampler, coord).x; \n"
|
1366 |
"float cr = read_imagef(tex, sampler, (int2)(x + 1, y)).x;\n"
|
1367 |
"float cp = read_imagef(texp, sampler, coord).x;\n"
|
1368 |
"write_imagef(dog, coord, (float4)(cc - cp)); \n"
|
1369 |
"float cd = read_imagef(tex, sampler, (int2)(x, y - 1)).x;\n"
|
1370 |
"float cu = read_imagef(tex, sampler, (int2)(x, y + 1)).x;\n"
|
1371 |
"float dx = cr - cl, dy = cu - cd; \n"
|
1372 |
"float gg = 0.5 * sqrt(dx*dx + dy * dy);\n"
|
1373 |
"write_imagef(grad, coord, (float4)(gg));\n"
|
1374 |
"float oo = atan2(dy, dx + FLT_MIN);\n"
|
1375 |
"write_imagef(rot, coord, (float4)(oo));}\n", _context, _device);
|
1376 |
|
1377 |
s_grad_pass2 = new ProgramCL("grad_pass2", |
1378 |
"#define BLOCK_DIMX 32\n"
|
1379 |
"#define BLOCK_DIMY 14\n"
|
1380 |
"#define BLOCK_SIZE (BLOCK_DIMX * BLOCK_DIMY)\n"
|
1381 |
"__kernel void grad_pass2(__read_only image2d_t tex, __read_only image2d_t texp,\n"
|
1382 |
" __write_only image2d_t dog, int width, int height,\n"
|
1383 |
" __write_only image2d_t grd, __write_only image2d_t rot){\n"//, __local float* block) {\n" |
1384 |
"__local float block[BLOCK_SIZE]; \n"
|
1385 |
"sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
|
1386 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1387 |
"int2 coord = (int2) ( get_global_id(0) - get_group_id(0) * 2 - 1, \n"
|
1388 |
" get_global_id(1) - get_group_id(1) * 2 - 1); \n"
|
1389 |
"int idx = mad24(get_local_id(1), BLOCK_DIMX, get_local_id(0));\n"
|
1390 |
"float cc = read_imagef(tex, sampler, coord).x;\n"
|
1391 |
"block[idx] = cc;\n"
|
1392 |
"barrier(CLK_LOCAL_MEM_FENCE);\n"
|
1393 |
"if( get_local_id(0) == 0 || get_local_id(0) == BLOCK_DIMX - 1) return;\n"
|
1394 |
"if( get_local_id(1) == 0 || get_local_id(1) == BLOCK_DIMY - 1) return;\n"
|
1395 |
"if( coord.x >= width) return; \n"
|
1396 |
"if( coord.y >= height) return;\n"
|
1397 |
"float cp = read_imagef(texp, sampler, coord).x;\n"
|
1398 |
"float dx = block[idx + 1] - block[idx - 1];\n"
|
1399 |
"float dy = block[idx + BLOCK_DIMX ] - block[idx - BLOCK_DIMX];\n"
|
1400 |
"write_imagef(dog, coord, (float4)(cc - cp)); \n"
|
1401 |
"write_imagef(grd, coord, (float4)(0.5 * sqrt(dx*dx + dy * dy)));\n"
|
1402 |
"write_imagef(rot, coord, (float4)(atan2(dy, dx + FLT_MIN)));}\n", _context, _device);
|
1403 |
} |
1404 |
|
1405 |
void ProgramBagCLN::LoadDisplayShaders()
|
1406 |
{ |
1407 |
s_unpack = new ProgramCL("main", |
1408 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1409 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1410 |
" int width, int height) {\n"
|
1411 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1412 |
"if(x >= width || y >= height) return;\n"
|
1413 |
"float v = read_imagef(input, sampler, (int2) (x, y)).x; \n"
|
1414 |
"float4 result = (float4) (v, v, v, 1);"
|
1415 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1416 |
|
1417 |
s_unpack_grd = new ProgramCL("main", |
1418 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |\n"
|
1419 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1420 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1421 |
" int width, int height) {\n"
|
1422 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1423 |
"if(x >= width || y >= height) return;\n"
|
1424 |
"float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
|
1425 |
"float v = 5.0 * v0; float4 result = (float4) (v, v, v, 1);"
|
1426 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1427 |
|
1428 |
s_unpack_dog = new ProgramCL("main", |
1429 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1430 |
"__kernel void main(__read_only image2d_t input, __write_only image2d_t output,\n"
|
1431 |
" int width, int height) {\n"
|
1432 |
"int x = get_global_id(0), y = get_global_id(1); \n"
|
1433 |
"if(x >= width || y >= height) return;\n"
|
1434 |
"float v0 = read_imagef(input, sampler, (int2) (x, y)).x; \n"
|
1435 |
"float v = 0.5 + 20.0 * v0; float4 result = (float4) (v, v, v, 1);"
|
1436 |
"write_imagef(output, (int2) (x, y), result); }" , _context, _device);
|
1437 |
} |
1438 |
|
1439 |
ProgramCL* ProgramBagCLN::CreateFilterH(float kernel[], int width) |
1440 |
{ |
1441 |
////////////////////////////
|
1442 |
char buffer[10240]; |
1443 |
ostrstream out(buffer, 10240);
|
1444 |
out << "#define KERNEL_WIDTH " << width << "\n" |
1445 |
<< "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" |
1446 |
"#define BLOCK_WIDTH 128\n"
|
1447 |
"#define BLOCK_HEIGHT 1\n"
|
1448 |
"#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
|
1449 |
"#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
|
1450 |
"#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
|
1451 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1452 |
"__kernel void filter_h(__read_only image2d_t input, \n"
|
1453 |
" __write_only image2d_t output, int width_, int height_, \n"
|
1454 |
" __constant float* weight) {\n"
|
1455 |
"__local float data[CACHE_WIDTH]; \n"
|
1456 |
"int x = get_global_id(0), y = get_global_id(1);\n"
|
1457 |
"#pragma unroll\n"
|
1458 |
"for(int j = 0; j < CACHE_COUNT; ++j)\n"
|
1459 |
"{\n"
|
1460 |
" if(get_local_id(0) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
|
1461 |
" {\n"
|
1462 |
" int fetch_index = min(x + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, width_);\n"
|
1463 |
" data[get_local_id(0) + j * BLOCK_WIDTH] = read_imagef(input, sampler, (int2)(fetch_index, y)).x;\n"
|
1464 |
" }\n"
|
1465 |
"}\n"
|
1466 |
"barrier(CLK_LOCAL_MEM_FENCE); \n"
|
1467 |
"if( x > width_ || y > height_) return; \n"
|
1468 |
"float result = 0; \n"
|
1469 |
"#pragma unroll\n"
|
1470 |
"for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
|
1471 |
"{\n"
|
1472 |
" result += data[get_local_id(0) + i] * weight[i];\n"
|
1473 |
"}\n"
|
1474 |
<< "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0'; |
1475 |
return new ProgramCL("filter_h", buffer, _context, _device); |
1476 |
} |
1477 |
|
1478 |
|
1479 |
|
1480 |
ProgramCL* ProgramBagCLN::CreateFilterV(float kernel[], int width) |
1481 |
{ |
1482 |
////////////////////////////
|
1483 |
char buffer[10240]; |
1484 |
ostrstream out(buffer, 10240);
|
1485 |
out << "#define KERNEL_WIDTH " << width << "\n" |
1486 |
<< "#define KERNEL_HALF_WIDTH " << (width / 2) << "\n" |
1487 |
"#define BLOCK_WIDTH 128\n"
|
1488 |
"#define CACHE_WIDTH (BLOCK_WIDTH + KERNEL_WIDTH - 1)\n"
|
1489 |
"#define CACHE_WIDTH_ALIGNED ((CACHE_WIDTH + 15) / 16 * 16)\n"
|
1490 |
"#define CACHE_COUNT (2 + (CACHE_WIDTH - 2) / BLOCK_WIDTH)\n"
|
1491 |
"const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n"
|
1492 |
" CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n"
|
1493 |
"__kernel void filter_v(__read_only image2d_t input, \n"
|
1494 |
" __write_only image2d_t output, int width_, int height_, \n"
|
1495 |
" __constant float* weight) {\n"
|
1496 |
"__local float data[CACHE_WIDTH]; \n"
|
1497 |
"int x = get_global_id(0), y = get_global_id(1);\n"
|
1498 |
"#pragma unroll\n"
|
1499 |
"for(int j = 0; j < CACHE_COUNT; ++j)\n"
|
1500 |
"{\n"
|
1501 |
" if(get_local_id(1) + j * BLOCK_WIDTH < CACHE_WIDTH)\n"
|
1502 |
" {\n"
|
1503 |
" int fetch_index = min(y + j * BLOCK_WIDTH - KERNEL_HALF_WIDTH, height_);\n"
|
1504 |
" data[get_local_id(1) + j * BLOCK_WIDTH ] = read_imagef(input, sampler, (int2)(x, fetch_index)).x;\n"
|
1505 |
" }\n"
|
1506 |
"}\n"
|
1507 |
"barrier(CLK_LOCAL_MEM_FENCE); \n"
|
1508 |
"if( x > width_ || y > height_) return; \n"
|
1509 |
"float result = 0; \n"
|
1510 |
"#pragma unroll\n"
|
1511 |
"for(int i = 0; i < KERNEL_WIDTH; ++i)\n"
|
1512 |
"{\n"
|
1513 |
" result += data[get_local_id(1) + i] * weight[i];\n"
|
1514 |
"}\n"
|
1515 |
<< "write_imagef(output, (int2)(x, y), (float4)(result)); }\n" << '\0'; |
1516 |
|
1517 |
return new ProgramCL("filter_v", buffer, _context, _device); |
1518 |
} |
1519 |
|
1520 |
FilterCL* ProgramBagCLN::CreateFilter(float kernel[], int width) |
1521 |
{ |
1522 |
FilterCL * filter = new FilterCL;
|
1523 |
filter->s_shader_h = CreateFilterH(kernel, width); |
1524 |
filter->s_shader_v = CreateFilterV(kernel, width); |
1525 |
filter->_weight = new CLTexImage(_context, _queue);
|
1526 |
filter->_weight->InitBufferTex(width, 1, 1); |
1527 |
filter->_weight->CopyFromHost(kernel); |
1528 |
filter->_size = width; |
1529 |
return filter;
|
1530 |
} |
1531 |
|
1532 |
|
1533 |
void ProgramBagCLN::FilterImage(FilterCL* filter, CLTexImage *dst, CLTexImage *src, CLTexImage*tmp)
|
1534 |
{ |
1535 |
cl_kernel kernelh = filter->s_shader_h->_kernel; |
1536 |
cl_kernel kernelv = filter->s_shader_v->_kernel; |
1537 |
//////////////////////////////////////////////////////////////////
|
1538 |
|
1539 |
cl_int status, w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
1540 |
cl_mem weight = (cl_mem) filter->_weight->_clData; |
1541 |
cl_int w_ = w - 1, h_ = h - 1; |
1542 |
|
1543 |
|
1544 |
clSetKernelArg(kernelh, 0, sizeof(cl_mem), &src->_clData); |
1545 |
clSetKernelArg(kernelh, 1, sizeof(cl_mem), &tmp->_clData); |
1546 |
clSetKernelArg(kernelh, 2, sizeof(cl_int), &w_); |
1547 |
clSetKernelArg(kernelh, 3, sizeof(cl_int), &h_); |
1548 |
clSetKernelArg(kernelh, 4, sizeof(cl_mem), &weight); |
1549 |
|
1550 |
size_t dim00 = 128, dim01 = 1; |
1551 |
size_t gsz1[2] = {(w + dim00 - 1) / dim00 * dim00, (h + dim01 - 1) / dim01 * dim01}, lsz1[2] = {dim00, dim01}; |
1552 |
status = clEnqueueNDRangeKernel(_queue, kernelh, 2, NULL, gsz1, lsz1, 0, NULL, NULL); |
1553 |
CheckErrorCL(status, "ProgramBagCLN::FilterImageH");
|
1554 |
if(status != CL_SUCCESS) return; |
1555 |
|
1556 |
|
1557 |
clSetKernelArg(kernelv, 0, sizeof(cl_mem), &tmp->_clData); |
1558 |
clSetKernelArg(kernelv, 1, sizeof(cl_mem), &dst->_clData); |
1559 |
clSetKernelArg(kernelv, 2, sizeof(cl_int), &w_); |
1560 |
clSetKernelArg(kernelv, 3, sizeof(cl_int), &h_); |
1561 |
clSetKernelArg(kernelv, 4, sizeof(cl_mem), &weight); |
1562 |
|
1563 |
size_t dim10 = 1, dim11 = 128; |
1564 |
size_t gsz2[2] = {(w + dim10 - 1) / dim10 * dim10, (h + dim11 - 1) / dim11 * dim11}, lsz2[2] = {dim10, dim11}; |
1565 |
status = clEnqueueNDRangeKernel(_queue, kernelv, 2, NULL, gsz2, lsz2, 0, NULL, NULL); |
1566 |
CheckErrorCL(status, "ProgramBagCLN::FilterImageV");
|
1567 |
//clReleaseEvent(event);
|
1568 |
} |
1569 |
|
1570 |
void ProgramBagCLN::SampleImageD(CLTexImage *dst, CLTexImage *src, int log_scale) |
1571 |
{ |
1572 |
cl_kernel kernel; |
1573 |
cl_int w = dst->GetImgWidth(), h = dst->GetImgHeight(); |
1574 |
|
1575 |
cl_int fullstep = (1 << log_scale);
|
1576 |
kernel = log_scale == 1? s_sampling->_kernel : s_sampling_k->_kernel;
|
1577 |
clSetKernelArg(kernel, 0, sizeof(cl_mem), &(src->_clData)); |
1578 |
clSetKernelArg(kernel, 1, sizeof(cl_mem), &(dst->_clData)); |
1579 |
clSetKernelArg(kernel, 2, sizeof(cl_int), &(w)); |
1580 |
clSetKernelArg(kernel, 3, sizeof(cl_int), &(h)); |
1581 |
if(log_scale > 1) clSetKernelArg(kernel, 4, sizeof(cl_int), &(fullstep)); |
1582 |
|
1583 |
size_t dim0 = 128, dim1 = 1; |
1584 |
//while( w * h / dim0 / dim1 < 8 && dim1 > 1) dim1 /= 2;
|
1585 |
size_t gsz[2] = {(w + dim0 - 1) / dim0 * dim0, (h + dim1 - 1) / dim1 * dim1}, lsz[2] = {dim0, dim1}; |
1586 |
cl_int status = clEnqueueNDRangeKernel(_queue, kernel, 2, NULL, gsz, lsz, 0, NULL, NULL); |
1587 |
CheckErrorCL(status, "ProgramBagCLN::SampleImageD");
|
1588 |
} |
1589 |
|
1590 |
|
1591 |
#endif
|
1592 |
|