Statistics
| Branch: | Revision:

root / rgbdslam / external / siftgpu / src / SiftGPU / ProgramCU.cu @ 9240aaa3

History | View | Annotate | Download (56.6 KB)

1
////////////////////////////////////////////////////////////////////////////
2
//        File:                ProgramCU.cu
3
//        Author:                Changchang Wu
4
//        Description : implementation of ProgramCU and all CUDA kernels
5
//
6
//        Copyright (c) 2007 University of North Carolina at Chapel Hill
7
//        All Rights Reserved
8
//
9
//        Permission to use, copy, modify and distribute this software and its
10
//        documentation for educational, research and non-profit purposes, without
11
//        fee, and without a written agreement is hereby granted, provided that the
12
//        above copyright notice and the following paragraph appear in all copies.
13
//        
14
//        The University of North Carolina at Chapel Hill make no representations
15
//        about the suitability of this software for any purpose. It is provided
16
//        'as is' without express or implied warranty. 
17
//
18
//        Please send BUG REPORTS to ccwu@cs.unc.edu
19
//
20
////////////////////////////////////////////////////////////////////////////
21

    
22
#if defined(CUDA_SIFTGPU_ENABLED)
23

    
24
#include "GL/glew.h"
25
#include "stdio.h"
26

    
27
#include "CuTexImage.h"
28
#include "ProgramCU.h"
29
#include "GlobalUtil.h"
30

    
31
//----------------------------------------------------------------
32
//Begin SiftGPU setting section.
33
//////////////////////////////////////////////////////////
34
#define IMUL(X,Y) __mul24(X,Y)
35
//#define FDIV(X,Y) ((X)/(Y))
36
#define FDIV(X,Y) __fdividef(X,Y)
37

    
38
/////////////////////////////////////////////////////////
39
//filter kernel width range (don't change this)
40
#define KERNEL_MAX_WIDTH 33
41
#define KERNEL_MIN_WIDTH 5
42

    
43
//////////////////////////////////////////////////////////
44
//horizontal filter block size (32, 64, 128, 256, 512)
45
#define FILTERH_TILE_WIDTH 128
46
//thread block for vertical filter. FILTERV_BLOCK_WIDTH can be (4, 8 or 16)
47
#define FILTERV_BLOCK_WIDTH 16
48
#define FILTERV_BLOCK_HEIGHT 32
49
//The corresponding image patch for a thread block
50
#define FILTERV_PIXEL_PER_THREAD 4
51
#define FILTERV_TILE_WIDTH FILTERV_BLOCK_WIDTH
52
#define FILTERV_TILE_HEIGHT (FILTERV_PIXEL_PER_THREAD * FILTERV_BLOCK_HEIGHT)
53

    
54

    
55
//////////////////////////////////////////////////////////
56
//thread block size for computing Difference of Gaussian
57
#define DOG_BLOCK_LOG_DIMX 7
58
#define DOG_BLOCK_LOG_DIMY 0
59
#define DOG_BLOCK_DIMX (1 << DOG_BLOCK_LOG_DIMX)
60
#define DOG_BLOCK_DIMY (1 << DOG_BLOCK_LOG_DIMY)
61

    
62
//////////////////////////////////////////////////////////
63
//thread block size for keypoint detection
64
#define KEY_BLOCK_LOG_DIMX 3
65
#define KEY_BLOCK_LOG_DIMY 3
66
#define KEY_BLOCK_DIMX (1<<KEY_BLOCK_LOG_DIMX)
67
#define KEY_BLOCK_DIMY (1<<KEY_BLOCK_LOG_DIMY)
68
//#define KEY_OFFSET_ONE
69
//make KEY_BLOCK_LOG_DIMX 4 will make the write coalesced..
70
//but it seems uncoalesced writes don't affect the speed
71

    
72
//////////////////////////////////////////////////////////
73
//thread block size for initializing list generation (64, 128, 256, 512 ...)
74
#define HIST_INIT_WIDTH 128
75
//thread block size for generating feature list (32, 64, 128, 256, 512, ...)
76
#define LISTGEN_BLOCK_DIM 128
77

    
78

    
79
/////////////////////////////////////////////////////////
80
//how many keypoint orientations to compute in a block
81
#define ORIENTATION_COMPUTE_PER_BLOCK 64
82
//how many keypoint descriptor to compute in a block (2, 4, 8, 16, 32)
83
#define DESCRIPTOR_COMPUTE_PER_BLOCK        4
84
#define DESCRIPTOR_COMPUTE_BLOCK_SIZE        (16 * DESCRIPTOR_COMPUTE_PER_BLOCK)
85
//how many keypoint descriptor to normalized in a block (32, ...)
86
#define DESCRIPTOR_NORMALIZ_PER_BLOCK        32
87

    
88

    
89

    
90
///////////////////////////////////////////
91
//Thread block size for visualization 
92
//(This doesn't affect the speed of computation)
93
#define BLOCK_LOG_DIM 4
94
#define BLOCK_DIM (1 << BLOCK_LOG_DIM)
95

    
96
//End SiftGPU setting section.
97
//----------------------------------------------------------------
98

    
99

    
100
__device__ __constant__ float d_kernel[KERNEL_MAX_WIDTH];
101
texture<float, 1, cudaReadModeElementType> texData;
102
texture<unsigned char, 1, cudaReadModeNormalizedFloat> texDataB;
103
texture<float2, 2, cudaReadModeElementType> texDataF2;
104
texture<float4, 1, cudaReadModeElementType> texDataF4;
105
texture<int4, 1, cudaReadModeElementType> texDataI4;
106
texture<int4, 1, cudaReadModeElementType> texDataList;
107

    
108
//template<int i>         __device__ float Conv(float *data)                {    return Conv<i-1>(data) + data[i]*d_kernel[i];}
109
//template<>                __device__ float Conv<0>(float *data)        {    return data[0] * d_kernel[0];                                        }
110

    
111
  
112
//////////////////////////////////////////////////////////////
113
template<int FW> __global__ void FilterH( float* d_result, int width)
114
{
115

    
116
        const int HALF_WIDTH = FW >> 1;
117
        const int CACHE_WIDTH = FILTERH_TILE_WIDTH + FW -1;
118
        const int CACHE_COUNT = 2 + (CACHE_WIDTH - 2)/ FILTERH_TILE_WIDTH;
119
        __shared__ float data[CACHE_WIDTH];
120
        const int bcol = IMUL(blockIdx.x, FILTERH_TILE_WIDTH);
121
        const int col =  bcol + threadIdx.x;
122
        const int index_min = IMUL(blockIdx.y, width);
123
        const int index_max = index_min + width - 1;
124
        int src_index = index_min + bcol - HALF_WIDTH + threadIdx.x;
125
        int cache_index = threadIdx.x;
126
        float value = 0;
127
#pragma unroll
128
        for(int j = 0; j < CACHE_COUNT; ++j)
129
        {
130
                if(cache_index < CACHE_WIDTH)
131
                {
132
                        int fetch_index = src_index < index_min? index_min : (src_index > index_max ? index_max : src_index);
133
                        data[cache_index] = tex1Dfetch(texData,fetch_index);
134
                        src_index += FILTERH_TILE_WIDTH;
135
                        cache_index += FILTERH_TILE_WIDTH;
136
                }
137
        }
138
        __syncthreads(); 
139
        if(col >= width) return;
140
#pragma unroll
141
        for(int i = 0; i < FW; ++i)
142
        {
143
                value += (data[threadIdx.x + i]* d_kernel[i]);
144
        }
145
//        value = Conv<FW-1>(data + threadIdx.x);
146
        d_result[index_min + col] = value;
147
}
148

    
149

    
150

    
151
////////////////////////////////////////////////////////////////////
152
template<int  FW>  __global__ void FilterV(float* d_result, int width, int height)
153
{
154
        const int HALF_WIDTH = FW >> 1;
155
        const int CACHE_WIDTH = FW + FILTERV_TILE_HEIGHT - 1;
156
        const int TEMP = CACHE_WIDTH & 0xf;
157
//add some extra space to avoid bank conflict
158
#if FILTERV_TILE_WIDTH == 16
159
        //make the stride 16 * n +/- 1
160
        const int EXTRA = (TEMP == 1 || TEMP == 0) ? 1 - TEMP : 15 - TEMP;
161
#elif FILTERV_TILE_WIDTH == 8
162
        //make the stride 16 * n +/- 2
163
        const int EXTRA = (TEMP == 2 || TEMP == 1 || TEMP == 0) ? 2 - TEMP : (TEMP == 15? 3 : 14 - TEMP);
164
#elif FILTERV_TILE_WIDTH == 4
165
        //make the stride 16 * n +/- 4
166
        const int EXTRA = (TEMP >=0 && TEMP <=4) ? 4 - TEMP : (TEMP > 12? 20 - TEMP : 12 - TEMP);
167
#else
168
#error
169
#endif
170
        const int CACHE_TRUE_WIDTH = CACHE_WIDTH + EXTRA;
171
        const int CACHE_COUNT = (CACHE_WIDTH + FILTERV_BLOCK_HEIGHT - 1) / FILTERV_BLOCK_HEIGHT;
172
        const int WRITE_COUNT = (FILTERV_TILE_HEIGHT + FILTERV_BLOCK_HEIGHT -1) / FILTERV_BLOCK_HEIGHT;
173
        __shared__ float data[CACHE_TRUE_WIDTH * FILTERV_TILE_WIDTH];
174
        const int row_block_first = IMUL(blockIdx.y, FILTERV_TILE_HEIGHT);
175
        const int col = IMUL(blockIdx.x, FILTERV_TILE_WIDTH) + threadIdx.x;
176
        const int row_first = row_block_first - HALF_WIDTH;
177
        const int data_index_max = IMUL(height - 1, width) + col;
178
        const int cache_col_start = threadIdx.y;        
179
        const int cache_row_start = IMUL(threadIdx.x, CACHE_TRUE_WIDTH);
180
        int cache_index = cache_col_start + cache_row_start;
181
        int data_index = IMUL(row_first + cache_col_start, width) + col;
182

    
183
        if(col < width) 
184
        {
185
#pragma unroll
186
                for(int i = 0; i < CACHE_COUNT; ++i)
187
                {
188
                        if(cache_col_start < CACHE_WIDTH - i * FILTERV_BLOCK_HEIGHT) 
189
                        {
190
                                int fetch_index = data_index < col ? col : (data_index > data_index_max? data_index_max : data_index);
191
                                data[cache_index + i * FILTERV_BLOCK_HEIGHT] = tex1Dfetch(texData,fetch_index);
192
                                data_index += IMUL(FILTERV_BLOCK_HEIGHT, width);
193
                        }
194
                }
195
        }
196
        __syncthreads();
197
        
198
        if(col >= width) return;
199

    
200
        int row = row_block_first + threadIdx.y;
201
        int index_start = cache_row_start + threadIdx.y;
202
#pragma unroll
203
        for(int i = 0; i < WRITE_COUNT;                ++i, 
204
                        row += FILTERV_BLOCK_HEIGHT, index_start += FILTERV_BLOCK_HEIGHT)
205
        {
206
                if(row < height)
207
                {
208
                        int index_dest = IMUL(row, width) + col;
209
                        float value = 0;
210
#pragma unroll
211
                        for(int i = 0; i < FW; ++i)
212
                        {
213
                                value += (data[index_start + i] * d_kernel[i]);
214
                        }
215
                        d_result[index_dest] = value;
216
                }
217
        }
218
}
219

    
220

    
221
template<int LOG_SCALE> __global__ void UpsampleKernel(float* d_result, int width)
222
{
223
        const int SCALE = (1 << LOG_SCALE), SCALE_MASK = (SCALE - 1);
224
        const float INV_SCALE = 1.0f / (float(SCALE));
225
        int col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
226
        if(col >= width) return;
227

    
228
        int row = blockIdx.y >> LOG_SCALE; 
229
        int index = row * width + col;
230
        int dst_row = blockIdx.y;
231
        int dst_idx= (width * dst_row + col) * SCALE;
232
        int helper = blockIdx.y & SCALE_MASK; 
233
        if (helper)
234
        {
235
                float v11 = tex1Dfetch(texData, index);
236
                float v12 = tex1Dfetch(texData, index + 1);
237
                index += width;
238
                float v21 = tex1Dfetch(texData, index);
239
                float v22 = tex1Dfetch(texData, index + 1);
240
                float w1 = INV_SCALE * helper, w2 = 1.0 - w1;
241
                float v1 = (v21 * w1  + w2 * v11);
242
                float v2 = (v22 * w1  + w2 * v12);
243
                d_result[dst_idx] = v1;
244
#pragma unroll
245
                for(int i = 1; i < SCALE; ++i)
246
                {
247
                        const float r2 = i * INV_SCALE;
248
                        const float r1 = 1.0f - r2; 
249
                        d_result[dst_idx +i] = v1 * r1 + v2 * r2;
250
                }
251
        }else
252
        {
253
                float v1 = tex1Dfetch(texData, index);
254
                float v2 = tex1Dfetch(texData, index + 1);
255
                d_result[dst_idx] = v1;
256
#pragma unroll
257
                for(int i = 1; i < SCALE; ++i)
258
                {
259
                        const float r2 = i * INV_SCALE;
260
                        const float r1 = 1.0f - r2; 
261
                        d_result[dst_idx +i] = v1 * r1 + v2 * r2;
262
                }
263
        }
264

    
265
}
266

    
267
////////////////////////////////////////////////////////////////////////////////////////
268
void ProgramCU::SampleImageU(CuTexImage *dst, CuTexImage *src, int log_scale)
269
{
270
        int width = src->GetImgWidth(), height = src->GetImgHeight();
271
        src->BindTexture(texData);
272
        dim3 grid((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height << log_scale);
273
        dim3 block(FILTERH_TILE_WIDTH);
274
        switch(log_scale)
275
        {
276
        case 1 :         UpsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, width);        break;
277
        case 2 :         UpsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, width);        break;
278
        case 3 :         UpsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, width);        break;
279
        default:        break;
280
        }
281
}
282

    
283
template<int LOG_SCALE> __global__ void DownsampleKernel(float* d_result, int src_width, int dst_width)
284
{
285
        const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
286
        if(dst_col >= dst_width) return;
287
        const int src_col = min((dst_col << LOG_SCALE), (src_width - 1));
288
        const int dst_row = blockIdx.y; 
289
        const int src_row = blockIdx.y << LOG_SCALE;
290
        const int src_idx = IMUL(src_row, src_width) + src_col;
291
        const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
292
        d_result[dst_idx] = tex1Dfetch(texData, src_idx);
293

    
294
}
295

    
296
__global__ void DownsampleKernel(float* d_result, int src_width, int dst_width, const int log_scale)
297
{
298
        const int dst_col = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
299
        if(dst_col >= dst_width) return;
300
        const int src_col = min((dst_col << log_scale), (src_width - 1));
301
        const int dst_row = blockIdx.y; 
302
        const int src_row = blockIdx.y << log_scale;
303
        const int src_idx = IMUL(src_row, src_width) + src_col;
304
        const int dst_idx = IMUL(dst_width, dst_row) + dst_col;
305
        d_result[dst_idx] = tex1Dfetch(texData, src_idx);
306

    
307
}
308

    
309
void ProgramCU::SampleImageD(CuTexImage *dst, CuTexImage *src, int log_scale)
310
{
311
        int src_width = src->GetImgWidth(), dst_width = dst->GetImgWidth() ;
312

    
313
        src->BindTexture(texData);
314
        dim3 grid((dst_width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, dst->GetImgHeight());
315
        dim3 block(FILTERH_TILE_WIDTH);
316
        switch(log_scale)
317
        {
318
        case 1 :         DownsampleKernel<1> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;
319
        case 2 :        DownsampleKernel<2> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;
320
        case 3 :         DownsampleKernel<3> <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width);        break;
321
        default:        DownsampleKernel    <<< grid, block>>> ((float*) dst->_cuData, src_width, dst_width, log_scale);
322
        }
323
}
324

    
325
__global__ void ChannelReduce_Kernel(float* d_result)
326
{
327
        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
328
        d_result[index] = tex1Dfetch(texData, index*4);
329
}
330

    
331
__global__ void ChannelReduce_Convert_Kernel(float* d_result)
332
{
333
        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
334
        float4 rgba = tex1Dfetch(texDataF4, index);
335
        d_result[index] = 0.299f * rgba.x + 0.587f* rgba.y + 0.114f * rgba.z;
336
}
337

    
338
void ProgramCU::ReduceToSingleChannel(CuTexImage* dst, CuTexImage* src, int convert_rgb)
339
{
340
        int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
341

    
342
        dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
343
        dim3 block(FILTERH_TILE_WIDTH);
344
        if(convert_rgb)
345
        {
346
                src->BindTexture(texDataF4);
347
                ChannelReduce_Convert_Kernel<<<grid, block>>>((float*)dst->_cuData);
348
        }else
349
        {
350
                src->BindTexture(texData);
351
                ChannelReduce_Kernel<<<grid, block>>>((float*)dst->_cuData);
352
        }
353
}
354

    
355
__global__ void ConvertByteToFloat_Kernel(float* d_result)
356
{
357
        int index = IMUL(blockIdx.x, FILTERH_TILE_WIDTH) + threadIdx.x;
358
        d_result[index] = tex1Dfetch(texDataB, index);
359
}
360

    
361
void ProgramCU::ConvertByteToFloat(CuTexImage*src, CuTexImage* dst)
362
{
363
        int width = src->GetImgWidth(), height = dst->GetImgHeight() ;
364
        dim3 grid((width * height +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH);
365
        dim3 block(FILTERH_TILE_WIDTH);
366
        src->BindTexture(texDataB);
367
        ConvertByteToFloat_Kernel<<<grid, block>>>((float*)dst->_cuData);
368
}
369

    
370
void ProgramCU::CreateFilterKernel(float sigma, float* kernel, int& width)
371
{
372
        int i, sz = int( ceil( GlobalUtil::_FilterWidthFactor * sigma -0.5) ) ;//
373
        width = 2*sz + 1;
374

    
375
        if(width > KERNEL_MAX_WIDTH)
376
        {
377
                //filter size truncation
378
                sz = KERNEL_MAX_WIDTH >> 1;
379
                width =KERNEL_MAX_WIDTH;
380
        }else if(width < KERNEL_MIN_WIDTH)
381
        {
382
                sz = KERNEL_MIN_WIDTH >> 1;
383
                width =KERNEL_MIN_WIDTH;
384
        }
385

    
386
        float   rv = 1.0f/(sigma*sigma), v, ksum =0; 
387

    
388
        // pre-compute filter
389
        for( i = -sz ; i <= sz ; ++i) 
390
        {
391
                kernel[i+sz] =  v = exp(-0.5f * i * i *rv) ;
392
                ksum += v;
393
        }
394

    
395
        //normalize the kernel
396
        rv = 1.0f/ksum;
397
        for(i = 0; i< width ;i++) kernel[i]*=rv;
398
}
399

    
400

    
401
template<int FW> void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf)
402
{
403
        int width = src->GetImgWidth(), height = src->GetImgHeight();
404

    
405
        //horizontal filtering
406
        src->BindTexture(texData);
407
        dim3 gridh((width +  FILTERH_TILE_WIDTH - 1)/ FILTERH_TILE_WIDTH, height);
408
        dim3 blockh(FILTERH_TILE_WIDTH);
409
        FilterH<FW><<<gridh, blockh>>>((float*)buf->_cuData, width);
410
        CheckErrorCUDA("FilterH");
411

    
412
        ///vertical filtering
413
        buf->BindTexture(texData);
414
        dim3 gridv((width + FILTERV_TILE_WIDTH - 1)/ FILTERV_TILE_WIDTH,  (height + FILTERV_TILE_HEIGHT - 1)/FILTERV_TILE_HEIGHT);
415
        dim3 blockv(FILTERV_TILE_WIDTH, FILTERV_BLOCK_HEIGHT);
416
        FilterV<FW><<<gridv, blockv>>>((float*)dst->_cuData, width, height); 
417
        CheckErrorCUDA("FilterV");
418
}
419

    
420
//////////////////////////////////////////////////////////////////////
421
// tested on 2048x1500 image, the time on pyramid construction is
422
// OpenGL version : 18ms
423
// CUDA version: 28 ms
424
void ProgramCU::FilterImage(CuTexImage *dst, CuTexImage *src, CuTexImage* buf, float sigma)
425
{
426
        float filter_kernel[KERNEL_MAX_WIDTH]; int width;
427
        CreateFilterKernel(sigma, filter_kernel, width);
428
        cudaMemcpyToSymbol(d_kernel, filter_kernel, width * sizeof(float), 0, cudaMemcpyHostToDevice);
429

    
430
        switch(width)
431
        {
432
                case 5:                FilterImage< 5>(dst, src, buf);        break;
433
                case 7:                FilterImage< 7>(dst, src, buf);        break;
434
                case 9:                FilterImage< 9>(dst, src, buf);        break;
435
                case 11:        FilterImage<11>(dst, src, buf);        break;
436
                case 13:        FilterImage<13>(dst, src, buf);        break;
437
                case 15:        FilterImage<15>(dst, src, buf);        break;
438
                case 17:        FilterImage<17>(dst, src, buf);        break;
439
                case 19:        FilterImage<19>(dst, src, buf);        break;
440
                case 21:        FilterImage<21>(dst, src, buf);        break;
441
                case 23:        FilterImage<23>(dst, src, buf);        break;
442
                case 25:        FilterImage<25>(dst, src, buf);        break;
443
                case 27:        FilterImage<27>(dst, src, buf);        break;
444
                case 29:        FilterImage<29>(dst, src, buf);        break;
445
                case 31:        FilterImage<31>(dst, src, buf);        break;
446
                case 33:        FilterImage<33>(dst, src, buf);        break;
447
                default:        break;
448
        }
449

    
450
}
451

    
452

    
453
texture<float, 1, cudaReadModeElementType> texC;
454
texture<float, 1, cudaReadModeElementType> texP;
455
texture<float, 1, cudaReadModeElementType> texN;
456

    
457
void __global__ ComputeDOG_Kernel(float* d_dog, float2* d_got, int width, int height)
458
{
459
        int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
460
        int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
461
        if(col < width && row < height) 
462
        {
463
                int index = IMUL(row, width) + col;
464
                float vp = tex1Dfetch(texP, index);
465
                float v = tex1Dfetch(texC, index);
466
                d_dog[index] = v - vp;
467
                float vxn = tex1Dfetch(texC, index + 1);
468
                float vxp = tex1Dfetch(texC, index - 1);
469
                float vyp = tex1Dfetch(texC, index - width);
470
                float vyn = tex1Dfetch(texC, index + width);
471
                float dx = vxn - vxp, dy = vyn - vyp;
472
                float grd = 0.5f * sqrt(dx * dx  + dy * dy);
473
                float rot = (grd == 0.0f? 0.0f : atan2(dy, dx));
474
                d_got[index] = make_float2(grd, rot);
475
        }
476
}
477

    
478
void __global__ ComputeDOG_Kernel(float* d_dog, int width, int height)
479
{
480
        int row = (blockIdx.y << DOG_BLOCK_LOG_DIMY) + threadIdx.y;
481
        int col = (blockIdx.x << DOG_BLOCK_LOG_DIMX) + threadIdx.x;
482
        if(col < width && row < height) 
483
        {
484
                int index = IMUL(row, width) + col;
485
                float vp = tex1Dfetch(texP, index);
486
                float v = tex1Dfetch(texC, index);
487
                d_dog[index] = v - vp;
488
        }
489
}
490

    
491
void ProgramCU::ComputeDOG(CuTexImage* gus, CuTexImage* dog, CuTexImage* got)
492
{
493
        int width = gus->GetImgWidth(), height = gus->GetImgHeight();
494
        dim3 grid((width + DOG_BLOCK_DIMX - 1)/ DOG_BLOCK_DIMX,  (height + DOG_BLOCK_DIMY - 1)/DOG_BLOCK_DIMY);
495
        dim3 block(DOG_BLOCK_DIMX, DOG_BLOCK_DIMY);
496
        gus->BindTexture(texC);
497
        (gus -1)->BindTexture(texP);
498
        if(got->_cuData)
499
                ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, (float2*) got->_cuData, width, height);
500
        else
501
                ComputeDOG_Kernel<<<grid, block>>>((float*) dog->_cuData, width, height);
502
}
503

    
504

    
505
#define READ_CMP_DOG_DATA(datai, tex, idx) \
506
                datai[0] = tex1Dfetch(tex, idx - 1);\
507
                datai[1] = tex1Dfetch(tex, idx);\
508
                datai[2] = tex1Dfetch(tex, idx + 1);\
509
                if(v > nmax)\
510
                {\
511
                           nmax = max(nmax, datai[0]);\
512
                           nmax = max(nmax, datai[1]);\
513
                           nmax = max(nmax, datai[2]);\
514
                           if(v < nmax) goto key_finish;\
515
                }else\
516
                {\
517
                           nmin = min(nmin, datai[0]);\
518
                           nmin = min(nmin, datai[1]);\
519
                           nmin = min(nmin, datai[2]);\
520
                           if(v > nmin) goto key_finish;\
521
                }
522

    
523

    
524
void __global__ ComputeKEY_Kernel(float4* d_key, int width, int colmax, int rowmax, 
525
                                        float dog_threshold0,  float dog_threshold, float edge_threshold, int subpixel_localization)
526
{
527
       float data[3][3], v;
528
       float datap[3][3], datan[3][3];
529
#ifdef KEY_OFFSET_ONE
530
       int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y + 1;
531
       int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x + 1;
532
#else
533
       int row = (blockIdx.y << KEY_BLOCK_LOG_DIMY) + threadIdx.y;
534
       int col = (blockIdx.x << KEY_BLOCK_LOG_DIMX) + threadIdx.x;
535
#endif
536
       int index = IMUL(row, width) + col;
537
           int idx[3] ={index - width, index, index + width};
538
       int in_image =0;
539
       float nmax, nmin, result = 0.0f;
540
           float dx = 0, dy = 0, ds = 0;
541
           bool offset_test_passed = true;
542
#ifdef KEY_OFFSET_ONE
543
       if(row < rowmax && col < colmax)
544
#else
545
       if(row > 0 && col > 0 && row < rowmax && col < colmax)
546
#endif
547
       {
548
                        in_image = 1;
549
                        data[1][1] = v = tex1Dfetch(texC, idx[1]);
550
                        if(fabs(v) <= dog_threshold0) goto key_finish;
551

    
552
                        data[1][0] = tex1Dfetch(texC, idx[1] - 1);
553
                        data[1][2] = tex1Dfetch(texC, idx[1] + 1);
554
                        nmax = max(data[1][0], data[1][2]);
555
                        nmin = min(data[1][0], data[1][2]);
556

    
557
                        if(v <=nmax && v >= nmin) goto key_finish;
558
                        //if((v > nmax && v < 0 )|| (v < nmin && v > 0)) goto key_finish;
559
                        READ_CMP_DOG_DATA(data[0], texC, idx[0]);
560
                        READ_CMP_DOG_DATA(data[2], texC, idx[2]);
561

    
562
                        //edge supression
563
                        float vx2 = v * 2.0f;
564
                        float fxx = data[1][0] + data[1][2] - vx2;
565
                        float fyy = data[0][1] + data[2][1] - vx2;
566
                        float fxy = 0.25f * (data[2][2] + data[0][0] - data[2][0] - data[0][2]);
567
                        float temp1 = fxx * fyy - fxy * fxy;
568
                        float temp2 = (fxx + fyy) * (fxx + fyy);
569
                        if(temp1 <=0 || temp2 > edge_threshold * temp1) goto key_finish;
570

    
571

    
572
                        //read the previous level
573
                        READ_CMP_DOG_DATA(datap[0], texP, idx[0]);
574
                        READ_CMP_DOG_DATA(datap[1], texP, idx[1]);
575
                        READ_CMP_DOG_DATA(datap[2], texP, idx[2]);
576

    
577

    
578
                        //read the next level
579
                        READ_CMP_DOG_DATA(datan[0], texN, idx[0]);
580
                        READ_CMP_DOG_DATA(datan[1], texN, idx[1]);
581
                        READ_CMP_DOG_DATA(datan[2], texN, idx[2]);
582
                        
583
                        if(subpixel_localization)
584
                        {
585
                                //subpixel localization
586
                                float fx = 0.5f * (data[1][2] - data[1][0]);
587
                                float fy = 0.5f * (data[2][1] - data[0][1]);
588
                                float fs = 0.5f * (datan[1][1] - datap[1][1]);
589

    
590
                                float fss = (datan[1][1] + datap[1][1] - vx2);
591
                                float fxs = 0.25f* (datan[1][2] + datap[1][0] - datan[1][0] - datap[1][2]);
592
                                float fys = 0.25f* (datan[2][1] + datap[0][1] - datan[0][1] - datap[2][1]);
593

    
594
                                //need to solve dx, dy, ds;
595
                                // |-fx|     | fxx fxy fxs |   |dx|
596
                                // |-fy|  =  | fxy fyy fys | * |dy|
597
                                // |-fs|     | fxs fys fss |   |ds|
598
                                float4 A0 = fxx > 0? make_float4(fxx, fxy, fxs, -fx) : make_float4(-fxx, -fxy, -fxs, fx);
599
                                float4 A1 = fxy > 0? make_float4(fxy, fyy, fys, -fy) : make_float4(-fxy, -fyy, -fys, fy);
600
                                float4 A2 = fxs > 0? make_float4(fxs, fys, fss, -fs) : make_float4(-fxs, -fys, -fss, fs);
601
                                float maxa = max(max(A0.x, A1.x), A2.x);
602
                                if(maxa >= 1e-10)
603
                                {
604
                                        if(maxa == A1.x)
605
                                        {
606
                                                float4 TEMP = A1; A1 = A0; A0 = TEMP;
607
                                        }else if(maxa == A2.x)
608
                                        {
609
                                                float4 TEMP = A2; A2 = A0; A0 = TEMP;
610
                                        }
611
                                        A0.y /= A0.x;        A0.z /= A0.x;        A0.w/= A0.x;
612
                                        A1.y -= A1.x * A0.y;        A1.z -= A1.x * A0.z;        A1.w -= A1.x * A0.w;
613
                                        A2.y -= A2.x * A0.y;        A2.z -= A2.x * A0.z;        A2.w -= A2.x * A0.w;
614
                                        if(abs(A2.y) > abs(A1.y))
615
                                        {
616
                                                float4 TEMP = A2;        A2 = A1; A1 = TEMP;
617
                                        }
618
                                        if(abs(A1.y) >= 1e-10) 
619
                                        {
620
                                                A1.z /= A1.y;        A1.w /= A1.y;
621
                                                A2.z -= A2.y * A1.z;        A2.w -= A2.y * A1.w;
622
                                                if(abs(A2.z) >= 1e-10) 
623
                                                {
624
                                                        ds = A2.w / A2.z;
625
                                                        dy = A1.w - ds * A1.z;
626
                                                        dx = A0.w - ds * A0.z - dy * A0.y;
627

    
628
                                                        offset_test_passed = 
629
                                                                fabs(data[1][1] + 0.5f * (dx * fx + dy * fy + ds * fs)) > dog_threshold
630
                                                                &&fabs(ds) < 1.0f && fabs(dx) < 1.0f && fabs(dy) < 1.0f;
631
                                                }
632
                                        }
633
                                }
634
                        }
635
                        if(offset_test_passed) result = v > nmax ? 1.0 : -1.0;
636
       }
637
key_finish:
638
       if(in_image) d_key[index] = make_float4(result, dx, dy, ds);
639
}
640

    
641

    
642
void ProgramCU::ComputeKEY(CuTexImage* dog, CuTexImage* key, float Tdog, float Tedge)
643
{
644
        int width = dog->GetImgWidth(), height = dog->GetImgHeight();
645
        float Tdog1 = (GlobalUtil::_SubpixelLocalization? 0.8f : 1.0f) * Tdog;
646
        CuTexImage* dogp = dog - 1;
647
        CuTexImage* dogn = dog + 1;
648
#ifdef KEY_OFFSET_ONE
649
        dim3 grid((width - 1 + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height - 1 + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
650
#else
651
        dim3 grid((width + KEY_BLOCK_DIMX - 1)/ KEY_BLOCK_DIMX,  (height + KEY_BLOCK_DIMY - 1)/KEY_BLOCK_DIMY);
652
#endif
653
        dim3 block(KEY_BLOCK_DIMX, KEY_BLOCK_DIMY);
654
        dogp->BindTexture(texP);
655
        dog ->BindTexture(texC);
656
        dogn->BindTexture(texN);
657
        Tedge = (Tedge+1)*(Tedge+1)/Tedge;
658
        ComputeKEY_Kernel<<<grid, block>>>((float4*) key->_cuData, width,
659
        width -1, height -1, Tdog1, Tdog, Tedge, GlobalUtil::_SubpixelLocalization);
660

    
661
}
662

    
663

    
664

    
665
void __global__ InitHist_Kernel(int4* hist, int ws, int wd, int height)
666
{
667
       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
668
       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
669
           if(row < height && col < wd)
670
           {
671
                        int hidx = IMUL(row, wd) + col;
672
                        int scol = col << 2;
673
                        int sidx = IMUL(row, ws) + scol;
674
                        int v[4] = {0, 0, 0, 0}; 
675
                        if(row > 0 && row < height -1)
676
                        {
677
#pragma unroll
678
                                for(int i = 0; i < 4 ; ++i, ++scol)
679
                                {
680
                                        float4 temp = tex1Dfetch(texDataF4, sidx +i);
681
                                        v[i] = (scol < ws -1 && scol > 0 && temp.x!=0) ? 1 : 0;
682
                                }
683
                        }
684
                        hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
685

    
686
           }
687
}
688

    
689

    
690

    
691
void ProgramCU::InitHistogram(CuTexImage* key, CuTexImage* hist)
692
{
693
        int ws = key->GetImgWidth(), hs = key->GetImgHeight();
694
        int wd = hist->GetImgWidth(), hd = hist->GetImgHeight();
695
        dim3 grid((wd  + HIST_INIT_WIDTH - 1)/ HIST_INIT_WIDTH,  hd);
696
        dim3 block(HIST_INIT_WIDTH, 1);
697
        key->BindTexture(texDataF4);
698
        InitHist_Kernel<<<grid, block>>>((int4*) hist->_cuData, ws, wd, hd);
699
}
700

    
701

    
702

    
703
void __global__ ReduceHist_Kernel(int4* d_hist, int ws, int wd, int height)
704
{
705
       int row = IMUL(blockIdx.y, blockDim.y) + threadIdx.y;
706
       int col = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
707
           if(row < height && col < wd)
708
           {
709
                        int hidx = IMUL(row, wd) + col;
710
                        int scol = col << 2;
711
                        int sidx = IMUL(row, ws) + scol;
712
                        int v[4] = {0, 0, 0, 0}; 
713
#pragma unroll
714
                        for(int i = 0; i < 4 && scol < ws; ++i, ++scol)
715
                        {
716
                                int4 temp = tex1Dfetch(texDataI4, sidx + i);
717
                                v[i] = temp.x + temp.y + temp.z + temp.w;
718
                        }
719
                        d_hist[hidx] = make_int4(v[0], v[1], v[2], v[3]);
720
           }
721
}
722

    
723
void ProgramCU::ReduceHistogram(CuTexImage*hist1, CuTexImage* hist2)
724
{
725
        int ws = hist1->GetImgWidth(), hs = hist1->GetImgHeight();
726
        int wd = hist2->GetImgWidth(), hd = hist2->GetImgHeight();
727
        int temp = (int)floor(logf(float(wd * 2/ 3)) / logf(2.0f));
728
        const int wi = min(7, max(temp , 0));
729
        hist1->BindTexture(texDataI4);
730

    
731
        const int BW = 1 << wi, BH =  1 << (7 - wi);
732
        dim3 grid((wd  + BW - 1)/ BW,  (hd + BH -1) / BH);
733
        dim3 block(BW, BH);
734
        ReduceHist_Kernel<<<grid, block>>>((int4*)hist2->_cuData, ws, wd, hd);
735
}
736

    
737

    
738
void __global__ ListGen_Kernel(int4* d_list, int width)
739
{
740
        int idx1 = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
741
    int4 pos = tex1Dfetch(texDataList, idx1);
742
        int idx2 = IMUL(pos.y, width) + pos.x;
743
        int4 temp = tex1Dfetch(texDataI4, idx2);
744
        int  sum1 = temp.x + temp.y;
745
        int  sum2 = sum1 + temp.z;
746
        pos.x <<= 2;
747
        if(pos.z >= sum2)
748
        {
749
                pos.x += 3;
750
                pos.z -= sum2;
751
        }else if(pos.z >= sum1)
752
        {
753
                pos.x += 2;
754
                pos.z -= sum1;
755
        }else if(pos.z >= temp.x)
756
        {
757
                pos.x += 1;
758
                pos.z -= temp.x;
759
        }
760
        d_list[idx1] = pos;
761
}
762

    
763
//input list (x, y) (x, y) ....
764
void ProgramCU::GenerateList(CuTexImage* list, CuTexImage* hist)
765
{
766
        int len = list->GetImgWidth();
767
        list->BindTexture(texDataList);
768
        hist->BindTexture(texDataI4);
769
        dim3  grid((len + LISTGEN_BLOCK_DIM -1) /LISTGEN_BLOCK_DIM);
770
        dim3  block(LISTGEN_BLOCK_DIM);
771
        ListGen_Kernel<<<grid, block>>>((int4*) list->_cuData, hist->GetImgWidth());
772
}
773

    
774
void __global__ ComputeOrientation_Kernel(float4* d_list, 
775
                                                                                  int list_len,
776
                                                                                  int width, int height, 
777
                                                                                  float sigma, float sigma_step, 
778
                                                                                  float gaussian_factor, float sample_factor,
779
                                                                                  int num_orientation,
780
                                                                                  int existing_keypoint, 
781
                                                                                  int subpixel, 
782
                                                                                  int keepsign)
783
{
784
        const float ten_degree_per_radius = 5.7295779513082320876798154814105;
785
        const float radius_per_ten_degrees = 1.0 / 5.7295779513082320876798154814105;
786
        int idx = IMUL(blockDim.x, blockIdx.x) + threadIdx.x;
787
        if(idx >= list_len) return;
788
        float4 key; 
789
        if(existing_keypoint)
790
        {
791
                key = tex1Dfetch(texDataF4, idx);                
792
        }else 
793
        {
794
                int4 ikey = tex1Dfetch(texDataList, idx);
795
                key.x = ikey.x + 0.5f;
796
                key.y = ikey.y + 0.5f;
797
                key.z = sigma;
798
                if(subpixel || keepsign)
799
                {
800
                        float4 offset = tex1Dfetch(texDataF4, IMUL(width, ikey.y) + ikey.x);
801
                        if(subpixel)
802
                        {
803
                                key.x += offset.y;
804
                                key.y += offset.z;
805
                                key.z *= pow(sigma_step, offset.w);
806
                        }
807
                        if(keepsign) key.z *= offset.x;
808
                }
809
        }
810
        if(num_orientation == 0)
811
        {
812
                key.w = 0;
813
                d_list[idx] = key;
814
                return;
815
        }
816
        float vote[37];
817
        float gsigma = key.z * gaussian_factor;
818
        float win = fabs(key.z) * sample_factor;
819
        float dist_threshold = win * win + 0.5;
820
        float factor = -0.5f / (gsigma * gsigma);
821
        float xmin = max(1.5f, floor(key.x - win) + 0.5f);
822
        float ymin = max(1.5f, floor(key.y - win) + 0.5f);
823
        float xmax = min(width - 1.5f, floor(key.x + win) + 0.5f);
824
        float ymax = min(height -1.5f, floor(key.y + win) + 0.5f);
825
#pragma unroll
826
        for(int i = 0; i < 36; ++i) vote[i] = 0.0f;
827
        for(float y = ymin; y <= ymax; y += 1.0f)
828
        {
829
                for(float x = xmin; x <= xmax; x += 1.0f)
830
                {
831
                        float dx = x - key.x;
832
                        float dy = y - key.y;
833
                        float sq_dist  = dx * dx + dy * dy;
834
                        if(sq_dist >= dist_threshold) continue;
835
                        float2 got = tex2D(texDataF2, x, y);
836
                        float weight = got.x * exp(sq_dist * factor);
837
                        float fidx = floor(got.y * ten_degree_per_radius);
838
                        int oidx = fidx;
839
                        if(oidx < 0) oidx += 36;
840
                        vote[oidx] += weight; 
841
                }
842
        }
843

    
844
        //filter the vote
845

    
846
        const float one_third = 1.0 /3.0;
847
#pragma unroll
848
        for(int i = 0; i < 6; ++i)
849
        {
850
                vote[36] = vote[0];
851
                float pre = vote[35];
852
#pragma unroll
853
                for(int j = 0; j < 36; ++j)
854
                {
855
                        float temp = one_third * (pre + vote[j] + vote[j + 1]);
856
                        pre = vote[j];                        vote[j] = temp;
857
                }
858
        }
859

    
860
        vote[36] = vote[0];
861
        if(num_orientation == 1 || existing_keypoint)
862
        {
863
                int index_max = 0;
864
                float max_vote = vote[0];
865
#pragma unroll
866
                for(int i = 1; i < 36; ++i)
867
                {
868
                        index_max =  vote[i] > max_vote? i : index_max;
869
                        max_vote = max(max_vote, vote[i]);
870
                }
871
                float pre = vote[index_max == 0? 35 : index_max -1];
872
                float next = vote[index_max + 1];
873
                float weight = max_vote;
874
                float off =  0.5f * FDIV(next - pre, weight + weight - next - pre);
875
                key.w = radius_per_ten_degrees * (index_max + 0.5f + off);
876
                d_list[idx] = key;
877
        
878
        }else
879
        {
880
                float max_vote = vote[0];
881
#pragma unroll
882
                for(int i = 1; i < 36; ++i)                max_vote = max(max_vote, vote[i]);
883

    
884
                float vote_threshold = max_vote * 0.8f;
885
                float pre = vote[35];
886
                float max_rot[2], max_vot[2] = {0, 0};
887
                int  ocount = 0;
888
#pragma unroll
889
                for(int i =0; i < 36; ++i)
890
                {
891
                        float next = vote[i + 1];
892
                        if(vote[i] > vote_threshold && vote[i] > pre && vote[i] > next)
893
                        {
894
                                float di = 0.5f * FDIV(next - pre, vote[i] + vote[i] - next - pre);
895
                                float rot = i + di + 0.5f;
896
                                float weight = vote[i];
897
                                ///
898
                                if(weight > max_vot[1])
899
                                {
900
                                        if(weight > max_vot[0])
901
                                        {
902
                                                max_vot[1] = max_vot[0];
903
                                                max_rot[1] = max_rot[0];
904
                                                max_vot[0] = weight;
905
                                                max_rot[0] = rot;
906
                                        }
907
                                        else
908
                                        {
909
                                                max_vot[1] = weight;
910
                                                max_rot[1] = rot;
911
                                        }
912
                                        ocount ++;
913
                                }
914
                        }
915
                        pre = vote[i];
916
                }
917
                float fr1 = max_rot[0] / 36.0f; 
918
                if(fr1 < 0) fr1 += 1.0f; 
919
                unsigned short us1 = ocount == 0? 65535 : ((unsigned short )floor(fr1 * 65535.0f));
920
                unsigned short us2 = 65535;
921
                if(ocount > 1)
922
                {
923
                        float fr2 = max_rot[1] / 36.0f; 
924
                        if(fr2 < 0) fr2 += 1.0f;
925
                        us2 = (unsigned short ) floor(fr2 * 65535.0f);
926
                }
927
                unsigned int uspack = (us2 << 16) | us1;
928
                key.w = __int_as_float(uspack);
929
                d_list[idx] = key;
930
        }
931

    
932
}
933

    
934

    
935

    
936

    
937
void ProgramCU::ComputeOrientation(CuTexImage* list, CuTexImage* got, CuTexImage*key, 
938
                                                                   float sigma, float sigma_step, int existing_keypoint)
939
{
940
        int len = list->GetImgWidth();
941
        if(len <= 0) return;
942
        int width = got->GetImgWidth(), height = got->GetImgHeight();
943
        if(existing_keypoint)
944
        {
945
                list->BindTexture(texDataF4);
946
        }else
947
        {
948
                list->BindTexture(texDataList);
949
                if(GlobalUtil::_SubpixelLocalization) key->BindTexture(texDataF4);
950
        }
951
        got->BindTexture2D(texDataF2);
952

    
953
        const int block_width = len < ORIENTATION_COMPUTE_PER_BLOCK ? 16 : ORIENTATION_COMPUTE_PER_BLOCK;
954
        dim3 grid((len + block_width -1) / block_width);
955
        dim3 block(block_width);
956

    
957
        ComputeOrientation_Kernel<<<grid, block>>>((float4*) list->_cuData, 
958
                len, width, height, sigma, sigma_step, 
959
                GlobalUtil::_OrientationGaussianFactor, 
960
                GlobalUtil::_OrientationGaussianFactor * GlobalUtil::_OrientationWindowFactor,
961
                GlobalUtil::_FixedOrientation? 0 : GlobalUtil::_MaxOrientation, 
962
                existing_keypoint, GlobalUtil::_SubpixelLocalization, GlobalUtil::_KeepExtremumSign);
963

    
964
        ProgramCU::CheckErrorCUDA("ComputeOrientation");
965
}
966

    
967
template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptor_Kernel(float4* d_des, int num, 
968
                                                                                         int width, int height, float window_factor)
969
{
970
        const float rpi = 4.0/ 3.14159265358979323846;
971
        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
972
        int fidx = idx >> 4;
973
        if(fidx >= num) return;
974
        float4 key = tex1Dfetch(texDataF4, fidx);
975
        int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
976
        float spt = fabs(key.z * window_factor);
977
        float s, c; __sincosf(key.w, &s, &c);
978
        float anglef = key.w > 3.14159265358979323846? key.w - (2.0 * 3.14159265358979323846) : key.w ; 
979
        float cspt = c * spt, sspt = s * spt;
980
        float crspt = c / spt, srspt = s / spt;
981
        float2 offsetpt, pt;
982
        float xmin, ymin, xmax, ymax, bsz;
983
        offsetpt.x = ix - 1.5f;
984
        offsetpt.y = iy - 1.5f;
985
        pt.x = cspt * offsetpt.x - sspt * offsetpt.y + key.x;
986
        pt.y = cspt * offsetpt.y + sspt * offsetpt.x + key.y;
987
        bsz =  fabs(cspt) + fabs(sspt);
988
        xmin = max(1.5f, floor(pt.x - bsz) + 0.5f);
989
        ymin = max(1.5f, floor(pt.y - bsz) + 0.5f);
990
        xmax = min(width - 1.5f, floor(pt.x + bsz) + 0.5f);
991
        ymax = min(height - 1.5f, floor(pt.y + bsz) + 0.5f);
992
        float des[9];
993
#pragma unroll
994
        for(int i =0; i < 9; ++i) des[i] = 0.0f;
995
        for(float y = ymin; y <= ymax; y += 1.0f)
996
        {
997
                for(float x = xmin; x <= xmax; x += 1.0f)
998
                {
999
                        float dx = x - pt.x;
1000
                        float dy = y - pt.y;
1001
                        float nx = crspt * dx + srspt * dy;
1002
                        float ny = crspt * dy - srspt * dx;
1003
                        float nxn = fabs(nx);
1004
                        float nyn = fabs(ny);
1005
                        if(nxn < 1.0f && nyn < 1.0f)
1006
                        {
1007
                                float2 cc = tex2D(texDataF2, x, y);
1008
                                float dnx = nx + offsetpt.x;
1009
                                float dny = ny + offsetpt.y;
1010
                                float ww = exp(-0.125f * (dnx * dnx + dny * dny));
1011
                                float wx = 1.0 - nxn;
1012
                                float wy = 1.0 - nyn;
1013
                                float weight = ww * wx * wy * cc.x;
1014
                                float theta = (anglef - cc.y) * rpi;
1015
                                if(theta < 0) theta += 8.0f;
1016
                                float fo = floor(theta);
1017
                                int fidx = fo;
1018
                                float weight1 = fo + 1.0f  - theta;
1019
                                float weight2 = theta - fo;
1020
                                if(DYNAMIC_INDEXING)
1021
                                {
1022
                                        des[fidx] += (weight1 * weight);
1023
                                        des[fidx + 1] += (weight2 * weight);
1024
                                        //this dynamic indexing part might be slow
1025
                                }else
1026
                                {
1027
                                        #pragma unroll
1028
                                        for(int k = 0; k < 8; ++k)
1029
                                        {
1030
                                                if(k == fidx) 
1031
                                                {
1032
                                                        des[k] += (weight1 * weight);
1033
                                                        des[k+1] += (weight2 * weight);
1034
                                                }
1035
                                        }
1036
                                }
1037
                        }
1038
                }
1039
        }
1040
        des[0] += des[8];
1041

    
1042
        int didx = idx << 1;
1043
        d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1044
        d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1045
}
1046

    
1047

    
1048
template <bool DYNAMIC_INDEXING> void __global__ ComputeDescriptorRECT_Kernel(float4* d_des, int num, 
1049
                                                                                         int width, int height, float window_factor)
1050
{
1051
        const float rpi = 4.0/ 3.14159265358979323846;
1052
        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1053
        int fidx = idx >> 4;
1054
        if(fidx >= num) return;
1055
        float4 key = tex1Dfetch(texDataF4, fidx);
1056
        int bidx = idx& 0xf, ix = bidx & 0x3, iy = bidx >> 2;
1057
    //float aspect_ratio = key.w / key.z;
1058
    //float aspect_sq = aspect_ratio * aspect_ratio;
1059
        float sptx = key.z * 0.25, spty = key.w * 0.25;
1060
        float xmin, ymin, xmax, ymax; float2 pt;
1061
        pt.x = sptx * (ix + 0.5f)  + key.x;
1062
        pt.y = spty * (iy + 0.5f)  + key.y;
1063
        xmin = max(1.5f, floor(pt.x - sptx) + 0.5f);
1064
        ymin = max(1.5f, floor(pt.y - spty) + 0.5f);
1065
        xmax = min(width - 1.5f, floor(pt.x + sptx) + 0.5f);
1066
        ymax = min(height - 1.5f, floor(pt.y + spty) + 0.5f);
1067
        float des[9];
1068
#pragma unroll
1069
        for(int i =0; i < 9; ++i) des[i] = 0.0f;
1070
        for(float y = ymin; y <= ymax; y += 1.0f)
1071
        {
1072
                for(float x = xmin; x <= xmax; x += 1.0f)
1073
                {
1074
                        float nx = (x - pt.x) / sptx;
1075
                        float ny = (y - pt.y) / spty;
1076
                        float nxn = fabs(nx);
1077
                        float nyn = fabs(ny);
1078
                        if(nxn < 1.0f && nyn < 1.0f)
1079
                        {
1080
                                float2 cc = tex2D(texDataF2, x, y);
1081
                                float wx = 1.0 - nxn;
1082
                                float wy = 1.0 - nyn;
1083
                                float weight =  wx * wy * cc.x;
1084
                                float theta = (- cc.y) * rpi;
1085
                                if(theta < 0) theta += 8.0f;
1086
                                float fo = floor(theta);
1087
                                int fidx = fo;
1088
                                float weight1 = fo + 1.0f  - theta;
1089
                                float weight2 = theta - fo;
1090
                                if(DYNAMIC_INDEXING)
1091
                                {
1092
                                        des[fidx] += (weight1 * weight);
1093
                                        des[fidx + 1] += (weight2 * weight);
1094
                                        //this dynamic indexing part might be slow
1095
                                }else
1096
                                {
1097
                                        #pragma unroll
1098
                                        for(int k = 0; k < 8; ++k)
1099
                                        {
1100
                                                if(k == fidx) 
1101
                                                {
1102
                                                        des[k] += (weight1 * weight);
1103
                                                        des[k+1] += (weight2 * weight);
1104
                                                }
1105
                                        }
1106
                                }
1107
                        }
1108
                }
1109
        }
1110
        des[0] += des[8];
1111

    
1112
        int didx = idx << 1;
1113
        d_des[didx] = make_float4(des[0], des[1], des[2], des[3]);
1114
        d_des[didx+1] = make_float4(des[4], des[5], des[6], des[7]);
1115
}
1116

    
1117
void __global__ NormalizeDescriptor_Kernel(float4* d_des, int num)
1118
{
1119
        float4 temp[32];
1120
        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1121
        if(idx >= num) return;
1122
        int sidx = idx << 5;
1123
        float norm1 = 0, norm2 = 0;
1124
#pragma unroll
1125
        for(int i = 0; i < 32; ++i)
1126
        {
1127
                temp[i] = tex1Dfetch(texDataF4, sidx +i);
1128
                norm1 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1129
                                 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1130
        }
1131
        norm1 = rsqrt(norm1);
1132

    
1133
#pragma unroll
1134
        for(int i = 0; i < 32; ++i)
1135
        {
1136
                temp[i].x = min(0.2f, temp[i].x * norm1);
1137
                temp[i].y = min(0.2f, temp[i].y * norm1);
1138
                temp[i].z = min(0.2f, temp[i].z * norm1);
1139
                temp[i].w = min(0.2f, temp[i].w * norm1);
1140
                norm2 += (temp[i].x * temp[i].x + temp[i].y * temp[i].y +
1141
                                 temp[i].z * temp[i].z + temp[i].w * temp[i].w);
1142
        }
1143

    
1144
        norm2 = rsqrt(norm2);
1145
#pragma unroll
1146
        for(int i = 0; i < 32; ++i)
1147
        {
1148
                temp[i].x *= norm2;                temp[i].y *= norm2;
1149
                temp[i].z *= norm2;                temp[i].w *= norm2;
1150
                d_des[sidx + i] = temp[i];
1151
        }
1152
}
1153

    
1154
void ProgramCU::ComputeDescriptor(CuTexImage*list, CuTexImage* got, CuTexImage* dtex, int rect, int stream)
1155
{
1156
        int num = list->GetImgWidth();
1157
        int width = got->GetImgWidth();
1158
        int height = got->GetImgHeight();
1159

    
1160
    dtex->InitTexture(num * 128, 1, 1);
1161
        got->BindTexture2D(texDataF2);
1162
        list->BindTexture(texDataF4);
1163
        int block_width = DESCRIPTOR_COMPUTE_BLOCK_SIZE;
1164
        dim3 grid((num * 16 + block_width -1) / block_width);
1165
        dim3 block(block_width);
1166

    
1167
    if(rect)
1168
    {
1169
            if(GlobalUtil::_UseDynamicIndexing)
1170
                    ComputeDescriptorRECT_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1171
            else
1172
                    ComputeDescriptorRECT_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1173
 
1174
    }else
1175
    {
1176
            if(GlobalUtil::_UseDynamicIndexing)
1177
                    ComputeDescriptor_Kernel<true><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1178
            else
1179
                    ComputeDescriptor_Kernel<false><<<grid, block>>>((float4*) dtex->_cuData, num, width, height, GlobalUtil::_DescriptorWindowFactor);
1180
    }
1181
        if(GlobalUtil::_NormalizedSIFT)
1182
        {
1183
                dtex->BindTexture(texDataF4);
1184
                const int block_width = DESCRIPTOR_NORMALIZ_PER_BLOCK;
1185
                dim3 grid((num + block_width -1) / block_width);
1186
                dim3 block(block_width);
1187
                NormalizeDescriptor_Kernel<<<grid, block>>>((float4*) dtex->_cuData, num);
1188
        }
1189
        CheckErrorCUDA("ComputeDescriptor");
1190
}
1191

    
1192
//////////////////////////////////////////////////////
1193
void ProgramCU::FinishCUDA()
1194
{
1195
        cudaThreadSynchronize();
1196
}
1197

    
1198
int ProgramCU::CheckErrorCUDA(const char* location)
1199
{
1200
        cudaError_t e = cudaGetLastError();
1201
        if(e)
1202
        {
1203
        if(location) fprintf(stderr, "%s:\t",  location);
1204
                fprintf(stderr, "%s\n",  cudaGetErrorString(e));
1205
                //assert(0);
1206
        return 1;
1207
        }else
1208
    {
1209
        return 0; 
1210
    }
1211
}
1212

    
1213
void __global__ ConvertDOG_Kernel(float* d_result, int width, int height)
1214
{
1215
        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1216
        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1217
        if(col < width && row < height)
1218
        {
1219
                int index = row * width  + col;
1220
                float v = tex1Dfetch(texData, index);
1221
                d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1222
                        0.5 : saturate(0.5+20.0*v);
1223
        }
1224
}
1225
///
1226
void ProgramCU::DisplayConvertDOG(CuTexImage* dog, CuTexImage* out)
1227
{
1228
        if(out->_cuData == NULL) return;
1229
        int width = dog->GetImgWidth(), height = dog ->GetImgHeight();
1230
        dog->BindTexture(texData);
1231
        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1232
        dim3 block(BLOCK_DIM, BLOCK_DIM);
1233
        ConvertDOG_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
1234
        ProgramCU::CheckErrorCUDA("DisplayConvertDOG");
1235
}
1236

    
1237
void __global__ ConvertGRD_Kernel(float* d_result, int width, int height)
1238
{
1239
        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1240
        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1241
        if(col < width && row < height)
1242
        {
1243
                int index = row * width  + col;
1244
                float v = tex1Dfetch(texData, index << 1);
1245
                d_result[index] = (col == 0 || row == 0 || col == width -1 || row == height -1)?
1246
                                0 : saturate(5 * v);
1247

    
1248
        }
1249
}
1250

    
1251

    
1252
void ProgramCU::DisplayConvertGRD(CuTexImage* got, CuTexImage* out)
1253
{
1254
        if(out->_cuData == NULL) return;
1255
        int width = got->GetImgWidth(), height = got ->GetImgHeight();
1256
        got->BindTexture(texData);
1257
        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1258
        dim3 block(BLOCK_DIM, BLOCK_DIM);
1259
        ConvertGRD_Kernel<<<grid, block>>>((float*) out->_cuData, width, height);
1260
        ProgramCU::CheckErrorCUDA("DisplayConvertGRD");
1261
}
1262

    
1263
void __global__ ConvertKEY_Kernel(float4* d_result, int width, int height)
1264
{
1265

    
1266
        int row = (blockIdx.y << BLOCK_LOG_DIM) + threadIdx.y;
1267
        int col = (blockIdx.x << BLOCK_LOG_DIM) + threadIdx.x;
1268
        if(col < width && row < height)
1269
        {
1270
                int index = row * width + col;
1271
                float4 keyv = tex1Dfetch(texDataF4, index);
1272
                int is_key = (keyv.x == 1.0f || keyv.x == -1.0f); 
1273
                int inside = col > 0 && row > 0 && row < height -1 && col < width - 1;
1274
                float v = inside? saturate(0.5 + 20 * tex1Dfetch(texData, index)) : 0.5;
1275
                d_result[index] = is_key && inside ? 
1276
                        (keyv.x > 0? make_float4(1.0f, 0, 0, 1.0f) : make_float4(0.0f, 1.0f, 0.0f, 1.0f)): 
1277
                        make_float4(v, v, v, 1.0f) ;
1278
        }
1279
}
1280
void ProgramCU::DisplayConvertKEY(CuTexImage* key, CuTexImage* dog, CuTexImage* out)
1281
{
1282
        if(out->_cuData == NULL) return;
1283
        int width = key->GetImgWidth(), height = key ->GetImgHeight();
1284
        dog->BindTexture(texData);
1285
        key->BindTexture(texDataF4);
1286
        dim3 grid((width + BLOCK_DIM - 1)/ BLOCK_DIM,  (height + BLOCK_DIM - 1)/BLOCK_DIM);
1287
        dim3 block(BLOCK_DIM, BLOCK_DIM);
1288
        ConvertKEY_Kernel<<<grid, block>>>((float4*) out->_cuData, width, height);
1289
}
1290

    
1291

    
1292
void __global__ DisplayKeyPoint_Kernel(float4 * d_result, int num)
1293
{
1294
        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1295
        if(idx >= num) return;
1296
        float4 v = tex1Dfetch(texDataF4, idx);
1297
        d_result[idx] = make_float4(v.x, v.y, 0, 1.0f);
1298
}
1299

    
1300
void ProgramCU::DisplayKeyPoint(CuTexImage* ftex, CuTexImage* out)
1301
{
1302
        int num = ftex->GetImgWidth();
1303
        int block_width = 64;
1304
        dim3 grid((num + block_width -1) /block_width);
1305
        dim3 block(block_width);
1306
        ftex->BindTexture(texDataF4);
1307
        DisplayKeyPoint_Kernel<<<grid, block>>>((float4*) out->_cuData, num);
1308
        ProgramCU::CheckErrorCUDA("DisplayKeyPoint");
1309
}
1310

    
1311
void __global__ DisplayKeyBox_Kernel(float4* d_result, int num)
1312
{
1313
        int idx = IMUL(blockIdx.x, blockDim.x) + threadIdx.x;
1314
        if(idx >= num) return;
1315
        int  kidx = idx / 10, vidx = idx - IMUL(kidx , 10);
1316
        float4 v = tex1Dfetch(texDataF4, kidx);
1317
        float sz = fabs(v.z * 3.0f);
1318
        ///////////////////////
1319
        float s, c;        __sincosf(v.w, &s, &c);
1320
        ///////////////////////
1321
        float dx = vidx == 0? 0 : ((vidx <= 4 || vidx >= 9)? sz : -sz);
1322
        float dy = vidx <= 1? 0 : ((vidx <= 2 || vidx >= 7)? -sz : sz);
1323
        float4 pos;
1324
        pos.x = v.x + c * dx - s * dy;
1325
        pos.y = v.y + c * dy + s * dx;
1326
        pos.z = 0;        pos.w = 1.0f;
1327
        d_result[idx]  = pos;
1328
}
1329

    
1330
void ProgramCU::DisplayKeyBox(CuTexImage* ftex, CuTexImage* out)
1331
{
1332
        int len = ftex->GetImgWidth();
1333
        int block_width = 32;
1334
        dim3 grid((len * 10 + block_width -1) / block_width);
1335
        dim3 block(block_width);
1336
        ftex->BindTexture(texDataF4);
1337
        DisplayKeyBox_Kernel<<<grid, block>>>((float4*) out->_cuData, len * 10);
1338
}
1339
///////////////////////////////////////////////////////////////////
1340
inline void CuTexImage:: BindTexture(textureReference& texRef)
1341
{
1342
         cudaBindTexture(NULL, &texRef, _cuData, &texRef.channelDesc, _numBytes);
1343
}
1344

    
1345
inline void CuTexImage::BindTexture2D(textureReference& texRef)
1346
{
1347
#if defined(SIFTGPU_ENABLE_LINEAR_TEX2D) 
1348
        cudaBindTexture2D(0, &texRef, _cuData, &texRef.channelDesc, _imgWidth, _imgHeight, _imgWidth* _numChannel* sizeof(float));
1349
#else
1350
        cudaChannelFormatDesc desc;
1351
        cudaGetChannelDesc(&desc, _cuData2D);
1352
        cudaBindTextureToArray(&texRef, _cuData2D, &desc);
1353
#endif
1354
}
1355

    
1356
int ProgramCU::CheckCudaDevice(int device)
1357
{
1358
    int count = 0, device_used; 
1359
    if(cudaGetDeviceCount(&count) || count <= 0)
1360
    {
1361
        ProgramCU::CheckErrorCUDA("CheckCudaDevice");
1362
        return 0;
1363
    }else if(count == 1)
1364
    {
1365
        cudaDeviceProp deviceProp;
1366
        cudaGetDeviceProperties(&deviceProp, 0);
1367
        if (deviceProp.major == 9999 && deviceProp.minor == 9999)
1368
        {
1369
            fprintf(stderr, "CheckCudaDevice: no device supporting CUDA.\n");
1370
            return 0;
1371
        }
1372
    }
1373
    if(device >0 && device < count)  
1374
    {
1375
        cudaSetDevice(device);
1376
        CheckErrorCUDA("cudaSetDevice\n"); 
1377
    }
1378
    cudaGetDevice(&device_used);
1379
    if(device != device_used) 
1380
        fprintf(stderr,  "\nERROR:   Cannot set device to %d\n"
1381
        "\nWARNING: Use # %d device instead (out of %d)\n", device, device_used, count);
1382
    return 1;
1383
}
1384

    
1385
////////////////////////////////////////////////////////////////////////////////////////
1386
// siftmatch funtions
1387
//////////////////////////////////////////////////////////////////////////////////////////
1388

    
1389
#define MULT_TBLOCK_DIMX 128
1390
#define MULT_TBLOCK_DIMY 1
1391
#define MULT_BLOCK_DIMX (MULT_TBLOCK_DIMX)
1392
#define MULT_BLOCK_DIMY (8 * MULT_TBLOCK_DIMY)
1393

    
1394

    
1395
texture<uint4, 1, cudaReadModeElementType> texDes1;
1396
texture<uint4, 1, cudaReadModeElementType> texDes2;
1397

    
1398
void __global__ MultiplyDescriptor_Kernel(int* d_result, int num1, int num2, int3* d_temp)
1399
{
1400
        int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY),  idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);
1401

    
1402
        int idx1 = idx01 + threadIdx.y, idx2 = idx02 + threadIdx.x;
1403
        __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1404
        int read_idx1 = idx01 * 8 +  threadIdx.x, read_idx2 = idx2 * 8;
1405
        int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1406
        int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1407

    
1408
        ///////////////////////////////////////////////////////////////
1409
        //Load feature descriptors
1410
        ///////////////////////////////////////////////////////////////
1411
#if MULT_BLOCK_DIMY == 16
1412
        uint4 v = tex1Dfetch(texDes1, read_idx1);
1413
        data1[cache_idx1]   = v.x;        data1[cache_idx1+1] = v.y;
1414
        data1[cache_idx1+2] = v.z;        data1[cache_idx1+3] = v.w;
1415
#elif MULT_BLOCK_DIMY == 8
1416
        if(threadIdx.x < 64)
1417
        {
1418
                uint4 v = tex1Dfetch(texDes1, read_idx1);
1419
                data1[cache_idx1]   = v.x;                data1[cache_idx1+1] = v.y;
1420
                data1[cache_idx1+2] = v.z;                data1[cache_idx1+3] = v.w;
1421
        }
1422
#else
1423
#error
1424
#endif
1425
        __syncthreads();
1426

    
1427
        ///
1428
        if(idx2 >= num2) return;
1429
        ///////////////////////////////////////////////////////////////////////////
1430
        //compare descriptors
1431

    
1432
        int results[MULT_BLOCK_DIMY];
1433
#pragma unroll
1434
        for(int i = 0; i < MULT_BLOCK_DIMY; ++i) results[i] = 0;
1435

    
1436
#pragma unroll
1437
        for(int i = 0; i < 8; ++i)
1438
        {
1439
                uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
1440
                unsigned char* p2 = (unsigned char*)(&v);
1441
#pragma unroll
1442
                for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1443
                {
1444
                        unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
1445
                        results[k] +=          ( IMUL(p1[0], p2[0])        + IMUL(p1[1], p2[1])  
1446
                                                         + IMUL(p1[2], p2[2])          + IMUL(p1[3], p2[3])  
1447
                                                         + IMUL(p1[4], p2[4])          + IMUL(p1[5], p2[5])  
1448
                                                         + IMUL(p1[6], p2[6])          + IMUL(p1[7], p2[7])  
1449
                                                         + IMUL(p1[8], p2[8])          + IMUL(p1[9], p2[9])  
1450
                                                         + IMUL(p1[10], p2[10])        + IMUL(p1[11], p2[11])
1451
                                                         + IMUL(p1[12], p2[12])        + IMUL(p1[13], p2[13])
1452
                                                         + IMUL(p1[14], p2[14])        + IMUL(p1[15], p2[15]));
1453
                }
1454
        }
1455

    
1456
        int dst_idx = IMUL(idx1, num2)  + idx2;
1457
        if(d_temp)
1458
        {
1459
                int3 cmp_result = make_int3(0, -1, 0);
1460

    
1461
#pragma unroll
1462
                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1463
                {
1464
                        if(idx1 + i < num1)
1465
                        {
1466
                                cmp_result = results[i] > cmp_result.x? 
1467
                                make_int3(results[i], idx1 + i, cmp_result.x) : 
1468
                                make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1469
                                d_result[dst_idx + IMUL(i, num2)] = results[i];
1470
                        }
1471
                }
1472
                d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; 
1473
        }else
1474
        {
1475
#pragma unroll
1476
                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1477
                {
1478
                        if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = results[i];
1479
                }
1480
        }
1481

    
1482
}
1483

    
1484

    
1485
void ProgramCU::MultiplyDescriptor(CuTexImage* des1, CuTexImage* des2, CuTexImage* texDot, CuTexImage* texCRT)
1486
{
1487
        int num1 = des1->GetImgWidth() / 8;        
1488
        int num2 = des2->GetImgWidth() / 8;
1489
        dim3 grid(        (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1490
                (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1491
        dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1492
        texDot->InitTexture( num2,num1);
1493
        if(texCRT) texCRT->InitTexture(num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 32);
1494
        des1->BindTexture(texDes1);
1495
        des2->BindTexture(texDes2);
1496

    
1497
        MultiplyDescriptor_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2, 
1498
                                                                                                (texCRT? (int3*)texCRT->_cuData : NULL));
1499
        ProgramCU::CheckErrorCUDA("MultiplyDescriptor");
1500
}
1501

    
1502
texture<float, 1, cudaReadModeElementType> texLoc1;
1503
texture<float2, 1, cudaReadModeElementType> texLoc2;
1504
struct Matrix33{float mat[3][3];};
1505

    
1506

    
1507

    
1508
void __global__ MultiplyDescriptorG_Kernel(int* d_result, int num1, int num2, int3* d_temp,
1509
                                                                                   Matrix33 H, float hdistmax, Matrix33 F, float fdistmax)
1510
{
1511
        int idx01 = (blockIdx.y  * MULT_BLOCK_DIMY);        
1512
        int idx02 = (blockIdx.x  * MULT_BLOCK_DIMX);
1513

    
1514
        int idx1 = idx01 + threadIdx.y;        
1515
        int idx2 = idx02 + threadIdx.x;
1516
        __shared__ int data1[17 * 2 * MULT_BLOCK_DIMY];
1517
        __shared__ float loc1[MULT_BLOCK_DIMY * 2];
1518
        int read_idx1 = idx01 * 8 +  threadIdx.x ;
1519
        int read_idx2 = idx2 * 8;
1520
        int col4 = threadIdx.x & 0x3, row4 = threadIdx.x >> 2;
1521
        int cache_idx1 = IMUL(row4, 17) + (col4 << 2);
1522
#if MULT_BLOCK_DIMY == 16
1523
        uint4 v = tex1Dfetch(texDes1, read_idx1);
1524
        data1[cache_idx1]   = v.x;
1525
        data1[cache_idx1+1] = v.y;
1526
        data1[cache_idx1+2] = v.z;
1527
        data1[cache_idx1+3] = v.w;
1528
#elif MULT_BLOCK_DIMY == 8
1529
        if(threadIdx.x < 64)
1530
        {
1531
                uint4 v = tex1Dfetch(texDes1, read_idx1);
1532
                data1[cache_idx1]   = v.x;
1533
                data1[cache_idx1+1] = v.y;
1534
                data1[cache_idx1+2] = v.z;
1535
                data1[cache_idx1+3] = v.w;
1536
        }
1537
#else
1538
#error
1539
#endif
1540
        __syncthreads();
1541
        if(threadIdx.x < MULT_BLOCK_DIMY * 2)
1542
        {
1543
                loc1[threadIdx.x] = tex1Dfetch(texLoc1, 2 * idx01 + threadIdx.x);
1544
        }
1545
        __syncthreads();
1546
        if(idx2 >= num2) return;
1547
        int results[MULT_BLOCK_DIMY];
1548
        /////////////////////////////////////////////////////////////////////////////////////////////
1549
        //geometric verification
1550
        /////////////////////////////////////////////////////////////////////////////////////////////
1551
        int good_count = 0;
1552
        float2 loc2 = tex1Dfetch(texLoc2, idx2);
1553
#pragma unroll
1554
        for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1555
        {
1556

    
1557
                if(idx1 + i < num1)
1558
                {
1559
                        float* loci = loc1 + i * 2;
1560
                        float locx = loci[0], locy = loci[1];
1561
                        //homography
1562
                        float x[3], diff[2];
1563
                        x[0] = H.mat[0][0] * locx + H.mat[0][1] * locy + H.mat[0][2];
1564
                        x[1] = H.mat[1][0] * locx + H.mat[1][1] * locy + H.mat[1][2];
1565
                        x[2] = H.mat[2][0] * locx + H.mat[2][1] * locy + H.mat[2][2];
1566
                        diff[0] = fabs(FDIV(x[0], x[2]) - loc2.x);
1567
                        diff[1] = fabs(FDIV(x[1], x[2]) - loc2.y);
1568
                        if(diff[0] < hdistmax && diff[1] < hdistmax)
1569
                        {
1570
                                //check fundamental matrix
1571
                                float fx1[3], ftx2[3], x2fx1, se; 
1572
                                fx1[0] = F.mat[0][0] * locx + F.mat[0][1] * locy + F.mat[0][2];
1573
                                fx1[1] = F.mat[1][0] * locx + F.mat[1][1] * locy + F.mat[1][2];
1574
                                //fx1[2] = F.mat[2][0] * locx + F.mat[2][1] * locy + F.mat[2][2];
1575

    
1576
                                ftx2[0] = F.mat[0][0] * locx + F.mat[1][0] * locy + F.mat[2][0];
1577
                                ftx2[1] = F.mat[0][1] * locx + F.mat[1][1] * locy + F.mat[2][1];
1578
                                ftx2[2] = F.mat[0][2] * locx + F.mat[1][2] * locy + F.mat[2][2];
1579

    
1580
                                x2fx1 = locx * ftx2[0]  + locy * ftx2[1] + ftx2[2];
1581
                                se = FDIV(x2fx1 * x2fx1, fx1[0] * fx1[0] + fx1[1] * fx1[1] + ftx2[0] * ftx2[0] + ftx2[1] * ftx2[1]);
1582
                                results[i] = se < fdistmax? 0: -262144;
1583
                        }else
1584
                        {
1585
                                results[i] = -262144;
1586
                        }
1587
                }else
1588
                {
1589
                        results[i] = -262144;
1590
                }
1591
                good_count += (results[i] >=0);
1592
        }
1593
        /////////////////////////////////////////////////////////////////////////////////////////////
1594
        ///compare feature descriptors anyway
1595
        /////////////////////////////////////////////////////////////////////////////////////////////
1596
        if(good_count > 0)
1597
        {
1598
#pragma unroll
1599
                for(int i = 0; i < 8; ++i)
1600
                {
1601
                        uint4 v = tex1Dfetch(texDes2, read_idx2 + i);
1602
                        unsigned char* p2 = (unsigned char*)(&v);
1603
#pragma unroll
1604
                        for(int k = 0; k < MULT_BLOCK_DIMY; ++k)
1605
                        {
1606
                                unsigned char* p1 = (unsigned char*) (data1 + k * 34 + i *  4 + (i/4));
1607
                                results[k] +=          ( IMUL(p1[0], p2[0])        + IMUL(p1[1], p2[1])  
1608
                                                                 + IMUL(p1[2], p2[2])          + IMUL(p1[3], p2[3])  
1609
                                                                 + IMUL(p1[4], p2[4])          + IMUL(p1[5], p2[5])  
1610
                                                                 + IMUL(p1[6], p2[6])          + IMUL(p1[7], p2[7])  
1611
                                                                 + IMUL(p1[8], p2[8])          + IMUL(p1[9], p2[9])  
1612
                                                                 + IMUL(p1[10], p2[10])        + IMUL(p1[11], p2[11])
1613
                                                                 + IMUL(p1[12], p2[12])        + IMUL(p1[13], p2[13])
1614
                                                                 + IMUL(p1[14], p2[14])        + IMUL(p1[15], p2[15]));
1615
                        }
1616
                }
1617
        }
1618
        int dst_idx = IMUL(idx1, num2)  + idx2;
1619
        if(d_temp)
1620
        {
1621
                int3 cmp_result = make_int3(0, -1, 0);
1622
#pragma unroll
1623
                for(int i= 0; i < MULT_BLOCK_DIMY; ++i)
1624
                {
1625
                        if(idx1 + i < num1)
1626
                        {
1627
                                cmp_result = results[i] > cmp_result.x? 
1628
                                make_int3(results[i], idx1 + i, cmp_result.x) : 
1629
                                make_int3(cmp_result.x, cmp_result.y, max(cmp_result.z, results[i]));
1630
                                d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1631
                        }else
1632
                        {
1633
                                break;
1634
                        }
1635
                }
1636
                d_temp[ IMUL(blockIdx.y, num2) + idx2] = cmp_result; 
1637
        }else
1638
        {
1639
#pragma unroll
1640
                for(int i = 0; i < MULT_BLOCK_DIMY; ++i)
1641
                {
1642
                        if(idx1 + i < num1) d_result[dst_idx + IMUL(i, num2)] = max(results[i], 0);
1643
                        else break;
1644
                }
1645
        }
1646

    
1647
}
1648

    
1649

    
1650
void ProgramCU::MultiplyDescriptorG(CuTexImage* des1, CuTexImage* des2,
1651
                CuTexImage* loc1, CuTexImage* loc2, CuTexImage* texDot, CuTexImage* texCRT,
1652
                float H[3][3], float hdistmax, float F[3][3], float fdistmax)
1653
{
1654
        int num1 = des1->GetImgWidth() / 8;        
1655
        int num2 = des2->GetImgWidth() / 8;
1656
        Matrix33 MatF, MatH;
1657
        //copy the matrix
1658
        memcpy(MatF.mat, F, 9 * sizeof(float));
1659
        memcpy(MatH.mat, H, 9 * sizeof(float));
1660
        //thread blocks
1661
        dim3 grid(        (num2 + MULT_BLOCK_DIMX - 1)/ MULT_BLOCK_DIMX,
1662
                (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY);
1663
        dim3 block(MULT_TBLOCK_DIMX, MULT_TBLOCK_DIMY);
1664
        //intermediate results
1665
        texDot->InitTexture( num2,num1);
1666
        if(texCRT) texCRT->InitTexture( num2, (num1 + MULT_BLOCK_DIMY - 1)/MULT_BLOCK_DIMY, 3);
1667
        loc1->BindTexture(texLoc1);        
1668
        loc2->BindTexture(texLoc2);
1669
        des1->BindTexture(texDes1);        
1670
        des2->BindTexture(texDes2);
1671
        MultiplyDescriptorG_Kernel<<<grid, block>>>((int*)texDot->_cuData, num1, num2, 
1672
                                                                                                (texCRT? (int3*)texCRT->_cuData : NULL),
1673
                                                                                                MatH, hdistmax, MatF, fdistmax);
1674
}
1675

    
1676

    
1677
texture<int,  1, cudaReadModeElementType> texDOT;
1678

    
1679
#define ROWMATCH_BLOCK_WIDTH 32
1680
#define ROWMATCH_BLOCK_HEIGHT 1
1681

    
1682
void __global__  RowMatch_Kernel(int*d_dot, int* d_result, int num2, float distmax, float ratiomax)
1683
{
1684
#if ROWMATCH_BLOCK_HEIGHT == 1
1685
        __shared__ int dotmax[ROWMATCH_BLOCK_WIDTH];
1686
        __shared__ int dotnxt[ROWMATCH_BLOCK_WIDTH];
1687
        __shared__ int dotidx[ROWMATCH_BLOCK_WIDTH];
1688
        int        row = blockIdx.y;
1689
#else
1690
        __shared__ int x_dotmax[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1691
        __shared__ int x_dotnxt[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1692
        __shared__ int x_dotidx[ROWMATCH_BLOCK_HEIGHT][ROWMATCH_BLOCK_WIDTH];
1693
        int*        dotmax = x_dotmax[threadIdx.y];
1694
        int*        dotnxt = x_dotnxt[threadIdx.y];
1695
        int*        dotidx = x_dotidx[threadIdx.y];
1696
        int row = IMUL(blockIdx.y, ROWMATCH_BLOCK_HEIGHT) + threadIdx.y;
1697
#endif
1698

    
1699
        int base_address = IMUL(row , num2);
1700
        int t_dotmax = 0, t_dotnxt = 0, t_dotidx = -1;
1701
        for(int i = 0; i < num2; i += ROWMATCH_BLOCK_WIDTH)
1702
        {
1703
                if(threadIdx.x + i < num2)
1704
                {
1705
                        int v = tex1Dfetch(texDOT, base_address + threadIdx.x + i);//d_dot[base_address + threadIdx.x + i];//
1706
                        bool test = v > t_dotmax;
1707
                        t_dotnxt = test? t_dotmax : max(t_dotnxt, v);
1708
                        t_dotidx = test? (threadIdx.x + i) : t_dotidx;
1709
                        t_dotmax = test? v: t_dotmax;
1710
                }
1711
                __syncthreads();
1712
        }
1713
        dotmax[threadIdx.x] = t_dotmax;
1714
        dotnxt[threadIdx.x] = t_dotnxt;
1715
        dotidx[threadIdx.x] = t_dotidx;
1716
        __syncthreads();
1717
        
1718
#pragma unroll
1719
        for(int step = ROWMATCH_BLOCK_WIDTH/2; step >0; step /= 2)
1720
        {
1721
                if(threadIdx.x < step)
1722
                {
1723
                        int v1 = dotmax[threadIdx.x], v2 = dotmax[threadIdx.x + step];
1724
                        bool test =  v2 > v1;
1725
                        dotnxt[threadIdx.x] = test? max(v1, dotnxt[threadIdx.x + step]) :max(dotnxt[threadIdx.x], v2);
1726
                        dotidx[threadIdx.x] = test? dotidx[threadIdx.x + step] : dotidx[threadIdx.x];
1727
                        dotmax[threadIdx.x] = test? v2 : v1;
1728
                }
1729
                __syncthreads();
1730
        }
1731
        if(threadIdx.x == 0)
1732
        {
1733
                float dist =  acos(min(dotmax[0] * 0.000003814697265625f, 1.0));
1734
                float distn = acos(min(dotnxt[0] * 0.000003814697265625f, 1.0));
1735
                //float ratio = dist / distn;
1736
                d_result[row] = (dist < distmax) && (dist < distn * ratiomax) ? dotidx[0] : -1;//?  : -1;
1737
        }
1738

    
1739
}
1740

    
1741

    
1742
void ProgramCU::GetRowMatch(CuTexImage* texDot, CuTexImage* texMatch, float distmax, float ratiomax)
1743
{
1744
        int num1 = texDot->GetImgHeight();
1745
        int num2 = texDot->GetImgWidth();
1746
        dim3 grid(1, num1/ROWMATCH_BLOCK_HEIGHT);
1747
        dim3 block(ROWMATCH_BLOCK_WIDTH, ROWMATCH_BLOCK_HEIGHT);
1748
        texDot->BindTexture(texDOT);
1749
        RowMatch_Kernel<<<grid, block>>>((int*)texDot->_cuData,
1750
                (int*)texMatch->_cuData, num2, distmax, ratiomax);
1751
}
1752

    
1753
#define COLMATCH_BLOCK_WIDTH 32
1754

    
1755
//texture<int3,  1, cudaReadModeElementType> texCT;
1756

    
1757
void __global__  ColMatch_Kernel(int3*d_crt, int* d_result, int height, int num2, float distmax, float ratiomax)
1758
{
1759
        int col = COLMATCH_BLOCK_WIDTH * blockIdx.x + threadIdx.x;
1760
        if(col >= num2) return;
1761
        int3 result = d_crt[col];//tex1Dfetch(texCT, col);
1762
        int read_idx = col + num2;
1763
        for(int i = 1; i < height; ++i, read_idx += num2)
1764
        {
1765
                int3 temp = d_crt[read_idx];//tex1Dfetch(texCT, read_idx);
1766
                result = result.x < temp.x?
1767
                        make_int3(temp.x, temp.y, max(result.x, temp.z)) :
1768
                        make_int3(result.x, result.y, max(result.z, temp.x));
1769
        }
1770

    
1771
        float dist =  acos(min(result.x * 0.000003814697265625f, 1.0));
1772
        float distn = acos(min(result.z * 0.000003814697265625f, 1.0));
1773
                //float ratio = dist / distn;
1774
        d_result[col] = (dist < distmax) && (dist < distn * ratiomax) ? result.y : -1;//?  : -1;
1775

    
1776
}
1777

    
1778
void ProgramCU::GetColMatch(CuTexImage* texCRT, CuTexImage* texMatch, float distmax, float ratiomax)
1779
{
1780
        int height = texCRT->GetImgHeight();
1781
        int num2 = texCRT->GetImgWidth();
1782
        //texCRT->BindTexture(texCT);
1783
    dim3 grid((num2 + COLMATCH_BLOCK_WIDTH -1) / COLMATCH_BLOCK_WIDTH);
1784
    dim3 block(COLMATCH_BLOCK_WIDTH);
1785
        ColMatch_Kernel<<<grid, block>>>((int3*)texCRT->_cuData, (int*) texMatch->_cuData, height, num2, distmax, ratiomax);
1786
}
1787

    
1788
#endif